From 2e8cfcac93596fb630310ca975b72a62208381d7 Mon Sep 17 00:00:00 2001 From: Tanya Schlusser Date: Sat, 15 Dec 2018 22:20:12 +0100 Subject: [PATCH 01/80] ARROW-3230: [Python] Missing comparisons on ChunkedArray, Table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `__eq__` method to `Table`, `Column`, and `ChunkedArray`, plus relevant tests. Author: Tanya Schlusser Author: Krisztián Szűcs Closes #3183 from tanyaschlusser/ARROW-3230 and squashes the following commits: 0ea512e0 minor fixes 2ea12f3c Add '__eq__' method to Table, Column, and ChunkedArray and remove '__richcmp__' from Column 47d24973 Add '==' and '!=' tests for Table, Column, and ChunkedArray --- python/pyarrow/table.pxi | 26 ++++++++++++++++++-------- python/pyarrow/tests/test_table.py | 9 +++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index cf3411dc03616..4d52f26e749fc 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -117,6 +117,12 @@ cdef class ChunkedArray: else: index -= self.chunked_array.chunk(j).get().length() + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + def equals(self, ChunkedArray other): """ Return whether the contents of two chunked arrays are equal @@ -411,14 +417,6 @@ cdef class Column: return result.getvalue() - def __richcmp__(Column self, Column other, int op): - if op == cp.Py_EQ: - return self.equals(other) - elif op == cp.Py_NE: - return not self.equals(other) - else: - raise TypeError('Invalid comparison') - def __getitem__(self, key): return self.data[key] @@ -540,6 +538,12 @@ cdef class Column: def __array__(self, dtype=None): return self.data.__array__(dtype=dtype) + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + def equals(self, Column other): """ Check if contents of two columns are equal @@ -1111,6 +1115,12 @@ cdef class Table: return pyarrow_wrap_table(flattened) + def __eq__(self, other): + try: + return self.equals(other) + except TypeError: + return NotImplemented + def equals(self, Table other): """ Check if contents of two tables are equal diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index ecbf93bd3e8b0..847b1a4ca550d 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -117,6 +117,8 @@ def eq(xarrs, yarrs): y = pa.chunked_array(yarrs) assert x.equals(y) assert y.equals(x) + assert x == y + assert x != str(y) def ne(xarrs, yarrs): if isinstance(xarrs, pa.ChunkedArray): @@ -129,6 +131,7 @@ def ne(xarrs, yarrs): y = pa.chunked_array(yarrs) assert not x.equals(y) assert not y.equals(x) + assert x != y eq(pa.chunked_array([], type=pa.int32()), pa.chunked_array([], type=pa.int32())) @@ -224,6 +227,9 @@ def test_column_basics(): assert len(column) == 5 assert column.shape == (5,) assert column.to_pylist() == [-10, -5, 0, 5, 10] + assert column == pa.Column.from_array("a", column.data) + assert column != pa.Column.from_array("b", column.data) + assert column != column.data def test_column_factory_function(): @@ -577,6 +583,9 @@ def test_table_basics(): col.data.chunk(col.data.num_chunks) assert table.columns == columns + assert table == pa.Table.from_arrays(columns) + assert table != pa.Table.from_arrays(columns[1:]) + assert table != columns def test_table_from_arrays_preserves_column_metadata(): From d61ae4ae488a74840c464576f59167b3d774f102 Mon Sep 17 
00:00:00 2001 From: Wes McKinney Date: Sat, 15 Dec 2018 15:41:37 -0600 Subject: [PATCH 02/80] ARROW-3449: [C++] Fixes to build with CMake 3.2. Document what requires newer CMake This also resolves ARROW-3984 Author: Wes McKinney Closes #3174 from wesm/ARROW-3449 and squashes the following commits: 9f33412c4 Fixes to build with CMake 3.2. Document what features require newer CMake in the README. Add Docker task for CMake 3.2 from kszucs --- cpp/CMakeLists.txt | 4 +++- cpp/Dockerfile | 4 +++- cpp/README.md | 10 +++++++++- cpp/cmake_modules/ThirdpartyToolchain.cmake | 17 ++++++++++++++++- cpp/src/arrow/util/compression-test.cc | 9 +++++++-- cpp/src/gandiva/jni/CMakeLists.txt | 4 ++++ cpp/src/plasma/CMakeLists.txt | 4 ---- cpp/thirdparty/download_dependencies.sh | 2 +- cpp/thirdparty/versions.txt | 1 + dev/tasks/tests.yml | 10 ++++++++++ docker-compose.yml | 16 ++++++++++++++++ 11 files changed, 70 insertions(+), 11 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 54daaf96e8eb6..54ec1e5ef6501 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,6 +16,7 @@ # under the License. cmake_minimum_required(VERSION 3.2) +message(STATUS "Building using CMake version: ${CMAKE_VERSION}") # Extract Arrow version number file(READ "${CMAKE_CURRENT_SOURCE_DIR}/../java/pom.xml" POM_XML) @@ -436,11 +437,12 @@ endif() ############################################################ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) + # Currently the compression tests require at least these libraries; bz2 and + # zstd are optional. See ARROW-3984 set(ARROW_WITH_BROTLI ON) set(ARROW_WITH_LZ4 ON) set(ARROW_WITH_SNAPPY ON) set(ARROW_WITH_ZLIB ON) - set(ARROW_WITH_ZSTD ON) endif() if(ARROW_BUILD_TESTS) diff --git a/cpp/Dockerfile b/cpp/Dockerfile index c4791019634c1..84c00b91cc405 100644 --- a/cpp/Dockerfile +++ b/cpp/Dockerfile @@ -30,6 +30,7 @@ RUN apt-get update -y -q && \ wget # install conda and required packages +ARG EXTRA_CONDA_PKGS ENV PATH=/opt/conda/bin:$PATH \ CONDA_PREFIX=/opt/conda ADD ci/docker_install_conda.sh \ @@ -39,7 +40,8 @@ ADD ci/docker_install_conda.sh \ RUN arrow/ci/docker_install_conda.sh && \ conda install -c conda-forge \ --file arrow/ci/conda_env_cpp.yml \ - --file arrow/ci/conda_env_unix.yml && \ + --file arrow/ci/conda_env_unix.yml \ + $EXTRA_CONDA_PKGS && \ conda clean --all ENV CC=gcc \ diff --git a/cpp/README.md b/cpp/README.md index 1f12117e8d01e..71aa98ed9c924 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -30,7 +30,7 @@ in-source and out-of-source builds with the latter one being preferred. Building Arrow requires: * A C++11-enabled compiler. On Linux, gcc 4.8 and higher should be sufficient. -* CMake +* CMake 3.2 or higher * Boost On Ubuntu/Debian you can install the requirements with: @@ -459,6 +459,14 @@ both of these options would be used rarely. Current known uses-cases when they a * Parameterized tests in google test. +## CMake version requirements + +We support CMake 3.2 and higher. 
Some features require a newer version of CMake: + +* Building the benchmarks requires 3.6 or higher +* Building zstd from source requires 3.7 or higher +* Building Gandiva JNI bindings requires 3.11 or higher + [1]: https://brew.sh/ [2]: https://github.com/apache/arrow/blob/master/cpp/apidoc/Windows.md [3]: https://google.github.io/styleguide/cppguide.html diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 8f3fc2cabe3c2..c007b1c225bb9 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -407,6 +407,13 @@ else() # disable autolinking in boost add_definitions(-DBOOST_ALL_NO_LIB) endif() + + if (DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT) + # In older versions of CMake (such as 3.2), the system paths for Boost will + # be looked in first even if we set $BOOST_ROOT or pass -DBOOST_ROOT + set(Boost_NO_SYSTEM_PATHS ON) + endif() + if (ARROW_BOOST_USE_SHARED) # Find shared Boost libraries. set(Boost_USE_STATIC_LIBS OFF) @@ -629,8 +636,11 @@ if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS endif() if(ARROW_BUILD_BENCHMARKS) - if("$ENV{GBENCHMARK_HOME}" STREQUAL "") + if(CMAKE_VERSION VERSION_LESS 3.6) + message(FATAL_ERROR "Building gbenchmark from source requires at least CMake 3.6") + endif() + if(NOT MSVC) set(GBENCHMARK_CMAKE_CXX_FLAGS "-fPIC -std=c++11 ${EP_CXX_FLAGS}") endif() @@ -1095,6 +1105,11 @@ if (ARROW_WITH_ZSTD) "-DCMAKE_C_FLAGS=${EP_C_FLAGS}") endif() + if(CMAKE_VERSION VERSION_LESS 3.7) + message(FATAL_ERROR "Building zstd using ExternalProject requires \ +at least CMake 3.7") + endif() + ExternalProject_Add(zstd_ep ${EP_LOG_OPTIONS} CMAKE_ARGS ${ZSTD_CMAKE_ARGS} diff --git a/cpp/src/arrow/util/compression-test.cc b/cpp/src/arrow/util/compression-test.cc index e0e6f4837f201..22bec001bfd45 100644 --- a/cpp/src/arrow/util/compression-test.cc +++ b/cpp/src/arrow/util/compression-test.cc @@ -448,17 +448,22 @@ TEST_P(CodecTest, StreamingRoundtrip) { INSTANTIATE_TEST_CASE_P(TestGZip, CodecTest, ::testing::Values(Compression::GZIP)); -INSTANTIATE_TEST_CASE_P(TestZSTD, CodecTest, ::testing::Values(Compression::ZSTD)); - INSTANTIATE_TEST_CASE_P(TestSnappy, CodecTest, ::testing::Values(Compression::SNAPPY)); INSTANTIATE_TEST_CASE_P(TestLZ4, CodecTest, ::testing::Values(Compression::LZ4)); INSTANTIATE_TEST_CASE_P(TestBrotli, CodecTest, ::testing::Values(Compression::BROTLI)); +// bz2 requires a binary installation, there is no ExternalProject #if ARROW_WITH_BZ2 INSTANTIATE_TEST_CASE_P(TestBZ2, CodecTest, ::testing::Values(Compression::BZ2)); #endif +// The ExternalProject for zstd does not build on CMake < 3.7, so we do not +// require it here +#ifdef ARROW_WITH_ZSTD +INSTANTIATE_TEST_CASE_P(TestZSTD, CodecTest, ::testing::Values(Compression::ZSTD)); +#endif + } // namespace util } // namespace arrow diff --git a/cpp/src/gandiva/jni/CMakeLists.txt b/cpp/src/gandiva/jni/CMakeLists.txt index 9f7bc526dbf5b..ab04f536b4dd2 100644 --- a/cpp/src/gandiva/jni/CMakeLists.txt +++ b/cpp/src/gandiva/jni/CMakeLists.txt @@ -17,6 +17,10 @@ project(gandiva_jni) +if(CMAKE_VERSION VERSION_LESS 3.11) + message(FATAL_ERROR "Building the Gandiva JNI bindings requires CMake version >= 3.11") +endif() + # Find JNI find_package(JNI REQUIRED) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 317835bb7ac44..15d16af0fb9aa 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -20,16 +20,12 @@ add_custom_target(plasma) # For the moment, Plasma is 
versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../python/cmake_modules") - -find_package(PythonLibsNew REQUIRED) find_package(Threads) # The SO version is also the ABI version set(PLASMA_SO_VERSION "${ARROW_SO_VERSION}") set(PLASMA_FULL_SO_VERSION "${ARROW_FULL_SO_VERSION}") -include_directories(SYSTEM ${PYTHON_INCLUDE_DIRS}) include_directories("${FLATBUFFERS_INCLUDE_DIR}" "${CMAKE_CURRENT_LIST_DIR}/" "${CMAKE_CURRENT_LIST_DIR}/thirdparty/" "${CMAKE_CURRENT_LIST_DIR}/../") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200809L") diff --git a/cpp/thirdparty/download_dependencies.sh b/cpp/thirdparty/download_dependencies.sh index ea63a8a41fb4e..de7d23ca2ef5e 100755 --- a/cpp/thirdparty/download_dependencies.sh +++ b/cpp/thirdparty/download_dependencies.sh @@ -38,7 +38,7 @@ download_dependency() { # --show-progress will not output to stdout, it is safe to pipe the result of # the script into eval. - wget --quiet --show-progress --continue --output-document="${out}" "${url}" + wget --quiet --continue --output-document="${out}" "${url}" } main() { diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 705f56c0e6130..fc539da73945b 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -61,6 +61,7 @@ DEPENDENCIES=( "ARROW_ORC_URL orc-${ORC_VERSION}.tar.gz https://github.com/apache/orc/archive/rel/release-${ORC_VERSION}.tar.gz" "ARROW_PROTOBUF_URL protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/google/protobuf/releases/download/${PROTOBUF_VERSION}/protobuf-all-${PROTOBUF_VERSION:1}.tar.gz" "ARROW_RAPIDJSON_URL rapidjson-${RAPIDJSON_VERSION}.tar.gz https://github.com/miloyip/rapidjson/archive/${RAPIDJSON_VERSION}.tar.gz" + "ARROW_RE2_URL re2-${RE2_VERSION}.tar.gz https://github.com/google/re2/archive/${RE2_VERSION}.tar.gz" "ARROW_SNAPPY_URL snappy-${SNAPPY_VERSION}.tar.gz https://github.com/google/snappy/releases/download/${SNAPPY_VERSION}/snappy-${SNAPPY_VERSION}.tar.gz" "ARROW_THRIFT_URL thrift-${THRIFT_VERSION}.tar.gz http://archive.apache.org/dist/thrift/${THRIFT_VERSION}/thrift-${THRIFT_VERSION}.tar.gz" "ARROW_ZLIB_URL zlib-${ZLIB_VERSION}.tar.gz http://zlib.net/fossils/zlib-${ZLIB_VERSION}.tar.gz" diff --git a/dev/tasks/tests.yml b/dev/tasks/tests.yml index d51fa7eac7a35..d9493b606e5a0 100644 --- a/dev/tasks/tests.yml +++ b/dev/tasks/tests.yml @@ -22,6 +22,7 @@ groups: - docker-rust - docker-cpp - docker-cpp-alpine + - docker-cpp-cmake32 - docker-c_glib - docker-go - docker-python-2.7 @@ -45,6 +46,7 @@ groups: cpp-python: - docker-cpp - docker-cpp-alpine + - docker-cpp-cmake32 - docker-python-2.7 - docker-python-2.7-alpine - docker-python-3.6 @@ -87,6 +89,14 @@ tasks: - docker-compose build cpp-alpine - docker-compose run cpp-alpine + docker-cpp-cmake32: + platform: linux + template: docker-tests/travis.linux.yml + params: + commands: + - docker-compose build cpp-cmake32 + - docker-compose run cpp-cmake32 + docker-c_glib: platform: linux template: docker-tests/travis.linux.yml diff --git a/docker-compose.yml b/docker-compose.yml index 51f1a49542212..d3a7990d5cc23 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,6 +57,22 @@ services: PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data volumes: *ubuntu-volumes + cpp-cmake32: + # Usage: + # docker-compose build cpp-cmake32 + # docker-compose run cpp-cmake32 + image: arrow:cpp-cmake32 + shm_size: 2G + build: + context: . 
+ dockerfile: cpp/Dockerfile + args: + EXTRA_CONDA_PKGS: cmake=3.2 + environment: + ARROW_ORC: "OFF" + PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data + volumes: *ubuntu-volumes + cpp-alpine: # Usage: # docker-compose build cpp-alpine From 784d1cd04603f6f1f97904a62dac153d2569d2dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 15 Dec 2018 15:56:51 -0600 Subject: [PATCH 03/80] ARROW-4044: [Packaging/Python] Add hypothesis test dependency to pyarrow conda recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crossbow tests: [kszucs/crossbow/build-374](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-374) Author: Krisztián Szűcs Closes #3187 from kszucs/ARROW-4044 and squashes the following commits: 9bc4d880b add hypothesis to test requires --- dev/tasks/conda-recipes/pyarrow/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/tasks/conda-recipes/pyarrow/meta.yaml b/dev/tasks/conda-recipes/pyarrow/meta.yaml index 167056ba68e9c..7c653876765b5 100644 --- a/dev/tasks/conda-recipes/pyarrow/meta.yaml +++ b/dev/tasks/conda-recipes/pyarrow/meta.yaml @@ -58,6 +58,7 @@ test: requires: - pytest + - hypothesis commands: - pytest --pyargs pyarrow From 055496cd9a040d64d4a00d773261e61e7caac31b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 15 Dec 2018 15:57:40 -0600 Subject: [PATCH 04/80] ARROW-4029: [C++] Exclude headers with 'internal' from installation. Document header file conventions in README In reviewing what usages of the `install` command we have, I added a helper function to add `.pc` files to reduce code duplication Author: Wes McKinney Closes #3176 from wesm/ARROW-4029 and squashes the following commits: f5b3811fc Exclude headers with 'internal' from installation. Document in README. Add function to reduce code duplication in adding pkg-config files --- cpp/README.md | 6 ++++++ cpp/cmake_modules/BuildUtils.cmake | 18 +++++++++++++++++- cpp/src/arrow/CMakeLists.txt | 7 +------ cpp/src/arrow/array/CMakeLists.txt | 10 +--------- cpp/src/arrow/compute/CMakeLists.txt | 7 +------ cpp/src/arrow/compute/kernels/CMakeLists.txt | 6 +----- cpp/src/arrow/flight/CMakeLists.txt | 7 +------ cpp/src/arrow/gpu/CMakeLists.txt | 10 +--------- cpp/src/arrow/io/CMakeLists.txt | 11 +---------- cpp/src/arrow/python/CMakeLists.txt | 7 +------ cpp/src/arrow/util/string_view/CMakeLists.txt | 2 +- cpp/src/arrow/util/variant/CMakeLists.txt | 8 +------- cpp/src/gandiva/CMakeLists.txt | 7 +------ cpp/src/parquet/CMakeLists.txt | 8 +------- cpp/src/plasma/CMakeLists.txt | 7 +------ 15 files changed, 36 insertions(+), 85 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index 71aa98ed9c924..010387dbd4de3 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -313,6 +313,12 @@ which use the default pool without explicitly passing it. You can disable these constructors in your application (so that you are accounting properly for all memory allocations) by defining `ARROW_NO_DEFAULT_MEMORY_POOL`. +### Header files + +We use the `.h` extension for C++ header files. Any header file name not +containing `internal` is considered to be a public header, and will be +automatically installed by the build. 
+ ### Error Handling and Exceptions For error handling, we use `arrow::Status` values instead of throwing C++ diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 1abe97eecc59f..7585ae9da8fa8 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -583,7 +583,23 @@ function(ARROW_INSTALL_ALL_HEADERS PATH) set(ARG_PATTERN "*.h") endif() file(GLOB CURRENT_DIRECTORY_HEADERS ${ARG_PATTERN}) + + set(PUBLIC_HEADERS) + foreach(HEADER ${CURRENT_DIRECTORY_HEADERS}) + if (NOT ((HEADER MATCHES "internal"))) + LIST(APPEND PUBLIC_HEADERS ${HEADER}) + endif() + endforeach() install(FILES - ${CURRENT_DIRECTORY_HEADERS} + ${PUBLIC_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PATH}") endfunction() + +function(ARROW_ADD_PKG_CONFIG MODULE) + configure_file(${MODULE}.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" + @ONLY) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/${MODULE}.pc" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +endfunction() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index b13c9b66ac48d..bec290df2aa37 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -252,12 +252,7 @@ endforeach() ARROW_INSTALL_ALL_HEADERS("arrow") # pkg-config support -configure_file(arrow.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow") ####################################### # Unit tests diff --git a/cpp/src/arrow/array/CMakeLists.txt b/cpp/src/arrow/array/CMakeLists.txt index a789c88dd9d31..4a8ce3490abd1 100644 --- a/cpp/src/arrow/array/CMakeLists.txt +++ b/cpp/src/arrow/array/CMakeLists.txt @@ -16,12 +16,4 @@ # under the License. # Headers: top level -install(FILES - builder_adaptive.h - builder_base.h - builder_binary.h - builder_decimal.h - builder_dict.h - builder_nested.h - builder_primitive.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/array") +ARROW_INSTALL_ALL_HEADERS("arrow/array") diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index 242937005cf9c..75d152b0bafa3 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -18,12 +18,7 @@ ARROW_INSTALL_ALL_HEADERS("arrow/compute") # pkg-config support -configure_file(arrow-compute.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-compute.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-compute.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow-compute") ####################################### # Unit tests diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 923c8c3bd4e81..a5a142b5c28ce 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -15,8 +15,4 @@ # specific language governing permissions and limitations # under the License. 
-install(FILES - boolean.h - cast.h - hash.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/compute/kernels") +ARROW_INSTALL_ALL_HEADERS("arrow/compute/kernels") diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index bc22d60b7131a..aa56269a8953e 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -18,12 +18,7 @@ add_custom_target(arrow_flight) # Header files -install(FILES - api.h - client.h - server.h - types.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/flight") +ARROW_INSTALL_ALL_HEADERS("arrow/flight") SET(ARROW_FLIGHT_STATIC_LINK_LIBS grpc_grpcpp diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index c37779aefa9aa..8b69c654bb1fe 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -64,15 +64,7 @@ install(FILES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") ARROW_INSTALL_ALL_HEADERS("arrow/gpu") - -# pkg-config support -configure_file(arrow-cuda.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-cuda.pc" - @ONLY) - -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-cuda.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow-cuda") set(ARROW_CUDA_TEST_LINK_LIBS arrow_cuda_shared diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index 80d68fb503bb9..13b577f7d41b2 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -41,13 +41,4 @@ ADD_ARROW_BENCHMARK(memory-benchmark PREFIX "arrow-io") # Headers: top level -install(FILES - api.h - buffered.h - compressed.h - file.h - hdfs.h - interfaces.h - memory.h - readahead.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/io") +ARROW_INSTALL_ALL_HEADERS("arrow/io") diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 4913083537340..98c105ae623ce 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -94,12 +94,7 @@ endif() ARROW_INSTALL_ALL_HEADERS("arrow/python") # pkg-config support -configure_file(arrow-python.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-python.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-python.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("arrow-python") # ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/string_view/CMakeLists.txt b/cpp/src/arrow/util/string_view/CMakeLists.txt index bae6bdb807d92..7e553077db1ad 100644 --- a/cpp/src/arrow/util/string_view/CMakeLists.txt +++ b/cpp/src/arrow/util/string_view/CMakeLists.txt @@ -17,4 +17,4 @@ install(FILES string_view.hpp - DESTINATION include/arrow/util/string_view) + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/util/string_view") diff --git a/cpp/src/arrow/util/variant/CMakeLists.txt b/cpp/src/arrow/util/variant/CMakeLists.txt index 0ebb2516246ed..b7a5692b6207c 100644 --- a/cpp/src/arrow/util/variant/CMakeLists.txt +++ b/cpp/src/arrow/util/variant/CMakeLists.txt @@ -19,10 +19,4 @@ # arrow_util_variant ####################################### -install(FILES - optional.h - recursive_wrapper.h - variant_cast.h - variant_io.h - variant_visitor.h - DESTINATION include/arrow/util/variant) +ARROW_INSTALL_ALL_HEADERS("arrow/util/variant") diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 9763f297b0b8b..da0d3bba69147 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -96,12 +96,7 @@ 
include(GNUInstallDirs) ARROW_INSTALL_ALL_HEADERS("gandiva") # pkg-config support -configure_file(gandiva.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/gandiva.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/gandiva.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("gandiva") set(GANDIVA_STATIC_TEST_LINK_LIBS gandiva_static diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 6b7846b709d0b..995c39adb7d35 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -249,13 +249,7 @@ install(FILES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") # pkg-config support -configure_file(parquet.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/parquet.pc" - @ONLY) - -install(FILES - "${CMAKE_CURRENT_BINARY_DIR}/parquet.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("parquet") ADD_PARQUET_TEST(bloom_filter-test) ADD_PARQUET_TEST(column_reader-test) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 15d16af0fb9aa..83c201d0f45a0 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -150,12 +150,7 @@ install(TARGETS plasma_store_server DESTINATION ${CMAKE_INSTALL_BINDIR}) # pkg-config support -configure_file(plasma.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/plasma.pc" - @ONLY) -install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/plasma.pc" - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +ARROW_ADD_PKG_CONFIG("plasma") if(ARROW_PLASMA_JAVA_CLIENT) # Plasma java client support From ec154d232ed5585721e0ef12d61c1a6e2c06fdae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Sat, 15 Dec 2018 16:00:27 -0600 Subject: [PATCH 05/80] =?UTF-8?q?ARROW-2026:=20[C++]=20Enforce=20use=5Fdep?= =?UTF-8?q?recated=5Fint96=5Ftimestamps=20to=20all=20time=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …stamps fields. This changes the behavior of `use_deprecated_int96_timestamps` to support all timestamp fields irregardless of the time unit. It would previously only apply this conversion to fields with Nanosecond resolution. People will only use this option when they use a system that only supports INT96 timestamps, systems that also support INT64 timestamps in other resolutions would not need the option. A notable API change is that this option now take precedence over the coerce_timestamps option. Author: François Saint-Jacques Closes #3173 from fsaintjacques/ARROW-2026-parquet-int96-conversion and squashes the following commits: 2897a7278 ARROW-2026: Enforce use_deprecated_int96_timestamps to all timestamps fields. 
--- .../parquet/arrow/arrow-reader-writer-test.cc | 185 +++++++++--------- cpp/src/parquet/arrow/reader.cc | 16 +- cpp/src/parquet/arrow/schema.cc | 73 ++++--- cpp/src/parquet/arrow/writer.cc | 74 ++++--- cpp/src/parquet/arrow/writer.h | 62 ++++-- cpp/src/parquet/types.h | 21 ++ python/pyarrow/parquet.py | 4 +- python/pyarrow/tests/test_parquet.py | 5 +- 8 files changed, 256 insertions(+), 184 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 07124ebb3057a..4e62a22c350ff 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -1193,65 +1193,116 @@ void MakeDateTimeTypesTable(std::shared_ptr* out, bool nanos_as_micros = auto f0 = field("f0", ::arrow::date32()); auto f1 = field("f1", ::arrow::timestamp(TimeUnit::MILLI)); auto f2 = field("f2", ::arrow::timestamp(TimeUnit::MICRO)); - std::shared_ptr<::arrow::Field> f3; - if (nanos_as_micros) { - f3 = field("f3", ::arrow::timestamp(TimeUnit::MICRO)); - } else { - f3 = field("f3", ::arrow::timestamp(TimeUnit::NANO)); - } + auto f3_unit = nanos_as_micros ? TimeUnit::MICRO : TimeUnit::NANO; + auto f3 = field("f3", ::arrow::timestamp(f3_unit)); auto f4 = field("f4", ::arrow::time32(TimeUnit::MILLI)); auto f5 = field("f5", ::arrow::time64(TimeUnit::MICRO)); + std::shared_ptr<::arrow::Schema> schema(new ::arrow::Schema({f0, f1, f2, f3, f4, f5})); std::vector t32_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; - std::vector t64_values = {1489269000000, 1489270000000, 1489271000000, - 1489272000000, 1489272000000, 1489273000000}; + std::vector t64_ns_values = {1489269000000, 1489270000000, 1489271000000, + 1489272000000, 1489272000000, 1489273000000}; std::vector t64_us_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; + std::vector t64_ms_values = {1489269, 1489270, 1489271, + 1489272, 1489272, 1489273}; std::shared_ptr a0, a1, a2, a3, a4, a5; ArrayFromVector<::arrow::Date32Type, int32_t>(f0->type(), is_valid, t32_values, &a0); - ArrayFromVector<::arrow::TimestampType, int64_t>(f1->type(), is_valid, t64_values, &a1); - ArrayFromVector<::arrow::TimestampType, int64_t>(f2->type(), is_valid, t64_values, &a2); - if (nanos_as_micros) { - ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, t64_us_values, - &a3); - } else { - ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, t64_values, - &a3); - } + ArrayFromVector<::arrow::TimestampType, int64_t>(f1->type(), is_valid, t64_ms_values, + &a1); + ArrayFromVector<::arrow::TimestampType, int64_t>(f2->type(), is_valid, t64_us_values, + &a2); + auto f3_data = nanos_as_micros ? t64_us_values : t64_ns_values; + ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, f3_data, &a3); ArrayFromVector<::arrow::Time32Type, int32_t>(f4->type(), is_valid, t32_values, &a4); - ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_values, &a5); + ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5); std::vector> columns = { std::make_shared("f0", a0), std::make_shared("f1", a1), std::make_shared("f2", a2), std::make_shared("f3", a3), std::make_shared("f4", a4), std::make_shared("f5", a5)}; + *out = Table::Make(schema, columns); } TEST(TestArrowReadWrite, DateTimeTypes) { - std::shared_ptr
table; + std::shared_ptr
table, result; MakeDateTimeTypesTable(&table); - // Use deprecated INT96 type - std::shared_ptr
result; - ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( - table, false /* use_threads */, table->num_rows(), {}, &result, - ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build())); - - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); - // Cast nanaoseconds to microseconds and use INT64 physical type ASSERT_NO_FATAL_FAILURE( DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result)); - std::shared_ptr
expected; MakeDateTimeTypesTable(&table, true); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); } +TEST(TestArrowReadWrite, UseDeprecatedInt96) { + using ::arrow::ArrayFromVector; + using ::arrow::field; + using ::arrow::schema; + + std::vector is_valid = {true, true, true, false, true, true}; + + auto t_s = ::arrow::timestamp(TimeUnit::SECOND); + auto t_ms = ::arrow::timestamp(TimeUnit::MILLI); + auto t_us = ::arrow::timestamp(TimeUnit::MICRO); + auto t_ns = ::arrow::timestamp(TimeUnit::NANO); + + std::vector s_values = {1489269, 1489270, 1489271, 1489272, 1489272, 1489273}; + std::vector ms_values = {1489269000, 1489270000, 1489271000, + 1489272001, 1489272000, 1489273000}; + std::vector us_values = {1489269000000, 1489270000000, 1489271000000, + 1489272000001, 1489272000000, 1489273000000}; + std::vector ns_values = {1489269000000000LL, 1489270000000000LL, + 1489271000000000LL, 1489272000000001LL, + 1489272000000000LL, 1489273000000000LL}; + + std::shared_ptr a_s, a_ms, a_us, a_ns; + ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, is_valid, s_values, &a_s); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ms, is_valid, ms_values, &a_ms); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_us, is_valid, us_values, &a_us); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, is_valid, ns_values, &a_ns); + + // Each input is typed with a unique TimeUnit + auto input_schema = schema( + {field("f_s", t_s), field("f_ms", t_ms), field("f_us", t_us), field("f_ns", t_ns)}); + auto input = Table::Make( + input_schema, + {std::make_shared("f_s", a_s), std::make_shared("f_ms", a_ms), + std::make_shared("f_us", a_us), std::make_shared("f_ns", a_ns)}); + + // When reading parquet files, all int96 schema fields are converted to + // timestamp nanoseconds + auto ex_schema = schema({field("f_s", t_ns), field("f_ms", t_ns), field("f_us", t_ns), + field("f_ns", t_ns)}); + auto ex_result = Table::Make( + ex_schema, + {std::make_shared("f_s", a_ns), std::make_shared("f_ms", a_ns), + std::make_shared("f_us", a_ns), std::make_shared("f_ns", a_ns)}); + + std::shared_ptr
result; + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( + input, false /* use_threads */, input->num_rows(), {}, &result, + ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build())); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); + + // Ensure enable_deprecated_int96_timestamps as precedence over + // coerce_timestamps. + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(input, false /* use_threads */, + input->num_rows(), {}, &result, + ArrowWriterProperties::Builder() + .enable_deprecated_int96_timestamps() + ->coerce_timestamps(TimeUnit::MILLI) + ->build())); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); +} + TEST(TestArrowReadWrite, CoerceTimestamps) { using ::arrow::ArrayFromVector; using ::arrow::field; @@ -1297,6 +1348,12 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { {std::make_shared("f_s", a_ms), std::make_shared("f_ms", a_ms), std::make_shared("f_us", a_ms), std::make_shared("f_ns", a_ms)}); + std::shared_ptr
milli_result; + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( + input, false /* use_threads */, input->num_rows(), {}, &milli_result, + ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MILLI)->build())); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_milli_result, *milli_result)); + // Result when coercing to microseconds auto s3 = std::shared_ptr<::arrow::Schema>( new ::arrow::Schema({field("f_s", t_us), field("f_ms", t_us), field("f_us", t_us), @@ -1306,13 +1363,6 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { {std::make_shared("f_s", a_us), std::make_shared("f_ms", a_us), std::make_shared("f_us", a_us), std::make_shared("f_ns", a_us)}); - std::shared_ptr
milli_result; - ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( - input, false /* use_threads */, input->num_rows(), {}, &milli_result, - ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MILLI)->build())); - - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_milli_result, *milli_result)); - std::shared_ptr
micro_result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, µ_result, @@ -1457,65 +1507,6 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_table, *result)); } -// Regression for ARROW-2802 -TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) { - using ::arrow::Column; - using ::arrow::default_memory_pool; - using ::arrow::Field; - using ::arrow::Schema; - using ::arrow::Table; - using ::arrow::TimestampBuilder; - using ::arrow::TimestampType; - using ::arrow::TimeUnit; - - auto timestamp_type = std::make_shared(TimeUnit::NANO); - - TimestampBuilder builder(timestamp_type, default_memory_pool()); - for (std::int64_t ii = 0; ii < 10; ++ii) { - ASSERT_OK(builder.Append(1000000000L * ii)); - } - std::shared_ptr values; - ASSERT_OK(builder.Finish(&values)); - - std::vector> fields; - auto field = std::make_shared("nanos", timestamp_type); - fields.emplace_back(field); - - auto schema = std::make_shared(fields); - - std::vector> columns; - auto column = std::make_shared("nanos", values); - columns.emplace_back(column); - - auto table = Table::Make(schema, columns); - - auto arrow_writer_properties = ArrowWriterProperties::Builder() - .coerce_timestamps(TimeUnit::MICRO) - ->enable_deprecated_int96_timestamps() - ->build(); - - std::shared_ptr
result; - DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result, - arrow_writer_properties); - - ASSERT_EQ(table->num_columns(), result->num_columns()); - ASSERT_EQ(table->num_rows(), result->num_rows()); - - auto actual_column = result->column(0); - auto data = actual_column->data(); - auto expected_values = - static_cast<::arrow::NumericArray*>(values.get())->raw_values(); - for (int ii = 0; ii < data->num_chunks(); ++ii) { - auto chunk = - static_cast<::arrow::NumericArray*>(data->chunk(ii).get()); - auto values = chunk->raw_values(); - for (int64_t jj = 0; jj < chunk->length(); ++jj, ++expected_values) { - // Check that the nanos have been converted to micros - ASSERT_EQ(*expected_values / 1000, values[jj]); - } - } -} - void MakeDoubleTable(int num_columns, int num_rows, int nchunks, std::shared_ptr
* out) { std::shared_ptr<::arrow::Column> column; @@ -2289,11 +2280,13 @@ TEST_P(TestNestedSchemaRead, DeepNestedSchemaRead) { INSTANTIATE_TEST_CASE_P(Repetition_type, TestNestedSchemaRead, ::testing::Values(Repetition::REQUIRED, Repetition::OPTIONAL)); -TEST(TestImpalaConversion, NanosecondToImpala) { +TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { // June 20, 2017 16:32:56 and 123456789 nanoseconds int64_t nanoseconds = INT64_C(1497976376123456789); - Int96 expected = {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}}; + Int96 calculated; + + Int96 expected = {{UINT32_C(632093973), UINT32_C(13871), UINT32_C(2457925)}}; internal::NanosecondsToImpalaTimestamp(nanoseconds, &calculated); ASSERT_EQ(expected, calculated); } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 2a7730d42ad23..7830b6abc75d1 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -77,18 +77,6 @@ namespace arrow { using ::arrow::BitUtil::BytesForBits; -constexpr int64_t kJulianToUnixEpochDays = 2440588LL; -constexpr int64_t kMillisecondsInADay = 86400000LL; -constexpr int64_t kNanosecondsInADay = kMillisecondsInADay * 1000LL * 1000LL; - -static inline int64_t impala_timestamp_to_nanoseconds(const Int96& impala_timestamp) { - int64_t days_since_epoch = impala_timestamp.value[2] - kJulianToUnixEpochDays; - int64_t nanoseconds = 0; - - memcpy(&nanoseconds, &impala_timestamp.value, sizeof(int64_t)); - return days_since_epoch * kNanosecondsInADay + nanoseconds; -} - template using ArrayType = typename ::arrow::TypeTraits::ArrayType; @@ -1045,7 +1033,7 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { auto data_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { - *data_ptr++ = impala_timestamp_to_nanoseconds(values[i]); + *data_ptr++ = Int96GetNanoSeconds(values[i]); } if (reader->nullable_values()) { @@ -1072,7 +1060,7 @@ struct TransferFunctor<::arrow::Date64Type, Int32Type> { auto out_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { - *out_ptr++ = static_cast(values[i]) * kMillisecondsInADay; + *out_ptr++ = static_cast(values[i]) * kMillisecondsPerDay; } if (reader->nullable_values()) { diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index d0014a6f3aa2a..af9fbc91a5042 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -423,45 +423,66 @@ Status StructToNode(const std::shared_ptr<::arrow::StructType>& type, return Status::OK(); } +static LogicalType::type LogicalTypeFromArrowTimeUnit(::arrow::TimeUnit::type time_unit) { + switch (time_unit) { + case ::arrow::TimeUnit::MILLI: + return LogicalType::TIMESTAMP_MILLIS; + case ::arrow::TimeUnit::MICRO: + return LogicalType::TIMESTAMP_MICROS; + case ::arrow::TimeUnit::SECOND: + case ::arrow::TimeUnit::NANO: + // No equivalent parquet logical type. + break; + } + + return LogicalType::NONE; +} + static Status GetTimestampMetadata(const ::arrow::TimestampType& type, const ArrowWriterProperties& properties, ParquetType::type* physical_type, LogicalType::type* logical_type) { - auto unit = type.unit(); - *physical_type = ParquetType::INT64; + const bool coerce = properties.coerce_timestamps_enabled(); + const auto unit = coerce ? 
properties.coerce_timestamps_unit() : type.unit(); - if (properties.coerce_timestamps_enabled()) { - auto coerce_unit = properties.coerce_timestamps_unit(); - if (coerce_unit == ::arrow::TimeUnit::MILLI) { - *logical_type = LogicalType::TIMESTAMP_MILLIS; - } else if (coerce_unit == ::arrow::TimeUnit::MICRO) { - *logical_type = LogicalType::TIMESTAMP_MICROS; - } else { - return Status::NotImplemented( - "Can only coerce Arrow timestamps to milliseconds" - " or microseconds"); + // The user is explicitly asking for Impala int96 encoding, there is no + // logical type. + if (properties.support_deprecated_int96_timestamps()) { + *physical_type = ParquetType::INT96; + return Status::OK(); + } + + *physical_type = ParquetType::INT64; + *logical_type = LogicalTypeFromArrowTimeUnit(unit); + + // The user is requesting that all timestamp columns are casted to a specific + // type. Only 2 TimeUnit are supported by arrow-parquet. + if (coerce) { + switch (unit) { + case ::arrow::TimeUnit::MILLI: + case ::arrow::TimeUnit::MICRO: + break; + case ::arrow::TimeUnit::NANO: + case ::arrow::TimeUnit::SECOND: + return Status::NotImplemented( + "Can only coerce Arrow timestamps to milliseconds" + " or microseconds"); } + return Status::OK(); } - if (unit == ::arrow::TimeUnit::MILLI) { - *logical_type = LogicalType::TIMESTAMP_MILLIS; - } else if (unit == ::arrow::TimeUnit::MICRO) { + // Until ARROW-3729 is resolved, nanoseconds are explicitly converted to + // int64 microseconds when deprecated int96 is not requested. + if (type.unit() == ::arrow::TimeUnit::NANO) *logical_type = LogicalType::TIMESTAMP_MICROS; - } else if (unit == ::arrow::TimeUnit::NANO) { - if (properties.support_deprecated_int96_timestamps()) { - *physical_type = ParquetType::INT96; - // No corresponding logical type - } else { - *logical_type = LogicalType::TIMESTAMP_MICROS; - } - } else { + else if (type.unit() == ::arrow::TimeUnit::SECOND) return Status::NotImplemented( "Only MILLI, MICRO, and NANOS units supported for Arrow timestamps with " "Parquet."); - } + return Status::OK(); -} +} // namespace arrow Status FieldToNode(const std::shared_ptr& field, const WriterProperties& properties, @@ -698,7 +719,7 @@ int32_t DecimalSize(int32_t precision) { } DCHECK(false); return -1; -} +} // namespace arrow } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 402cbf0f2027c..bce9f37026c97 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -386,7 +386,11 @@ class ArrowColumnWriter { Status WriteBatch(int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, const typename ParquetType::c_type* values) { - auto typed_writer = static_cast*>(writer_); + auto typed_writer = + ::arrow::internal::checked_cast*>(writer_); + // WriteBatch was called with type mismatching the writer_'s type. This + // could be a schema conversion problem. + DCHECK(typed_writer); PARQUET_CATCH_NOT_OK( typed_writer->WriteBatch(num_levels, def_levels, rep_levels, values)); return Status::OK(); @@ -397,7 +401,11 @@ class ArrowColumnWriter { const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const typename ParquetType::c_type* values) { - auto typed_writer = static_cast*>(writer_); + auto typed_writer = + ::arrow::internal::checked_cast*>(writer_); + // WriteBatchSpaced was called with type mismatching the writer_'s type. This + // could be a schema conversion problem. 
+ DCHECK(typed_writer); PARQUET_CATCH_NOT_OK(typed_writer->WriteBatchSpaced( num_levels, def_levels, rep_levels, valid_bits, valid_bits_offset, values)); return Status::OK(); @@ -570,20 +578,42 @@ NULLABLE_BATCH_FAST_PATH(DoubleType, ::arrow::DoubleType, double) NULLABLE_BATCH_FAST_PATH(Int64Type, ::arrow::TimestampType, int64_t) NONNULLABLE_BATCH_FAST_PATH(Int64Type, ::arrow::TimestampType, int64_t) +#define CONV_CASE_LOOP(ConversionFunction) \ + for (int64_t i = 0; i < num_values; i++) \ + ConversionFunction(arrow_values[i], &output[i]); + +static void ConvertArrowTimestampToParquetInt96(const int64_t* arrow_values, + int64_t num_values, + ::arrow::TimeUnit ::type unit_type, + Int96* output) { + switch (unit_type) { + case TimeUnit::NANO: + CONV_CASE_LOOP(internal::NanosecondsToImpalaTimestamp); + break; + case TimeUnit::MICRO: + CONV_CASE_LOOP(internal::MicrosecondsToImpalaTimestamp); + break; + case TimeUnit::MILLI: + CONV_CASE_LOOP(internal::MillisecondsToImpalaTimestamp); + break; + case TimeUnit::SECOND: + CONV_CASE_LOOP(internal::SecondsToImpalaTimestamp); + break; + } +} + +#undef CONV_CASE_LOOP + template <> Status ArrowColumnWriter::WriteNullableBatch( const ::arrow::TimestampType& type, int64_t num_values, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const int64_t* values) { - Int96* buffer; + Int96* buffer = nullptr; RETURN_NOT_OK(ctx_->GetScratchData(num_values, &buffer)); - if (type.unit() == TimeUnit::NANO) { - for (int i = 0; i < num_values; i++) { - internal::NanosecondsToImpalaTimestamp(values[i], &buffer[i]); - } - } else { - return Status::NotImplemented("Only NANO timestamps are supported for Int96 writing"); - } + + ConvertArrowTimestampToParquetInt96(values, num_values, type.unit(), buffer); + return WriteBatchSpaced(num_levels, def_levels, rep_levels, valid_bits, valid_bits_offset, buffer); } @@ -592,15 +622,11 @@ template <> Status ArrowColumnWriter::WriteNonNullableBatch( const ::arrow::TimestampType& type, int64_t num_values, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, const int64_t* values) { - Int96* buffer; + Int96* buffer = nullptr; RETURN_NOT_OK(ctx_->GetScratchData(num_values, &buffer)); - if (type.unit() == TimeUnit::NANO) { - for (int i = 0; i < num_values; i++) { - internal::NanosecondsToImpalaTimestamp(values[i], buffer + i); - } - } else { - return Status::NotImplemented("Only NANO timestamps are supported for Int96 writing"); - } + + ConvertArrowTimestampToParquetInt96(values, num_values, type.unit(), buffer); + return WriteBatch(num_levels, def_levels, rep_levels, buffer); } @@ -611,21 +637,15 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level const bool is_nanosecond = type.unit() == TimeUnit::NANO; - // In the case where support_deprecated_int96_timestamps was specified - // and coerce_timestamps_enabled was specified, a nanosecond column - // will have a physical type of int64. In that case, we fall through - // to the else if below. - // - // See https://issues.apache.org/jira/browse/ARROW-2082 - if (is_nanosecond && ctx_->properties->support_deprecated_int96_timestamps() && - !ctx_->properties->coerce_timestamps_enabled()) { + if (ctx_->properties->support_deprecated_int96_timestamps()) { + // The user explicitly required to use Int96 storage. 
return TypedWriteBatch(values, num_levels, def_levels, rep_levels); } else if (is_nanosecond || (ctx_->properties->coerce_timestamps_enabled() && (type.unit() != ctx_->properties->coerce_timestamps_unit()))) { // Casting is required. This covers several cases - // * Nanoseconds -> cast to microseconds + // * Nanoseconds -> cast to microseconds (until ARROW-3729 is resolved) // * coerce_timestamps_enabled_, cast all timestamps to requested unit return WriteTimestampsCoerce(ctx_->properties->truncated_timestamps_allowed(), values, num_levels, def_levels, rep_levels); diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h index 2538c028002e4..50cb4cfea7d8d 100644 --- a/cpp/src/parquet/arrow/writer.h +++ b/cpp/src/parquet/arrow/writer.h @@ -45,19 +45,19 @@ class PARQUET_EXPORT ArrowWriterProperties { class Builder { public: Builder() - : write_nanos_as_int96_(false), + : write_timestamps_as_int96_(false), coerce_timestamps_enabled_(false), coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), truncated_timestamps_allowed_(false) {} virtual ~Builder() {} Builder* disable_deprecated_int96_timestamps() { - write_nanos_as_int96_ = false; + write_timestamps_as_int96_ = false; return this; } Builder* enable_deprecated_int96_timestamps() { - write_nanos_as_int96_ = true; + write_timestamps_as_int96_ = true; return this; } @@ -79,19 +79,19 @@ class PARQUET_EXPORT ArrowWriterProperties { std::shared_ptr build() { return std::shared_ptr(new ArrowWriterProperties( - write_nanos_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, + write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, truncated_timestamps_allowed_)); } private: - bool write_nanos_as_int96_; + bool write_timestamps_as_int96_; bool coerce_timestamps_enabled_; ::arrow::TimeUnit::type coerce_timestamps_unit_; bool truncated_timestamps_allowed_; }; - bool support_deprecated_int96_timestamps() const { return write_nanos_as_int96_; } + bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } ::arrow::TimeUnit::type coerce_timestamps_unit() const { @@ -105,12 +105,12 @@ class PARQUET_EXPORT ArrowWriterProperties { bool coerce_timestamps_enabled, ::arrow::TimeUnit::type coerce_timestamps_unit, bool truncated_timestamps_allowed) - : write_nanos_as_int96_(write_nanos_as_int96), + : write_timestamps_as_int96_(write_nanos_as_int96), coerce_timestamps_enabled_(coerce_timestamps_enabled), coerce_timestamps_unit_(coerce_timestamps_unit), truncated_timestamps_allowed_(truncated_timestamps_allowed) {} - const bool write_nanos_as_int96_; + const bool write_timestamps_as_int96_; const bool coerce_timestamps_enabled_; const ::arrow::TimeUnit::type coerce_timestamps_unit_; const bool truncated_timestamps_allowed_; @@ -208,24 +208,52 @@ namespace internal { * Timestamp conversion constants */ constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588); -constexpr int64_t kNanosecondsPerDay = INT64_C(86400000000000); -/** - * Converts nanosecond timestamps to Impala (Int96) format - */ -inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds, - Int96* impala_timestamp) { - int64_t julian_days = (nanoseconds / kNanosecondsPerDay) + kJulianEpochOffsetDays; +template +inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) { + int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays; (*impala_timestamp).value[2] = (uint32_t)julian_days; - int64_t 
last_day_nanos = nanoseconds % kNanosecondsPerDay; + int64_t last_day_units = time % UnitPerDay; int64_t* impala_last_day_nanos = reinterpret_cast(impala_timestamp); - *impala_last_day_nanos = last_day_nanos; + *impala_last_day_nanos = last_day_units * NanosecondsPerUnit; +} + +constexpr int64_t kSecondsInNanos = INT64_C(1000000000); + +inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp(seconds, + impala_timestamp); +} + +constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000); + +inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + milliseconds, impala_timestamp); +} + +constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000); + +inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + microseconds, impala_timestamp); +} + +constexpr int64_t kNanosecondsInNanos = INT64_C(1); + +inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + nanoseconds, impala_timestamp); } } // namespace internal } // namespace arrow + } // namespace parquet #endif // PARQUET_ARROW_WRITER_H diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index b27718027b0da..1812f5547abc2 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -175,6 +175,19 @@ struct FixedLenByteArray { using FLBA = FixedLenByteArray; +// Julian day at unix epoch. +// +// The Julian Day Number (JDN) is the integer assigned to a whole solar day in +// the Julian day count starting from noon Universal time, with Julian day +// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC, +// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian +// calendar), +constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588); +constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24); +constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000); +constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000); +constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000); + MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; }; STRUCT_END(Int96, 12); @@ -192,6 +205,14 @@ static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } +static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { + int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays; + int64_t nanoseconds = 0; + + memcpy(&nanoseconds, &i96.value, sizeof(int64_t)); + return days_since_epoch * kNanosecondsPerDay + nanoseconds; +} + static inline std::string Int96ToString(const Int96& a) { std::ostringstream result; std::copy(a.value, a.value + 3, std::ostream_iterator(result, " ")); diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index b89145adc4433..feaa890fc6cd9 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -284,8 +284,8 @@ def _sanitize_table(table, new_schema, flavor): Specify if we should use dictionary encoding in general or only for some columns. use_deprecated_int96_timestamps : boolean, default None - Write nanosecond resolution timestamps to INT96 Parquet - format. Defaults to False unless enabled by flavor argument + Write timestamps to INT96 Parquet format. 
Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. coerce_timestamps : string, default None Cast timestamps a particular resolution. Valid values: {None, 'ms', 'us'} diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 5c27a9b86a369..82c80e9e09d13 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -844,7 +844,7 @@ def test_date_time_types(): a2 = pa.array(data2, type=t2) t3 = pa.timestamp('us') - start = pd.Timestamp('2000-01-01').value / 1000 + start = pd.Timestamp('2001-01-01').value / 1000 data3 = np.array([start, start + 1, start + 2], dtype='int64') a3 = pa.array(data3, type=t3) @@ -892,8 +892,9 @@ def test_date_time_types(): # date64 as date32 # time32[s] to time32[ms] + # 'timestamp[ms]' is saved as INT96 timestamp # 'timestamp[ns]' is saved as INT96 timestamp - expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], + expected = pa.Table.from_arrays([a1, a1, a7, a4, a5, ex_a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', From 1fd2a25ec0b890a12837cbbfb4c431d2506d1845 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 15 Dec 2018 23:18:42 +0100 Subject: [PATCH 06/80] ARROW-3953: [Python] Compat with pandas 0.24 rename of MultiIndex labels -> codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Joris Van den Bossche Author: Krisztián Szűcs Closes #3120 from jorisvandenbossche/pandas-multiindex-codes and squashes the following commits: e5442a5e test no warns 329f3e47 Compat with pandas 0.24 rename of MultiIndex labels -> codes --- python/pyarrow/pandas_compat.py | 14 +++++++++++--- python/pyarrow/tests/test_convert_pandas.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index ec0e490291384..0eebcf6e1eec3 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -726,6 +726,14 @@ def _pandas_type_to_numpy_type(pandas_type): return np.dtype(pandas_type) +def _get_multiindex_codes(mi): + # compat for pandas < 0.24 (MI labels renamed to codes). + if isinstance(mi, pd.MultiIndex): + return mi.codes if hasattr(mi, 'codes') else mi.labels + else: + return None + + def _reconstruct_columns_from_metadata(columns, column_indexes): """Construct a pandas MultiIndex from `columns` and column index metadata in `column_indexes`. @@ -752,7 +760,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): # Get levels and labels, and provide sane defaults if the index has a # single level to avoid if/else spaghetti. 
levels = getattr(columns, 'levels', None) or [columns] - labels = getattr(columns, 'labels', None) or [ + labels = _get_multiindex_codes(columns) or [ pd.RangeIndex(len(level)) for level in levels ] @@ -779,7 +787,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): new_levels.append(level) - return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names) + return pd.MultiIndex(new_levels, labels, names=columns.names) def _table_to_blocks(options, block_table, memory_pool, categories): @@ -796,7 +804,7 @@ def _table_to_blocks(options, block_table, memory_pool, categories): def _flatten_single_level_multiindex(index): if isinstance(index, pd.MultiIndex) and index.nlevels == 1: levels, = index.levels - labels, = index.labels + labels, = _get_multiindex_codes(index) # Cheaply check that we do not somehow have duplicate column names if not index.is_unique: diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index ce9d6d117acb2..4d283b3150606 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -176,6 +176,16 @@ def test_multiindex_columns_unicode(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) _check_pandas_roundtrip(df, preserve_index=True) + def test_multiindex_doesnt_warn(self): + # ARROW-3953: pandas 0.24 rename of MultiIndex labels to codes + columns = pd.MultiIndex.from_arrays([['one', 'two'], ['X', 'Y']]) + df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) + + with pytest.warns(None) as record: + _check_pandas_roundtrip(df, preserve_index=True) + + assert len(record) == 0 + def test_integer_index_column(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')]) _check_pandas_roundtrip(df, preserve_index=True) From 715cba576db31bf643885cee6d3eb02f90ab001b Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Sun, 16 Dec 2018 10:29:15 +0900 Subject: [PATCH 07/80] ARROW-4035: [Ruby] Support msys2 mingw dependencies Author: Yosuke Shiro Closes #3181 from shiro615/support-msys2-mingw-dependencies and squashes the following commits: e20dce3c Support msys2 mingw dependencies --- ruby/red-arrow/red-arrow.gemspec | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec index cca87749ea19c..3f0f68aa332cf 100644 --- a/ruby/red-arrow/red-arrow.gemspec +++ b/ruby/red-arrow/red-arrow.gemspec @@ -52,4 +52,6 @@ Gem::Specification.new do |spec| spec.add_development_dependency("bundler") spec.add_development_dependency("rake") spec.add_development_dependency("test-unit") + + spec.metadata["msys2_mingw_dependencies"] = "apache-arrow" end From ac047b2fd2893b711116bf8c6b7df8b60c6dd8e3 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Sun, 16 Dec 2018 22:58:47 +0900 Subject: [PATCH 08/80] ARROW-4048: [GLib] Return ChunkedArray instead of Array in gparquet_arrow_file_reader_read_column Because `FileReader::ReadColumn(int i, std::shared_ptr* out)` is deprecated since 0.12. 
Author: Yosuke Shiro Closes #3192 from shiro615/glib-return-chunked-array-instead-of-array and squashes the following commits: b814c9a0 Add arrow_ prefix to Arrow C++ objects bd9c466e Return ChunkedArray instead of Array in gparquet_arrow_file_reader_read_column --- c_glib/parquet-glib/arrow-file-reader.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp b/c_glib/parquet-glib/arrow-file-reader.cpp index 398e85b02c08a..5c16e827fc14b 100644 --- a/c_glib/parquet-glib/arrow-file-reader.cpp +++ b/c_glib/parquet-glib/arrow-file-reader.cpp @@ -310,8 +310,8 @@ gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader, return NULL; } - std::shared_ptr arrow_array; - status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_array); + std::shared_ptr arrow_chunked_array; + status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_chunked_array); if (!garrow_error_check(error, status, "[parquet][arrow][file-reader][read-column]")) { @@ -319,7 +319,7 @@ gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader, } auto arrow_field = arrow_schema->field(0); - auto arrow_column = std::make_shared(arrow_field, arrow_array); + auto arrow_column = std::make_shared(arrow_field, arrow_chunked_array); return garrow_column_new_raw(&arrow_column); } From 77d3a46e14c5292024619c1fb08bba444c42b52c Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 16 Dec 2018 14:01:53 -0600 Subject: [PATCH 09/80] ARROW-4049: [C++] Arrow never use glog even though glog is linked. The following is a part of arrow/util/logging.cc. ``` #ifdef ARROW_USE_GLOG typedef google::LogMessage LoggingProvider; #else typedef CerrLog LoggingProvider; #endif ``` As you see, when ARROW_USE_GLOG is defined, glog is intended to be used but it's not never defined and glog is never used. I've fixed this by adding `add_definition` command when CMake variable `ARROW_USE_GLOG` is ON. 
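As a small standalone illustration (plain C++, not Arrow code): which branch of such an `#ifdef` gets compiled is decided purely by whether `-DARROW_USE_GLOG` reaches the compiler command line, which is what the `add_definitions("-DARROW_USE_GLOG")` call added below arranges.

```
// Toy program: compile with and without -DARROW_USE_GLOG to see the branch flip.
#include <iostream>

int main() {
#ifdef ARROW_USE_GLOG
  std::cout << "glog-backed logging would be selected" << std::endl;
#else
  std::cout << "fallback CerrLog logging would be selected" << std::endl;
#endif
  return 0;
}
```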
Author: Kousuke Saruta Closes #3196 from sarutak/arrow-use-glog and squashes the following commits: 87be74161 Fix to use glog --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 54ec1e5ef6501..e3cc3f560a95f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -713,6 +713,7 @@ endif() if (ARROW_USE_GLOG) SET(ARROW_STATIC_LINK_LIBS glog_static ${ARROW_STATIC_LINK_LIBS}) + add_definitions("-DARROW_USE_GLOG") endif() if (ARROW_STATIC_LINK_LIBS) From 5d1934fc3f5c65f70a3966b71c68941b2fd8d362 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Mon, 17 Dec 2018 09:57:20 +0900 Subject: [PATCH 10/80] ARROW-4034: [Ruby] Add support :append option to FileOutputStream Author: Kouhei Sutou Closes #3193 from kou/ruby-file-output-stream-append and squashes the following commits: 6240f4b7 Add support :append option to FileOutputStream --- ruby/red-arrow/lib/arrow/field.rb | 1 + .../red-arrow/lib/arrow/file-output-stream.rb | 34 ++++++++++++ ruby/red-arrow/lib/arrow/loader.rb | 1 + ruby/red-arrow/lib/arrow/table.rb | 1 + .../red-arrow/test/test-file-output-stream.rb | 54 +++++++++++++++++++ 5 files changed, 91 insertions(+) create mode 100644 ruby/red-arrow/lib/arrow/file-output-stream.rb create mode 100644 ruby/red-arrow/test/test-file-output-stream.rb diff --git a/ruby/red-arrow/lib/arrow/field.rb b/ruby/red-arrow/lib/arrow/field.rb index b1ed1149deca9..be5865fd5564c 100644 --- a/ruby/red-arrow/lib/arrow/field.rb +++ b/ruby/red-arrow/lib/arrow/field.rb @@ -18,6 +18,7 @@ module Arrow class Field alias_method :initialize_raw, :initialize + private :initialize_raw def initialize(name, data_type) case data_type when String, Symbol diff --git a/ruby/red-arrow/lib/arrow/file-output-stream.rb b/ruby/red-arrow/lib/arrow/file-output-stream.rb new file mode 100644 index 0000000000000..f39ad14cacf5b --- /dev/null +++ b/ruby/red-arrow/lib/arrow/file-output-stream.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Arrow + class FileOutputStream + alias_method :initialize_raw, :initialize + private :initialize_raw + def initialize(path, options={}) + append = nil + case options + when true, false + append = options + when Hash + append = options[:append] + end + append = false if append.nil? 
+ initialize_raw(path, append) + end + end +end diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb index 736f25bd60438..2092e461c1786 100644 --- a/ruby/red-arrow/lib/arrow/loader.rb +++ b/ruby/red-arrow/lib/arrow/loader.rb @@ -44,6 +44,7 @@ def require_libraries require "arrow/date64-array" require "arrow/date64-array-builder" require "arrow/field" + require "arrow/file-output-stream" require "arrow/path-extension" require "arrow/record" require "arrow/record-batch" diff --git a/ruby/red-arrow/lib/arrow/table.rb b/ruby/red-arrow/lib/arrow/table.rb index 524517f03b9e6..69a1de31722a3 100644 --- a/ruby/red-arrow/lib/arrow/table.rb +++ b/ruby/red-arrow/lib/arrow/table.rb @@ -29,6 +29,7 @@ def load(path, options={}) end alias_method :initialize_raw, :initialize + private :initialize_raw def initialize(schema_or_raw_table_or_columns, columns=nil) if columns.nil? if schema_or_raw_table_or_columns[0].is_a?(Column) diff --git a/ruby/red-arrow/test/test-file-output-stream.rb b/ruby/red-arrow/test/test-file-output-stream.rb new file mode 100644 index 0000000000000..559406a4e1efe --- /dev/null +++ b/ruby/red-arrow/test/test-file-output-stream.rb @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestFileOutputStream < Test::Unit::TestCase + sub_test_case(".open") do + def setup + @file = Tempfile.open("arrow-file-output-stream") + @file.write("Hello") + @file.close + end + + def test_default + Arrow::FileOutputStream.open(@file.path) do |file| + file.write(" World") + end + assert_equal(" World", File.read(@file.path)) + end + + def test_options_append + Arrow::FileOutputStream.open(@file.path, append: true) do |file| + file.write(" World") + end + assert_equal("Hello World", File.read(@file.path)) + end + + def test_append_true + Arrow::FileOutputStream.open(@file.path, true) do |file| + file.write(" World") + end + assert_equal("Hello World", File.read(@file.path)) + end + + def test_append_false + Arrow::FileOutputStream.open(@file.path, false) do |file| + file.write(" World") + end + assert_equal(" World", File.read(@file.path)) + end + end +end From 63fd350045edca00f4ddd0c2de23f87fecd3f323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 12:34:27 +0100 Subject: [PATCH 11/80] ARROW-4043: [Packaging/Docker] Python tests on alpine miss pytest dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crossbow tests: [kszucs/crossbow/build-376](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-376) Author: Krisztián Szűcs Closes #3186 from kszucs/ARROW-4043 and squashes the following commits: d4bb8149 missing requirements.txt ab88181d remove redundant pandas dependency b2a89dff install tests dependencies from requirements-test.txt --- python/Dockerfile.alpine | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/Dockerfile.alpine b/python/Dockerfile.alpine index 7eedeac2860b0..ba0f2eb23f549 100644 --- a/python/Dockerfile.alpine +++ b/python/Dockerfile.alpine @@ -27,8 +27,10 @@ RUN export PYTHON_MAJOR=${PYTHON_VERSION:0:1} && \ pip install --upgrade pip setuptools # install python requirements -ADD python/requirements.txt /arrow/python/ -RUN pip install -r /arrow/python/requirements.txt cython pandas +ADD python/requirements.txt \ + python/requirements-test.txt \ + /arrow/python/ +RUN pip install -r /arrow/python/requirements-test.txt cython ENV ARROW_PYTHON=ON \ PYARROW_WITH_PARQUET=0 From 51f5e94612c92e81017898ab753f04dd55a868d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 15:56:45 +0100 Subject: [PATCH 12/80] ARROW-4041: [CI] Python 2.7 run uses Python 3.6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Closes #3190 from kszucs/ARROW-4041 and squashes the following commits: 75d3cc91 remove python from env file 3abec8a7 single conda create command ba6a820e don't update python on travis --- ci/conda_env_python.yml | 1 - ci/travis_install_toolchain.sh | 1 - ci/travis_script_python.sh | 15 ++++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index c187155275eaa..d3756cbcfa8c9 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -22,7 +22,6 @@ nomkl numpy pandas pytest -python rsync setuptools setuptools_scm diff --git a/ci/travis_install_toolchain.sh b/ci/travis_install_toolchain.sh index 86ac56d043b96..82031e8fd362f 100755 --- a/ci/travis_install_toolchain.sh +++ b/ci/travis_install_toolchain.sh @@ -31,7 +31,6 @@ if [ ! 
-e $CPP_TOOLCHAIN ]; then --file=$TRAVIS_BUILD_DIR/ci/conda_env_cpp.yml \ ${CONDA_LLVM} \ ccache \ - curl \ ninja \ nomkl \ python=3.6 diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6d96ebe2dfb0b..b8385c3834266 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -32,9 +32,6 @@ PYARROW_PYTEST_FLAGS=" -r sxX --durations=15 --parquet" PYTHON_VERSION=$1 CONDA_ENV_DIR=$TRAVIS_BUILD_DIR/pyarrow-test-$PYTHON_VERSION -conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl -conda activate $CONDA_ENV_DIR - # We should use zlib in the target Python directory to avoid loading # wrong libpython on macOS at run-time. If we use zlib in # $ARROW_BUILD_TOOLCHAIN and libpython3.6m.dylib exists in both @@ -44,19 +41,23 @@ conda activate $CONDA_ENV_DIR # python-test fails. export ZLIB_HOME=$CONDA_ENV_DIR -python --version -which python - if [ $ARROW_TRAVIS_PYTHON_JVM == "1" ]; then CONDA_JVM_DEPS="jpype1" fi -conda install -y -q \ +conda create -y -q -p $CONDA_ENV_DIR \ --file $TRAVIS_BUILD_DIR/ci/conda_env_python.yml \ + cmake \ pip \ numpy=1.13.1 \ + python=${PYTHON_VERSION} \ ${CONDA_JVM_DEPS} +conda activate $CONDA_ENV_DIR + +python --version +which python + if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ] && [ "$PYTHON_VERSION" == "3.6" ]; then # Install documentation dependencies conda install -y -c conda-forge --file ci/conda_env_sphinx.yml From 4cfd6d3877e28624e271e022f7c98a8b1e3c5a5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 16:12:36 +0100 Subject: [PATCH 13/80] ARROW-4045: [Packaging/Python] Add hypothesis test dependency to wheel crossbow tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test builds: [kszucs/crossbow/build-383](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-383) Author: Krisztián Szűcs Closes #3188 from kszucs/ARROW-4045 and squashes the following commits: a2bcdaf4 correct path 11093d97 missing cython on osx f06fb949 pin numpy version in appveyor.yml cabedfba remove last pandas version from tasks.yml 5309a5b2 requirements-wheel 97bc6ead fix requirements.txt path on osx c17d6748 win aa05e743 linux 6be30182 osx --- dev/release/rat_exclude_files.txt | 1 + dev/release/verify-release-candidate.sh | 2 +- dev/tasks/python-wheels/appveyor.yml | 2 +- dev/tasks/python-wheels/linux-test.sh | 2 +- dev/tasks/python-wheels/osx-build.sh | 12 ++++-------- dev/tasks/python-wheels/travis.linux.yml | 1 - dev/tasks/python-wheels/travis.osx.yml | 2 -- dev/tasks/python-wheels/win-build.bat | 2 +- dev/tasks/tasks.yml | 20 -------------------- python/manylinux1/build_arrow.sh | 11 +++-------- python/requirements-test.txt | 1 - python/requirements-wheel.txt | 4 ++++ 12 files changed, 16 insertions(+), 44 deletions(-) create mode 100644 python/requirements-wheel.txt diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index e274d97548068..f2e3f164fa284 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -130,6 +130,7 @@ python/pyarrow/includes/__init__.pxd python/pyarrow/tests/__init__.py python/requirements.txt python/requirements-test.txt +python/requirements-wheel.txt pax_global_header MANIFEST.in __init__.pxd diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 45404b03dfb8a..71324ec12f7c5 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -189,7 +189,7 @@ 
test_and_install_cpp() { test_python() { pushd python - pip install -r requirements-test.txt + pip install -r requirements.txt -r requirements-test.txt python setup.py build_ext --inplace --with-parquet --with-plasma py.test pyarrow -v --pdb diff --git a/dev/tasks/python-wheels/appveyor.yml b/dev/tasks/python-wheels/appveyor.yml index 016041a6c6701..c220f922bc45c 100644 --- a/dev/tasks/python-wheels/appveyor.yml +++ b/dev/tasks/python-wheels/appveyor.yml @@ -20,7 +20,7 @@ os: Visual Studio 2015 environment: ARCH: "64" GENERATOR: Visual Studio 14 2015 Win64 - NUMPY: "{{ numpy_version }}" + NUMPY: "1.14.5" PYTHON: "{{ python_version }}" MSVC_DEFAULT_OPTIONS: ON ARROW_SRC: C:\apache-arrow diff --git a/dev/tasks/python-wheels/linux-test.sh b/dev/tasks/python-wheels/linux-test.sh index 163730a9f38da..234ce8d561cec 100755 --- a/dev/tasks/python-wheels/linux-test.sh +++ b/dev/tasks/python-wheels/linux-test.sh @@ -30,5 +30,5 @@ python -c "import pyarrow.parquet" python -c "import pyarrow.plasma" # Run pyarrow tests -pip install pytest pandas +pip install -r /arrow/python/requirements-test.txt pytest --pyargs pyarrow diff --git a/dev/tasks/python-wheels/osx-build.sh b/dev/tasks/python-wheels/osx-build.sh index 5c69904ff4348..22c44c157337f 100755 --- a/dev/tasks/python-wheels/osx-build.sh +++ b/dev/tasks/python-wheels/osx-build.sh @@ -99,9 +99,8 @@ function build_wheel { # build will also work with newer NumPy versions. export ARROW_HOME=`pwd`/arrow-dist export PARQUET_HOME=`pwd`/arrow-dist - if [ -n "$BUILD_DEPENDS" ]; then - pip install $(pip_opts) $BUILD_DEPENDS - fi + + pip install $(pip_opts) -r python/requirements-wheel.txt cython pushd cpp mkdir build @@ -161,10 +160,6 @@ function install_run { wheelhouse="$PWD/python/dist" - # Install test dependencies and built wheel - if [ -n "$TEST_DEPENDS" ]; then - pip install $(pip_opts) $TEST_DEPENDS - fi # Install compatible wheel pip install $(pip_opts) \ $(python $multibuild_dir/supported_wheels.py $wheelhouse/*.whl) @@ -179,7 +174,8 @@ function install_run { python -c "import pyarrow.plasma" # Run pyarrow tests - pip install pytest pytest-faulthandler + pip install $(pip_opts) -r python/requirements-test.txt + py.test --pyargs pyarrow popd diff --git a/dev/tasks/python-wheels/travis.linux.yml b/dev/tasks/python-wheels/travis.linux.yml index 9a8f804d1cc51..17888ccc9f1bb 100644 --- a/dev/tasks/python-wheels/travis.linux.yml +++ b/dev/tasks/python-wheels/travis.linux.yml @@ -42,7 +42,6 @@ script: - docker run --shm-size=2g -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.version }} -e PYTHON_VERSIONS="{{ python_version }},{{ unicode_width }}" - -e WHEEL_VERSION={{ wheel_version }} -v $PWD:/io -v $PWD/../../:/arrow quay.io/xhochy/arrow_manylinux1_x86_64_base:latest /io/build_arrow.sh diff --git a/dev/tasks/python-wheels/travis.osx.yml b/dev/tasks/python-wheels/travis.osx.yml index 2f0d168a3fb46..c6bd010da4ebc 100644 --- a/dev/tasks/python-wheels/travis.osx.yml +++ b/dev/tasks/python-wheels/travis.osx.yml @@ -29,8 +29,6 @@ env: - PYARROW_VERSION={{ arrow.version }} - PYARROW_BUILD_VERBOSE=1 - MB_PYTHON_VERSION={{ python_version }} - - BUILD_DEPENDS="wheel=={{ wheel_version }} numpy=={{ numpy_version }} cython==0.27.3 six" - - TEST_DEPENDS="numpy=={{ numpy_version }} pandas=={{ pandas_version }} six" before_install: - git clone https://github.com/matthew-brett/multibuild # TODO pin it diff --git a/dev/tasks/python-wheels/win-build.bat b/dev/tasks/python-wheels/win-build.bat index 22e306ab1f1eb..f85c8e8b7490e 100644 --- a/dev/tasks/python-wheels/win-build.bat 
+++ b/dev/tasks/python-wheels/win-build.bat @@ -82,7 +82,7 @@ popd @rem test the wheel call deactivate conda create -n wheel-test -q -y python=%PYTHON% ^ - numpy=%NUMPY% pandas pytest + numpy=%NUMPY% pandas pytest hypothesis call activate wheel-test pip install --no-index --find-links=%ARROW_SRC%\python\dist\ pyarrow diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index bd49616f6bd3e..ea104d507eec1 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -146,7 +146,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 2.7 unicode_width: 16 test_docker_images: [] @@ -157,7 +156,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 2.7 unicode_width: 32 test_docker_images: @@ -169,7 +167,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.5 unicode_width: 16 test_docker_images: @@ -181,7 +178,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.6 unicode_width: 16 test_docker_images: @@ -193,7 +189,6 @@ tasks: platform: linux template: python-wheels/travis.linux.yml params: - wheel_version: 0.31.1 python_version: 3.7 unicode_width: 16 test_docker_images: @@ -207,10 +202,7 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 2.7 - wheel_version: 0.31.1 artifacts: - pyarrow-{version}-cp27-cp27m-macosx_10_6_intel.whl @@ -218,10 +210,7 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.5 - wheel_version: 0.31.1 artifacts: - pyarrow-{version}-cp35-cp35m-macosx_10_6_intel.whl @@ -229,10 +218,7 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.6 - wheel_version: 0.31.1 artifacts: - pyarrow-{version}-cp36-cp36m-macosx_10_6_intel.whl @@ -240,10 +226,7 @@ tasks: platform: osx template: python-wheels/travis.osx.yml params: - numpy_version: 1.14.5 - pandas_version: 0.23.0 python_version: 3.7 - wheel_version: 0.31.1 artifacts: - pyarrow-{version}-cp37-cp37m-macosx_10_6_intel.whl @@ -253,7 +236,6 @@ tasks: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.5 artifacts: - pyarrow-{version}-cp35-cp35m-win_amd64.whl @@ -262,7 +244,6 @@ tasks: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.6 artifacts: - pyarrow-{version}-cp36-cp36m-win_amd64.whl @@ -271,7 +252,6 @@ tasks: platform: win template: python-wheels/appveyor.yml params: - numpy_version: 1.14.5 python_version: 3.7 artifacts: - pyarrow-{version}-cp37-cp37m-win_amd64.whl diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index 904297375ef25..b1d8f8588dfc5 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -64,11 +64,6 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do fi fi - # pin wheel, because auditwheel is not compatible with wheel=0.32 - # pin after installing tensorflow, because it updates to wheel=0.32 - # TODO(kszucs): remove after auditwheel properly supports wheel>0.31 - $PIP install "wheel==${WHEEL_VERSION:-0.31.1}" - echo "=== (${PYTHON}) Building Arrow C++ libraries ===" ARROW_BUILD_DIR=/tmp/build-PY${PYTHON}-${U_WIDTH} mkdir -p "${ARROW_BUILD_DIR}" @@ 
-96,6 +91,9 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do # Check that we don't expose any unwanted symbols /io/scripts/check_arrow_visibility.sh + echo "=== (${PYTHON}) Install the wheel build dependencies ===" + $PIP install -r requirements-wheel.txt + # Clear output directory rm -rf dist/ echo "=== (${PYTHON}) Building wheel ===" @@ -107,9 +105,6 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py bdist_wheel PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py sdist - echo "=== (${PYTHON}) Ensure the existence of mandatory modules ===" - $PIP install -r requirements.txt - echo "=== (${PYTHON}) Tag the wheel with manylinux1 ===" mkdir -p repaired_wheels/ auditwheel -v repair -L . dist/pyarrow-*.whl -w repaired_wheels/ diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 482e88860669a..89af5ecac437c 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -1,4 +1,3 @@ --r requirements.txt pandas pytest hypothesis diff --git a/python/requirements-wheel.txt b/python/requirements-wheel.txt new file mode 100644 index 0000000000000..c44903efd36cb --- /dev/null +++ b/python/requirements-wheel.txt @@ -0,0 +1,4 @@ +wheel==0.31.1 +six>=1.0.0 +numpy==1.14.5 +futures; python_version < "3.2" From 0190e60e4abd4f07428f8d6c04e76f42f70d4ce3 Mon Sep 17 00:00:00 2001 From: "Korn, Uwe" Date: Mon, 17 Dec 2018 10:01:16 -0600 Subject: [PATCH 14/80] ARROW-4054: [Python] Update gtest, flatbuffers and OpenSSL in manylinux1 base image Author: Korn, Uwe Closes #3202 from xhochy/ARROW-4054 and squashes the following commits: d777fe98a ARROW-4054: Update gtest, flatbuffers and OpenSSL in manylinux1 base image --- python/manylinux1/Dockerfile-x86_64_base | 2 +- .../manylinux1/scripts/build_flatbuffers.sh | 2 +- python/manylinux1/scripts/build_gtest.sh | 25 ++++++++++++++----- python/manylinux1/scripts/build_openssl.sh | 10 +++++--- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/python/manylinux1/Dockerfile-x86_64_base b/python/manylinux1/Dockerfile-x86_64_base index d4b84629c1735..8ba205ee3754e 100644 --- a/python/manylinux1/Dockerfile-x86_64_base +++ b/python/manylinux1/Dockerfile-x86_64_base @@ -34,7 +34,7 @@ RUN /install_cmake.sh ADD scripts/build_gtest.sh / RUN /build_gtest.sh -ENV GTEST_HOME /googletest-release-1.7.0 +ENV GTEST_HOME /usr ADD scripts/build_flatbuffers.sh / RUN /build_flatbuffers.sh diff --git a/python/manylinux1/scripts/build_flatbuffers.sh b/python/manylinux1/scripts/build_flatbuffers.sh index 70b184c9a59c9..cae32f5aac959 100755 --- a/python/manylinux1/scripts/build_flatbuffers.sh +++ b/python/manylinux1/scripts/build_flatbuffers.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -export FLATBUFFERS_VERSION=1.9.0 +export FLATBUFFERS_VERSION=1.10.0 curl -sL https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_VERSION}.tar.gz \ -o flatbuffers-${FLATBUFFERS_VERSION}.tar.gz tar xf flatbuffers-${FLATBUFFERS_VERSION}.tar.gz diff --git a/python/manylinux1/scripts/build_gtest.sh b/python/manylinux1/scripts/build_gtest.sh index f921efd489d67..5b29f5ee535c8 100755 --- a/python/manylinux1/scripts/build_gtest.sh +++ b/python/manylinux1/scripts/build_gtest.sh @@ -16,11 +16,24 @@ # specific language governing permissions and limitations # under the License. 
-curl -sL https://github.com/google/googletest/archive/release-1.7.0.tar.gz -o googletest-release-1.7.0.tar.gz -tar xf googletest-release-1.7.0.tar.gz +GTEST_VERSION=1.8.1 + +curl -sL https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz -o googletest-release-${GTEST_VERSION}.tar.gz +tar xf googletest-release-${GTEST_VERSION}.tar.gz ls -l -pushd googletest-release-1.7.0 -cmake -DCMAKE_CXX_FLAGS='-fPIC' -Dgtest_force_shared_crt=ON . -make -j5 +pushd googletest-release-${GTEST_VERSION} + +mkdir build_so +pushd build_so +cmake -DCMAKE_CXX_FLAGS='-fPIC' -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DBUILD_GMOCK=OFF -GNinja -DCMAKE_INSTALL_PREFIX=/usr .. +ninja install +popd + +mkdir build_a +pushd build_a +cmake -DCMAKE_CXX_FLAGS='-fPIC' -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=OFF -DBUILD_GMOCK=OFF -GNinja -DCMAKE_INSTALL_PREFIX=/usr .. +ninja install +popd + popd -rm -rf googletest-release-1.7.0.tar.gz +rm -rf googletest-release-${GTEST_VERSION}.tar.gz diff --git a/python/manylinux1/scripts/build_openssl.sh b/python/manylinux1/scripts/build_openssl.sh index 1a54d72f04696..622004d37f2c0 100755 --- a/python/manylinux1/scripts/build_openssl.sh +++ b/python/manylinux1/scripts/build_openssl.sh @@ -16,11 +16,13 @@ # specific language governing permissions and limitations # under the License. -wget --no-check-certificate https://www.openssl.org/source/openssl-1.0.2k.tar.gz -O openssl-1.0.2k.tar.gz -tar xf openssl-1.0.2k.tar.gz -pushd openssl-1.0.2k +OPENSSL_VERSION="1.0.2q" + +wget --no-check-certificate https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz -O openssl-${OPENSSL_VERSION}.tar.gz +tar xf openssl-${OPENSSL_VERSION}.tar.gz +pushd openssl-${OPENSSL_VERSION} ./config -fpic shared --prefix=/usr make -j5 make install popd -rm -rf openssl-1.0.2k.tar.gz openssl-1.0.2k +rm -rf openssl-${OPENSSL_VERSION}.tar.gz openssl-${OPENSSL_VERSION} From 39861574f064af741921f80436343268b19a6a2d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 17 Dec 2018 10:07:18 -0600 Subject: [PATCH 15/80] ARROW-3879: [C++] Fix uninitialized member in CudaBufferWriter Author: Antoine Pitrou Closes #3200 from pitrou/ARROW-3879-cuda-writer-uninitialized-member and squashes the following commits: e857fed22 ARROW-3879: Fix uninitialized member in CudaBufferWriter --- cpp/src/arrow/gpu/cuda_memory.cc | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index cf0c51c23af02..a0da580acf927 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -221,9 +221,16 @@ class CudaBufferWriter::CudaBufferWriterImpl { mutable_data_ = buffer->mutable_data(); size_ = buffer->size(); position_ = 0; + closed_ = false; + } + +#define CHECK_CLOSED() \ + if (closed_) { \ + return Status::Invalid("Operation on closed CudaBufferWriter"); \ } Status Seek(int64_t position) { + CHECK_CLOSED(); if (position < 0 || position >= size_) { return Status::IOError("position out of bounds"); } @@ -234,12 +241,17 @@ class CudaBufferWriter::CudaBufferWriterImpl { Status Close() { if (!closed_) { closed_ = true; - RETURN_NOT_OK(Flush()); + RETURN_NOT_OK(FlushInternal()); } return Status::OK(); } Status Flush() { + CHECK_CLOSED(); + return FlushInternal(); + } + + Status FlushInternal() { if (buffer_size_ > 0 && buffer_position_ > 0) { // Only need to flush when the write has been buffered RETURN_NOT_OK( @@ -253,11 +265,13 @@ class CudaBufferWriter::CudaBufferWriterImpl { bool 
closed() const { return closed_; } Status Tell(int64_t* position) const { + CHECK_CLOSED(); *position = position_; return Status::OK(); } Status Write(const void* data, int64_t nbytes) { + CHECK_CLOSED(); if (nbytes == 0) { return Status::OK(); } @@ -283,11 +297,13 @@ class CudaBufferWriter::CudaBufferWriterImpl { Status WriteAt(int64_t position, const void* data, int64_t nbytes) { std::lock_guard guard(lock_); + CHECK_CLOSED(); RETURN_NOT_OK(Seek(position)); return Write(data, nbytes); } Status SetBufferSize(const int64_t buffer_size) { + CHECK_CLOSED(); if (buffer_position_ > 0) { // Flush any buffered data RETURN_NOT_OK(Flush()); @@ -303,6 +319,8 @@ class CudaBufferWriter::CudaBufferWriterImpl { int64_t buffer_position() const { return buffer_position_; } +#undef CHECK_CLOSED + private: std::shared_ptr context_; std::shared_ptr buffer_; From 836ad52aa54e665704f5bad5234d6cdad83bd20d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 17 Dec 2018 10:08:54 -0600 Subject: [PATCH 16/80] ARROW-4017: [C++] Move vendored libraries in dedicated directory Also update mapbox::variant to v1.1.5 (I'm not sure which version was previously vendored). Author: Antoine Pitrou Closes #3184 from pitrou/ARROW-4017-vendored-libraries and squashes the following commits: fe69566d7 ARROW-4017: Move vendored libraries in dedicated directory --- LICENSE.txt | 6 +- cpp/CMakeLists.txt | 6 +- cpp/build-support/clang_format_exclusions.txt | 7 +- cpp/build-support/lint_cpp_cli.py | 5 +- cpp/cmake_modules/BuildUtils.cmake | 3 +- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/util/CMakeLists.txt | 3 - cpp/src/arrow/util/parsing.h | 2 +- cpp/src/arrow/util/string_view.h | 2 +- cpp/src/arrow/util/variant.h | 1115 +---------------- cpp/src/arrow/util/variant/optional.h | 100 -- cpp/src/arrow/util/variant/variant_cast.h | 114 -- cpp/src/arrow/util/variant/variant_io.h | 72 -- cpp/src/arrow/util/variant/variant_visitor.h | 69 - .../string_view => vendored}/CMakeLists.txt | 6 +- cpp/src/arrow/{util => vendored}/date.h | 0 .../string_view => vendored}/string_view.hpp | 0 .../{util => vendored}/variant/CMakeLists.txt | 6 +- .../variant/recursive_wrapper.hpp} | 14 +- cpp/src/arrow/vendored/variant/variant.hpp | 1029 +++++++++++++++ cpp/src/arrow/vendored/variant/variant_io.hpp | 47 + .../vendored/variant/variant_visitor.hpp | 40 + .../arrow/{util => vendored}/xxhash/xxhash.c | 0 .../arrow/{util => vendored}/xxhash/xxhash.h | 0 .../gandiva/precompiled/epoch_time_point.h | 2 +- cpp/src/gandiva/to_date_holder.cc | 2 +- cpp/src/plasma/client.cc | 2 +- dev/release/rat_exclude_files.txt | 10 +- 28 files changed, 1165 insertions(+), 1498 deletions(-) delete mode 100644 cpp/src/arrow/util/variant/optional.h delete mode 100644 cpp/src/arrow/util/variant/variant_cast.h delete mode 100644 cpp/src/arrow/util/variant/variant_io.h delete mode 100644 cpp/src/arrow/util/variant/variant_visitor.h rename cpp/src/arrow/{util/string_view => vendored}/CMakeLists.txt (88%) rename cpp/src/arrow/{util => vendored}/date.h (100%) rename cpp/src/arrow/{util/string_view => vendored}/string_view.hpp (100%) rename cpp/src/arrow/{util => vendored}/variant/CMakeLists.txt (83%) rename cpp/src/arrow/{util/variant/recursive_wrapper.h => vendored/variant/recursive_wrapper.hpp} (89%) create mode 100644 cpp/src/arrow/vendored/variant/variant.hpp create mode 100644 cpp/src/arrow/vendored/variant/variant_io.hpp create mode 100644 cpp/src/arrow/vendored/variant/variant_visitor.hpp rename cpp/src/arrow/{util => vendored}/xxhash/xxhash.c (100%) rename 
cpp/src/arrow/{util => vendored}/xxhash/xxhash.h (100%) diff --git a/LICENSE.txt b/LICENSE.txt index 5c9aaddc14ff8..572d3ef548917 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -681,7 +681,7 @@ See the License for the specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- -The file cpp/src/arrow/util/date.h has the following license (MIT) +The file cpp/src/arrow/vendored/date.h has the following license (MIT) The MIT License (MIT) Copyright (c) 2015, 2016, 2017 Howard Hinnant @@ -736,7 +736,7 @@ SOFTWARE. -------------------------------------------------------------------------------- -The file cpp/src/util/string_view/string_view.hpp has the following license +The file cpp/src/arrow/vendored/string_view.hpp has the following license Boost Software License - Version 1.0 - August 17th, 2003 @@ -764,7 +764,7 @@ DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- -The files in cpp/src/arrow/util/xxhash/ have the following license +The files in cpp/src/arrow/vendored/xxhash/ have the following license (BSD 2-Clause License) xxHash Library diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e3cc3f560a95f..f563199c62470 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -373,12 +373,8 @@ if (UNIX) IF(NOT ((item MATCHES "_generated.h") OR (item MATCHES "pyarrow_api.h") OR (item MATCHES "pyarrow_lib.h") OR - (item MATCHES "xxhash.h") OR - (item MATCHES "xxhash.cc") OR (item MATCHES "config.h") OR - (item MATCHES "util/date.h") OR - (item MATCHES "util/string_view/") OR - (item MATCHES "util/variant") OR + (item MATCHES "vendored/") OR (item MATCHES "zmalloc.h") OR (item MATCHES "ae.h"))) LIST(APPEND FILTERED_LINT_FILES ${item}) diff --git a/cpp/build-support/clang_format_exclusions.txt b/cpp/build-support/clang_format_exclusions.txt index c04523af1db81..2964898f4f24d 100644 --- a/cpp/build-support/clang_format_exclusions.txt +++ b/cpp/build-support/clang_format_exclusions.txt @@ -4,11 +4,6 @@ *pyarrow_lib.h *python/config.h *python/platform.h -*util/date.h -*util/string_view/* -*util/variant.h -*util/variant/* *thirdparty/ae/* -*xxhash.cc -*xxhash.h +*vendored/* *RcppExports.cpp* diff --git a/cpp/build-support/lint_cpp_cli.py b/cpp/build-support/lint_cpp_cli.py index 4c26927740dbb..c8b25dfc5e48f 100644 --- a/cpp/build-support/lint_cpp_cli.py +++ b/cpp/build-support/lint_cpp_cli.py @@ -70,13 +70,10 @@ def lint_file(path): EXCLUSIONS = [ 'arrow/python/iterators.h', - 'arrow/util/date.h', 'arrow/util/hashing.h', 'arrow/util/macros.h', 'arrow/util/parallel.h', - 'arrow/util/string_view/string_view.hpp', - 'arrow/util/xxhash/xxhash.c', - 'arrow/util/xxhash/xxhash.h', + 'arrow/vendored', 'arrow/visitor_inline.h', 'gandiva/cache.h', 'gandiva/jni', diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 7585ae9da8fa8..812d0c39e7fa5 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -580,7 +580,8 @@ function(ARROW_INSTALL_ALL_HEADERS PATH) set(multi_value_args PATTERN) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if (NOT ARG_PATTERN) - set(ARG_PATTERN "*.h") + # The .hpp extension is used by some vendored libraries + set(ARG_PATTERN "*.h" "*.hpp") endif() file(GLOB CURRENT_DIRECTORY_HEADERS ${ARG_PATTERN}) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index bec290df2aa37..9291addca0e1c 100644 
--- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -280,6 +280,7 @@ add_subdirectory(array) add_subdirectory(csv) add_subdirectory(io) add_subdirectory(util) +add_subdirectory(vendored) if(ARROW_FLIGHT) add_subdirectory(flight) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index a09797183212f..b13b2f367b022 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -72,6 +72,3 @@ ADD_ARROW_BENCHMARK(int-util-benchmark) ADD_ARROW_BENCHMARK(lazy-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) - -add_subdirectory(string_view) -add_subdirectory(variant) diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h index 23e0361235d3e..46d0f7c322b46 100644 --- a/cpp/src/arrow/util/parsing.h +++ b/cpp/src/arrow/util/parsing.h @@ -34,7 +34,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/date.h" +#include "arrow/vendored/date.h" namespace arrow { namespace internal { diff --git a/cpp/src/arrow/util/string_view.h b/cpp/src/arrow/util/string_view.h index 0f35483e3738e..a1a813726e4f0 100644 --- a/cpp/src/arrow/util/string_view.h +++ b/cpp/src/arrow/util/string_view.h @@ -18,7 +18,7 @@ #ifndef ARROW_UTIL_STRING_VIEW_H #define ARROW_UTIL_STRING_VIEW_H -#include "arrow/util/string_view/string_view.hpp" // IWYU pragma: export +#include "arrow/vendored/string_view.hpp" // IWYU pragma: export namespace arrow { namespace util { diff --git a/cpp/src/arrow/util/variant.h b/cpp/src/arrow/util/variant.h index 1aa9aa3732fdf..cb6500aef8044 100644 --- a/cpp/src/arrow/util/variant.h +++ b/cpp/src/arrow/util/variant.h @@ -1,1105 +1,34 @@ -// Copyright (c) MapBox -// All rights reserved. +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at // -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: +// http://www.apache.org/licenses/LICENSE-2.0 // -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. #ifndef ARROW_UTIL_VARIANT_H #define ARROW_UTIL_VARIANT_H -#include -#include // size_t -#include // operator new -#include // runtime_error -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - - -#ifdef _MSC_VER -// https://msdn.microsoft.com/en-us/library/bw1hbe6y.aspx -# ifdef NDEBUG -# define VARIANT_INLINE __forceinline -# else -# define VARIANT_INLINE //__declspec(noinline) -# endif -#else -# ifdef NDEBUG -# define VARIANT_INLINE //inline __attribute__((always_inline)) -# else -# define VARIANT_INLINE __attribute__((noinline)) -# endif -#endif -// clang-format on - -// Exceptions -#if defined( __EXCEPTIONS) || defined( _MSC_VER) -#define HAS_EXCEPTIONS -#endif - -#define VARIANT_MAJOR_VERSION 1 -#define VARIANT_MINOR_VERSION 1 -#define VARIANT_PATCH_VERSION 0 - -#define VARIANT_VERSION (VARIANT_MAJOR_VERSION * 100000) + (VARIANT_MINOR_VERSION * 100) + (VARIANT_PATCH_VERSION) +#include "arrow/vendored/variant/variant.hpp" // IWYU pragma: export namespace arrow { namespace util { -// XXX This should derive from std::logic_error instead of std::runtime_error. -// See https://github.com/mapbox/variant/issues/48 for details. -class bad_variant_access : public std::runtime_error -{ - -public: - explicit bad_variant_access(const std::string& what_arg) - : runtime_error(what_arg) {} - - explicit bad_variant_access(const char* what_arg) - : runtime_error(what_arg) {} - -}; // class bad_variant_access - -#if !defined(ARROW_VARIANT_MINIMIZE_SIZE) -using type_index_t = std::size_t; -#else -#if defined(ARROW_VARIANT_OPTIMIZE_FOR_SPEED) -using type_index_t = std::uint_fast8_t; -#else -using type_index_t = std::uint_least8_t; -#endif -#endif - -namespace detail { - -static constexpr type_index_t invalid_value = type_index_t(-1); - -template -struct direct_type; - -template -struct direct_type -{ - static constexpr type_index_t index = std::is_same::value - ? 
sizeof...(Types) - : direct_type::index; -}; - -template -struct direct_type -{ - static constexpr type_index_t index = invalid_value; -}; - -#if __cpp_lib_logical_traits >= 201510L - -using std::conjunction; -using std::disjunction; - -#else - -template -struct conjunction : std::true_type {}; - -template -struct conjunction : B1 {}; - -template -struct conjunction : std::conditional::type {}; - -template -struct conjunction : std::conditional, B1>::type {}; - -template -struct disjunction : std::false_type {}; - -template -struct disjunction : B1 {}; - -template -struct disjunction : std::conditional::type {}; - -template -struct disjunction : std::conditional>::type {}; - -#endif - -template -struct convertible_type; - -template -struct convertible_type -{ - static constexpr type_index_t index = std::is_convertible::value - ? disjunction...>::value ? invalid_value : sizeof...(Types) - : convertible_type::index; -}; - -template -struct convertible_type -{ - static constexpr type_index_t index = invalid_value; -}; - -template -struct value_traits -{ - using value_type = typename std::remove_const::type>::type; - using value_type_wrapper = recursive_wrapper; - static constexpr type_index_t direct_index = direct_type::index; - static constexpr bool is_direct = direct_index != invalid_value; - static constexpr type_index_t index_direct_or_wrapper = is_direct ? direct_index : direct_type::index; - static constexpr bool is_direct_or_wrapper = index_direct_or_wrapper != invalid_value; - static constexpr type_index_t index = is_direct_or_wrapper ? index_direct_or_wrapper : convertible_type::index; - static constexpr bool is_valid = index != invalid_value; - static constexpr type_index_t tindex = is_valid ? sizeof...(Types)-index : 0; - using target_type = typename std::tuple_element>::type; -}; - -template -struct enable_if_type -{ - using type = R; -}; - -template -struct result_of_unary_visit -{ - using type = typename std::result_of::type; -}; - -template -struct result_of_unary_visit::type> -{ - using type = typename F::result_type; -}; - -template -struct result_of_binary_visit -{ - using type = typename std::result_of::type; -}; - -template -struct result_of_binary_visit::type> -{ - using type = typename F::result_type; -}; - -template -struct static_max; - -template -struct static_max -{ - static const type_index_t value = arg; -}; - -template -struct static_max -{ - static const type_index_t value = arg1 >= arg2 ? 
static_max::value : static_max::value; -}; - -template -struct variant_helper; - -template -struct variant_helper -{ - VARIANT_INLINE static void destroy(const type_index_t type_index, void* data) - { - if (type_index == sizeof...(Types)) - { - reinterpret_cast(data)->~T(); - } - else - { - variant_helper::destroy(type_index, data); - } - } - - VARIANT_INLINE static void move(const type_index_t old_type_index, void* old_value, void* new_value) - { - if (old_type_index == sizeof...(Types)) - { - new (new_value) T(std::move(*reinterpret_cast(old_value))); - } - else - { - variant_helper::move(old_type_index, old_value, new_value); - } - } - - VARIANT_INLINE static void copy(const type_index_t old_type_index, const void* old_value, void* new_value) - { - if (old_type_index == sizeof...(Types)) - { - new (new_value) T(*reinterpret_cast(old_value)); - } - else - { - variant_helper::copy(old_type_index, old_value, new_value); - } - } -}; - -template <> -struct variant_helper<> -{ - VARIANT_INLINE static void destroy(const type_index_t, void*) {} - VARIANT_INLINE static void move(const type_index_t, void*, void*) {} - VARIANT_INLINE static void copy(const type_index_t, const void*, void*) {} -}; - -template -struct unwrapper -{ - static T const& apply_const(T const& obj) { return obj; } - static T& apply(T& obj) { return obj; } -}; - -template -struct unwrapper> -{ - static auto apply_const(recursive_wrapper const& obj) - -> typename recursive_wrapper::type const& - { - return obj.get(); - } - static auto apply(recursive_wrapper& obj) - -> typename recursive_wrapper::type& - { - return obj.get(); - } -}; - -template -struct unwrapper> -{ - static auto apply_const(std::reference_wrapper const& obj) - -> typename std::reference_wrapper::type const& - { - return obj.get(); - } - static auto apply(std::reference_wrapper& obj) - -> typename std::reference_wrapper::type& - { - return obj.get(); - } -}; - -template -struct dispatcher; - -template -struct dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v, F&& f) - { - if (v.template is()) - { - return f(unwrapper::apply_const(v.template get_unchecked())); - } - else - { - return dispatcher::apply_const(v, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& v, F&& f) - { - if (v.template is()) - { - return f(unwrapper::apply(v.template get_unchecked())); - } - else - { - return dispatcher::apply(v, std::forward(f)); - } - } -}; - -template -struct dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v, F&& f) - { - return f(unwrapper::apply_const(v.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& v, F&& f) - { - return f(unwrapper::apply(v.template get_unchecked())); - } -}; - -template -struct binary_dispatcher_rhs; - -template -struct binary_dispatcher_rhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - if (rhs.template is()) // call binary functor - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_rhs::apply_const(lhs, rhs, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - if (rhs.template is()) // call binary functor - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_rhs::apply(lhs, rhs, std::forward(f)); - } - } -}; - -template -struct binary_dispatcher_rhs -{ - VARIANT_INLINE static R apply_const(V const& 
lhs, V const& rhs, F&& f) - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } -}; - -template -struct binary_dispatcher_lhs; - -template -struct binary_dispatcher_lhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - if (lhs.template is()) // call binary functor - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_lhs::apply_const(lhs, rhs, std::forward(f)); - } - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - if (lhs.template is()) // call binary functor - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } - else - { - return binary_dispatcher_lhs::apply(lhs, rhs, std::forward(f)); - } - } -}; - -template -struct binary_dispatcher_lhs -{ - VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) - { - return f(unwrapper::apply_const(lhs.template get_unchecked()), - unwrapper::apply_const(rhs.template get_unchecked())); - } - - VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) - { - return f(unwrapper::apply(lhs.template get_unchecked()), - unwrapper::apply(rhs.template get_unchecked())); - } -}; - -template -struct binary_dispatcher; - -template -struct binary_dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) - { - if (v0.template is()) - { - if (v1.template is()) - { - return f(unwrapper::apply_const(v0.template get_unchecked()), - unwrapper::apply_const(v1.template get_unchecked())); // call binary functor - } - else - { - return binary_dispatcher_rhs::apply_const(v0, v1, std::forward(f)); - } - } - else if (v1.template is()) - { - return binary_dispatcher_lhs::apply_const(v0, v1, std::forward(f)); - } - return binary_dispatcher::apply_const(v0, v1, std::forward(f)); - } - - VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) - { - if (v0.template is()) - { - if (v1.template is()) - { - return f(unwrapper::apply(v0.template get_unchecked()), - unwrapper::apply(v1.template get_unchecked())); // call binary functor - } - else - { - return binary_dispatcher_rhs::apply(v0, v1, std::forward(f)); - } - } - else if (v1.template is()) - { - return binary_dispatcher_lhs::apply(v0, v1, std::forward(f)); - } - return binary_dispatcher::apply(v0, v1, std::forward(f)); - } -}; - -template -struct binary_dispatcher -{ - VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) - { - return f(unwrapper::apply_const(v0.template get_unchecked()), - unwrapper::apply_const(v1.template get_unchecked())); // call binary functor - } - - VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) - { - return f(unwrapper::apply(v0.template get_unchecked()), - unwrapper::apply(v1.template get_unchecked())); // call binary functor - } -}; - -// comparator functors -struct equal_comp -{ - template - bool operator()(T const& lhs, T const& rhs) const - { - return lhs == rhs; - } -}; - -struct less_comp -{ - template - bool operator()(T const& lhs, T const& rhs) const - { - return lhs < rhs; - } -}; - -template -class comparer -{ -public: - explicit comparer(Variant const& lhs) noexcept - : lhs_(lhs) {} - comparer& operator=(comparer const&) = delete; - // visitor - template - bool 
operator()(T const& rhs_content) const - { - T const& lhs_content = lhs_.template get_unchecked(); - return Comp()(lhs_content, rhs_content); - } - -private: - Variant const& lhs_; -}; - -// hashing visitor -struct hasher -{ - template - std::size_t operator()(const T& hashable) const - { - return std::hash{}(hashable); - } -}; - -} // namespace detail - -struct no_init {}; - -template -class variant -{ - static_assert(sizeof...(Types) > 0, "Template parameter type list of variant can not be empty."); - static_assert(!detail::disjunction...>::value, "Variant can not hold reference types. Maybe use std::reference_wrapper?"); - static_assert(!detail::disjunction...>::value, "Variant can not hold array types."); - static_assert(sizeof...(Types) < std::numeric_limits::max(), "Internal index type must be able to accommodate all alternatives."); -private: - static const std::size_t data_size = detail::static_max::value; - static const std::size_t data_align = detail::static_max::value; -public: - struct adapted_variant_tag; - using types = std::tuple; -private: - using first_type = typename std::tuple_element<0, types>::type; - using data_type = typename std::aligned_storage::type; - using helper_type = detail::variant_helper; - - type_index_t type_index; - data_type data; - -public: - VARIANT_INLINE variant() noexcept(std::is_nothrow_default_constructible::value) - : type_index(sizeof...(Types)-1) - { - static_assert(std::is_default_constructible::value, "First type in variant must be default constructible to allow default construction of variant."); - new (&data) first_type(); - } - - VARIANT_INLINE variant(no_init) noexcept - : type_index(detail::invalid_value) {} - - // http://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers - template , - typename Enable = typename std::enable_if, typename Traits::value_type>::value>::type > - VARIANT_INLINE variant(T&& val) noexcept(std::is_nothrow_constructible::value) - : type_index(Traits::index) - { - new (&data) typename Traits::target_type(std::forward(val)); - } - - VARIANT_INLINE variant(variant const& old) - : type_index(old.type_index) - { - helper_type::copy(old.type_index, &old.data, &data); - } - - VARIANT_INLINE variant(variant&& old) - noexcept(detail::conjunction...>::value) - : type_index(old.type_index) - { - helper_type::move(old.type_index, &old.data, &data); - } - -private: - VARIANT_INLINE void copy_assign(variant const& rhs) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - helper_type::copy(rhs.type_index, &rhs.data, &data); - type_index = rhs.type_index; - } - - VARIANT_INLINE void move_assign(variant&& rhs) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - helper_type::move(rhs.type_index, &rhs.data, &data); - type_index = rhs.type_index; - } - -public: - VARIANT_INLINE variant& operator=(variant&& other) - { - move_assign(std::move(other)); - return *this; - } - - VARIANT_INLINE variant& operator=(variant const& other) - { - copy_assign(other); - return *this; - } - - // conversions - // move-assign - template - VARIANT_INLINE variant& operator=(T&& rhs) noexcept - { - variant temp(std::forward(rhs)); - move_assign(std::move(temp)); - return *this; - } - - // copy-assign - template - VARIANT_INLINE variant& operator=(T const& rhs) - { - variant temp(rhs); - copy_assign(temp); - return *this; - } - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE bool is() const - { - return type_index == 
detail::direct_type::index; - } - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE bool is() const - { - return type_index == detail::direct_type, Types...>::index; - } - - VARIANT_INLINE bool valid() const - { - return type_index != detail::invalid_value; - } - - template - VARIANT_INLINE void set(Args&&... args) - { - helper_type::destroy(type_index, &data); - type_index = detail::invalid_value; - new (&data) T(std::forward(args)...); - type_index = detail::direct_type::index; - } - - // get_unchecked() - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return *reinterpret_cast(&data); - } - -#ifdef HAS_EXCEPTIONS - // get() - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type::index) - { - return *reinterpret_cast(&data); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return *reinterpret_cast(&data); - } - -#ifdef HAS_EXCEPTIONS - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type::index) - { - return *reinterpret_cast(&data); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - // get_unchecked() - T stored as recursive_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return (*reinterpret_cast*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - // get() - T stored as recursive_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return (*reinterpret_cast const*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast const*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - // get_unchecked() - T stored as std::reference_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get_unchecked() - { - return (*reinterpret_cast*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - // get() - T stored as std::reference_wrapper - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T& get() - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast*>(&data)).get(); - } - else - { - throw bad_variant_access("in get()"); - } - } -#endif - - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get_unchecked() const - { - return (*reinterpret_cast const*>(&data)).get(); - } - -#ifdef HAS_EXCEPTIONS - template , Types...>::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE T const& get() const - { - if (type_index == detail::direct_type, Types...>::index) - { - return (*reinterpret_cast const*>(&data)).get(); - } - else - { - 
throw bad_variant_access("in get()"); - } - } -#endif - - // This function is deprecated because it returns an internal index field. - // Use which() instead. - ARROW_DEPRECATED("Use which() instead") - VARIANT_INLINE type_index_t get_type_index() const - { - return type_index; - } - - VARIANT_INLINE int which() const noexcept - { - return static_cast(sizeof...(Types) - type_index - 1); - } - - template ::index != detail::invalid_value)>::type* = NULLPTR> - VARIANT_INLINE static constexpr int which() noexcept - { - return static_cast(sizeof...(Types)-detail::direct_type::index - 1); - } - - // visitor - // unary - template ::type> - auto VARIANT_INLINE static visit(V const& v, F&& f) - -> decltype(detail::dispatcher::apply_const(v, std::forward(f))) - { - return detail::dispatcher::apply_const(v, std::forward(f)); - } - // non-const - template ::type> - auto VARIANT_INLINE static visit(V& v, F&& f) - -> decltype(detail::dispatcher::apply(v, std::forward(f))) - { - return detail::dispatcher::apply(v, std::forward(f)); - } - - // binary - // const - template ::type> - auto VARIANT_INLINE static binary_visit(V const& v0, V const& v1, F&& f) - -> decltype(detail::binary_dispatcher::apply_const(v0, v1, std::forward(f))) - { - return detail::binary_dispatcher::apply_const(v0, v1, std::forward(f)); - } - // non-const - template ::type> - auto VARIANT_INLINE static binary_visit(V& v0, V& v1, F&& f) - -> decltype(detail::binary_dispatcher::apply(v0, v1, std::forward(f))) - { - return detail::binary_dispatcher::apply(v0, v1, std::forward(f)); - } - - // match - // unary - template - auto VARIANT_INLINE match(Fs&&... fs) const - -> decltype(variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...))) - { - return variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...)); - } - // non-const - template - auto VARIANT_INLINE match(Fs&&... 
fs) - -> decltype(variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...))) - { - return variant::visit(*this, ::arrow::util::make_visitor(std::forward(fs)...)); - } - - ~variant() noexcept // no-throw destructor - { - helper_type::destroy(type_index, &data); - } - - // comparison operators - // equality - VARIANT_INLINE bool operator==(variant const& rhs) const - { - assert(valid() && rhs.valid()); - if (this->which() != rhs.which()) - { - return false; - } - detail::comparer visitor(*this); - return visit(rhs, visitor); - } - - VARIANT_INLINE bool operator!=(variant const& rhs) const - { - return !(*this == rhs); - } - - // less than - VARIANT_INLINE bool operator<(variant const& rhs) const - { - assert(valid() && rhs.valid()); - if (this->which() != rhs.which()) - { - return this->which() < rhs.which(); - } - detail::comparer visitor(*this); - return visit(rhs, visitor); - } - VARIANT_INLINE bool operator>(variant const& rhs) const - { - return rhs < *this; - } - VARIANT_INLINE bool operator<=(variant const& rhs) const - { - return !(*this > rhs); - } - VARIANT_INLINE bool operator>=(variant const& rhs) const - { - return !(*this < rhs); - } -}; - -// unary visitor interface -// const -template -auto VARIANT_INLINE apply_visitor(F&& f, V const& v) -> decltype(V::visit(v, std::forward(f))) -{ - return V::visit(v, std::forward(f)); -} - -// non-const -template -auto VARIANT_INLINE apply_visitor(F&& f, V& v) -> decltype(V::visit(v, std::forward(f))) -{ - return V::visit(v, std::forward(f)); -} - -// binary visitor interface -// const -template -auto VARIANT_INLINE apply_visitor(F&& f, V const& v0, V const& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) -{ - return V::binary_visit(v0, v1, std::forward(f)); -} - -// non-const -template -auto VARIANT_INLINE apply_visitor(F&& f, V& v0, V& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) -{ - return V::binary_visit(v0, v1, std::forward(f)); -} - -// getter interface - -#ifdef HAS_EXCEPTIONS -template -auto get(T& var)->decltype(var.template get()) -{ - return var.template get(); -} -#endif - -template -ResultType& get_unchecked(T& var) -{ - return var.template get_unchecked(); -} - -#ifdef HAS_EXCEPTIONS -template -auto get(T const& var)->decltype(var.template get()) -{ - return var.template get(); -} -#endif - -template -ResultType const& get_unchecked(T const& var) -{ - return var.template get_unchecked(); -} -// variant_size -template -struct variant_size; - -//variable templates is c++14 -//template -//constexpr std::size_t variant_size_v = variant_size::value; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size - : variant_size {}; - -template -struct variant_size> - : std::integral_constant {}; - -// variant_alternative -template -struct variant_alternative; - -#if defined(__clang__) -#if __has_builtin(__type_pack_element) -#define has_type_pack_element -#endif -#endif - -#if defined(has_type_pack_element) -template -struct variant_alternative> -{ - static_assert(sizeof...(Types) > Index , "Index out of range"); - using type = __type_pack_element; -}; -#else -template -struct variant_alternative> - : variant_alternative> -{ - static_assert(sizeof...(Types) > Index -1 , "Index out of range"); -}; - -template -struct variant_alternative<0, variant> -{ - using type = First; -}; - -#endif - -template -using variant_alternative_t = typename variant_alternative::type; - -template -struct variant_alternative - : 
std::add_const> {}; - -template -struct variant_alternative - : std::add_volatile> {}; - -template -struct variant_alternative - : std::add_cv> {}; +using mapbox::util::apply_visitor; // seems akin to std::visit +using mapbox::util::bad_variant_access; +using mapbox::util::get; +using mapbox::util::variant; -} // namespace util -} // namespace arrow +} // namespace util +} // namespace arrow -#endif // ARROW_UTIL_VARIANT_H +#endif // ARROW_UTIL_VARIANT_H diff --git a/cpp/src/arrow/util/variant/optional.h b/cpp/src/arrow/util/variant/optional.h deleted file mode 100644 index 4c6671061fe80..0000000000000 --- a/cpp/src/arrow/util/variant/optional.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef ARROW_UTIL_VARIANT_OPTIONAL_H -#define ARROW_UTIL_VARIANT_OPTIONAL_H - -#pragma message("This implementation of optional is deprecated. See https://github.com/mapbox/variant/issues/64.") - -#include -#include - -#include - -namespace arrow { -namespace util { - -template -class optional -{ - static_assert(!std::is_reference::value, "optional doesn't support references"); - - struct none_type - { - }; - - variant variant_; - -public: - optional() = default; - - optional(optional const& rhs) - { - if (this != &rhs) - { // protect against invalid self-assignment - variant_ = rhs.variant_; - } - } - - optional(T const& v) { variant_ = v; } - - explicit operator bool() const noexcept { return variant_.template is(); } - - T const& get() const { return variant_.template get(); } - T& get() { return variant_.template get(); } - - T const& operator*() const { return this->get(); } - T operator*() { return this->get(); } - - optional& operator=(T const& v) - { - variant_ = v; - return *this; - } - - optional& operator=(optional const& rhs) - { - if (this != &rhs) - { - variant_ = rhs.variant_; - } - return *this; - } - - template - void emplace(Args&&... 
args) - { - variant_ = T{std::forward(args)...}; - } - - void reset() { variant_ = none_type{}; } - -}; // class optional - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_OPTIONAL_H diff --git a/cpp/src/arrow/util/variant/variant_cast.h b/cpp/src/arrow/util/variant/variant_cast.h deleted file mode 100644 index 71ae80b5dfab6..0000000000000 --- a/cpp/src/arrow/util/variant/variant_cast.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -#ifndef ARROW_UTIL_VARIANT_CAST_H -#define ARROW_UTIL_VARIANT_CAST_H - -#include - -#include "arrow/util/macros.h" - -namespace arrow { -namespace util { - -namespace detail { - -template -class static_caster -{ -public: - template - T& operator()(V& v) const - { - return static_cast(v); - } -}; - -template -class dynamic_caster -{ -public: - using result_type = T&; - template - T& operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - throw std::bad_cast(); - } - template - T& operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return dynamic_cast(v); - } -}; - -template -class dynamic_caster -{ -public: - using result_type = T*; - template - T* operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return NULLPTR; - } - template - T* operator()(V& v, typename std::enable_if::value>::type* = NULLPTR) const - { - return dynamic_cast(&v); - } -}; -} - -template -typename detail::dynamic_caster::result_type -dynamic_variant_cast(V& v) -{ - return arrow::util::apply_visitor(detail::dynamic_caster(), v); -} - -template -typename detail::dynamic_caster::result_type -dynamic_variant_cast(const V& v) -{ - return arrow::util::apply_visitor(detail::dynamic_caster(), v); -} - -template -T& static_variant_cast(V& v) -{ - return arrow::util::apply_visitor(detail::static_caster(), v); -} - -template -const T& static_variant_cast(const V& v) -{ - return arrow::util::apply_visitor(detail::static_caster(), v); -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_CAST_H diff --git a/cpp/src/arrow/util/variant/variant_io.h b/cpp/src/arrow/util/variant/variant_io.h deleted file mode 100644 index 5541a81f7035f..0000000000000 --- a/cpp/src/arrow/util/variant/variant_io.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -#ifndef ARROW_UTIL_VARIANT_IO_H -#define ARROW_UTIL_VARIANT_IO_H - -#include - -#include - -namespace arrow { -namespace util { - -namespace detail { -// operator<< helper -template -class printer -{ -public: - explicit printer(Out& out) - : out_(out) {} - printer& operator=(printer const&) = delete; - - // visitor - template - void operator()(T const& operand) const - { - out_ << operand; - } - -private: - Out& out_; -}; -} - -// operator<< -template -VARIANT_INLINE std::basic_ostream& -operator<<(std::basic_ostream& out, variant const& rhs) -{ - detail::printer> visitor(out); - apply_visitor(visitor, rhs); - return out; -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_IO_H diff --git a/cpp/src/arrow/util/variant/variant_visitor.h b/cpp/src/arrow/util/variant/variant_visitor.h deleted file mode 100644 index 66b1dfea3d7c9..0000000000000 --- a/cpp/src/arrow/util/variant/variant_visitor.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) MapBox -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this -// list of conditions and the following disclaimer in the documentation and/or -// other materials provided with the distribution. -// - Neither the name "MapBox" nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef ARROW_UTIL_VARIANT_VISITOR_HPP -#define ARROW_UTIL_VARIANT_VISITOR_HPP - -#include - -namespace arrow { -namespace util { - -template -struct visitor; - -template -struct visitor : Fn -{ - using Fn::operator(); - - template - visitor(T&& fn) : Fn(std::forward(fn)) {} -}; - -template -struct visitor : Fn, visitor -{ - using Fn::operator(); - using visitor::operator(); - - template - visitor(T&& fn, Ts&&... fns) - : Fn(std::forward(fn)) - , visitor(std::forward(fns)...) {} -}; - -template -visitor::type...> make_visitor(Fns&&... 
fns) -{ - return visitor::type...> - (std::forward(fns)...); -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_VISITOR_HPP diff --git a/cpp/src/arrow/util/string_view/CMakeLists.txt b/cpp/src/arrow/vendored/CMakeLists.txt similarity index 88% rename from cpp/src/arrow/util/string_view/CMakeLists.txt rename to cpp/src/arrow/vendored/CMakeLists.txt index 7e553077db1ad..04ea67aa45d04 100644 --- a/cpp/src/arrow/util/string_view/CMakeLists.txt +++ b/cpp/src/arrow/vendored/CMakeLists.txt @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -install(FILES - string_view.hpp - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/util/string_view") +ARROW_INSTALL_ALL_HEADERS("arrow/vendored") + +add_subdirectory(variant) diff --git a/cpp/src/arrow/util/date.h b/cpp/src/arrow/vendored/date.h similarity index 100% rename from cpp/src/arrow/util/date.h rename to cpp/src/arrow/vendored/date.h diff --git a/cpp/src/arrow/util/string_view/string_view.hpp b/cpp/src/arrow/vendored/string_view.hpp similarity index 100% rename from cpp/src/arrow/util/string_view/string_view.hpp rename to cpp/src/arrow/vendored/string_view.hpp diff --git a/cpp/src/arrow/util/variant/CMakeLists.txt b/cpp/src/arrow/vendored/variant/CMakeLists.txt similarity index 83% rename from cpp/src/arrow/util/variant/CMakeLists.txt rename to cpp/src/arrow/vendored/variant/CMakeLists.txt index b7a5692b6207c..de26f938d72f3 100644 --- a/cpp/src/arrow/util/variant/CMakeLists.txt +++ b/cpp/src/arrow/vendored/variant/CMakeLists.txt @@ -15,8 +15,4 @@ # specific language governing permissions and limitations # under the License. -####################################### -# arrow_util_variant -####################################### - -ARROW_INSTALL_ALL_HEADERS("arrow/util/variant") +ARROW_INSTALL_ALL_HEADERS("arrow/vendored/variant") diff --git a/cpp/src/arrow/util/variant/recursive_wrapper.h b/cpp/src/arrow/vendored/variant/recursive_wrapper.hpp similarity index 89% rename from cpp/src/arrow/util/variant/recursive_wrapper.h rename to cpp/src/arrow/vendored/variant/recursive_wrapper.hpp index c9d9385394b38..96b6a3f217f5b 100644 --- a/cpp/src/arrow/util/variant/recursive_wrapper.h +++ b/cpp/src/arrow/vendored/variant/recursive_wrapper.hpp @@ -1,7 +1,9 @@ -#ifndef ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H -#define ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 -// Based on variant/recursive_wrapper.h from boost. +#ifndef MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP +#define MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP + +// Based on variant/recursive_wrapper.hpp from boost. 
// // Original license: // @@ -15,7 +17,7 @@ #include #include -namespace arrow { +namespace mapbox { namespace util { template @@ -117,6 +119,6 @@ inline void swap(recursive_wrapper& lhs, recursive_wrapper& rhs) noexcept lhs.swap(rhs); } } // namespace util -} // namespace arrow +} // namespace mapbox -#endif // ARROW_UTIL_VARIANT_RECURSIVE_WRAPPER_H +#endif // MAPBOX_UTIL_RECURSIVE_WRAPPER_HPP diff --git a/cpp/src/arrow/vendored/variant/variant.hpp b/cpp/src/arrow/vendored/variant/variant.hpp new file mode 100644 index 0000000000000..bb399dece1d57 --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant.hpp @@ -0,0 +1,1029 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_HPP +#define MAPBOX_UTIL_VARIANT_HPP + +#include +#include // size_t +#include // operator new +#include // runtime_error +#include +#include +#include +#include +#include +#include + +#include "recursive_wrapper.hpp" +#include "variant_visitor.hpp" + +// clang-format off +// [[deprecated]] is only available in C++14, use this for the time being +#if __cplusplus <= 201103L +# ifdef __GNUC__ +# define MAPBOX_VARIANT_DEPRECATED __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define MAPBOX_VARIANT_DEPRECATED __declspec(deprecated) +# else +# define MAPBOX_VARIANT_DEPRECATED +# endif +#else +# define MAPBOX_VARIANT_DEPRECATED [[deprecated]] +#endif + + +#ifdef _MSC_VER +// https://msdn.microsoft.com/en-us/library/bw1hbe6y.aspx +# ifdef NDEBUG +# define VARIANT_INLINE __forceinline +# else +# define VARIANT_INLINE //__declspec(noinline) +# endif +#else +# ifdef NDEBUG +# define VARIANT_INLINE //inline __attribute__((always_inline)) +# else +# define VARIANT_INLINE __attribute__((noinline)) +# endif +#endif +// clang-format on + +// Exceptions +#if defined( __EXCEPTIONS) || defined( _MSC_VER) +#define HAS_EXCEPTIONS +#endif + +#define VARIANT_MAJOR_VERSION 1 +#define VARIANT_MINOR_VERSION 1 +#define VARIANT_PATCH_VERSION 0 + +#define VARIANT_VERSION (VARIANT_MAJOR_VERSION * 100000) + (VARIANT_MINOR_VERSION * 100) + (VARIANT_PATCH_VERSION) + +namespace mapbox { +namespace util { + +// XXX This should derive from std::logic_error instead of std::runtime_error. +// See https://github.com/mapbox/variant/issues/48 for details. +class bad_variant_access : public std::runtime_error +{ + +public: + explicit bad_variant_access(const std::string& what_arg) + : runtime_error(what_arg) {} + + explicit bad_variant_access(const char* what_arg) + : runtime_error(what_arg) {} + +}; // class bad_variant_access + +template +struct MAPBOX_VARIANT_DEPRECATED static_visitor +{ + using result_type = R; + +protected: + static_visitor() {} + ~static_visitor() {} +}; + +namespace detail { + +static constexpr std::size_t invalid_value = std::size_t(-1); + +template +struct direct_type; + +template +struct direct_type +{ + static constexpr std::size_t index = std::is_same::value + ? 
sizeof...(Types) + : direct_type::index; +}; + +template +struct direct_type +{ + static constexpr std::size_t index = invalid_value; +}; + +#if __cpp_lib_logical_traits >= 201510L + +using std::conjunction; +using std::disjunction; + +#else + +template +struct conjunction : std::true_type {}; + +template +struct conjunction : B1 {}; + +template +struct conjunction : std::conditional::type {}; + +template +struct conjunction : std::conditional, B1>::type {}; + +template +struct disjunction : std::false_type {}; + +template +struct disjunction : B1 {}; + +template +struct disjunction : std::conditional::type {}; + +template +struct disjunction : std::conditional>::type {}; + +#endif + +template +struct convertible_type; + +template +struct convertible_type +{ + static constexpr std::size_t index = std::is_convertible::value + ? disjunction...>::value ? invalid_value : sizeof...(Types) + : convertible_type::index; +}; + +template +struct convertible_type +{ + static constexpr std::size_t index = invalid_value; +}; + +template +struct value_traits +{ + using value_type = typename std::remove_const::type>::type; + static constexpr std::size_t direct_index = direct_type::index; + static constexpr bool is_direct = direct_index != invalid_value; + static constexpr std::size_t index = is_direct ? direct_index : convertible_type::index; + static constexpr bool is_valid = index != invalid_value; + static constexpr std::size_t tindex = is_valid ? sizeof...(Types)-index : 0; + using target_type = typename std::tuple_element>::type; +}; + +template +struct enable_if_type +{ + using type = R; +}; + +template +struct result_of_unary_visit +{ + using type = typename std::result_of::type; +}; + +template +struct result_of_unary_visit::type> +{ + using type = typename F::result_type; +}; + +template +struct result_of_binary_visit +{ + using type = typename std::result_of::type; +}; + +template +struct result_of_binary_visit::type> +{ + using type = typename F::result_type; +}; + +template +struct static_max; + +template +struct static_max +{ + static const std::size_t value = arg; +}; + +template +struct static_max +{ + static const std::size_t value = arg1 >= arg2 ? 
static_max::value : static_max::value; +}; + +template +struct variant_helper; + +template +struct variant_helper +{ + VARIANT_INLINE static void destroy(const std::size_t type_index, void* data) + { + if (type_index == sizeof...(Types)) + { + reinterpret_cast(data)->~T(); + } + else + { + variant_helper::destroy(type_index, data); + } + } + + VARIANT_INLINE static void move(const std::size_t old_type_index, void* old_value, void* new_value) + { + if (old_type_index == sizeof...(Types)) + { + new (new_value) T(std::move(*reinterpret_cast(old_value))); + } + else + { + variant_helper::move(old_type_index, old_value, new_value); + } + } + + VARIANT_INLINE static void copy(const std::size_t old_type_index, const void* old_value, void* new_value) + { + if (old_type_index == sizeof...(Types)) + { + new (new_value) T(*reinterpret_cast(old_value)); + } + else + { + variant_helper::copy(old_type_index, old_value, new_value); + } + } +}; + +template <> +struct variant_helper<> +{ + VARIANT_INLINE static void destroy(const std::size_t, void*) {} + VARIANT_INLINE static void move(const std::size_t, void*, void*) {} + VARIANT_INLINE static void copy(const std::size_t, const void*, void*) {} +}; + +template +struct unwrapper +{ + static T const& apply_const(T const& obj) { return obj; } + static T& apply(T& obj) { return obj; } +}; + +template +struct unwrapper> +{ + static auto apply_const(recursive_wrapper const& obj) + -> typename recursive_wrapper::type const& + { + return obj.get(); + } + static auto apply(recursive_wrapper& obj) + -> typename recursive_wrapper::type& + { + return obj.get(); + } +}; + +template +struct unwrapper> +{ + static auto apply_const(std::reference_wrapper const& obj) + -> typename std::reference_wrapper::type const& + { + return obj.get(); + } + static auto apply(std::reference_wrapper& obj) + -> typename std::reference_wrapper::type& + { + return obj.get(); + } +}; + +template +struct dispatcher; + +template +struct dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v, F&& f) + { + if (v.template is()) + { + return f(unwrapper::apply_const(v.template get_unchecked())); + } + else + { + return dispatcher::apply_const(v, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& v, F&& f) + { + if (v.template is()) + { + return f(unwrapper::apply(v.template get_unchecked())); + } + else + { + return dispatcher::apply(v, std::forward(f)); + } + } +}; + +template +struct dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v, F&& f) + { + return f(unwrapper::apply_const(v.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& v, F&& f) + { + return f(unwrapper::apply(v.template get_unchecked())); + } +}; + +template +struct binary_dispatcher_rhs; + +template +struct binary_dispatcher_rhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + if (rhs.template is()) // call binary functor + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_rhs::apply_const(lhs, rhs, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + if (rhs.template is()) // call binary functor + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_rhs::apply(lhs, rhs, std::forward(f)); + } + } +}; + +template +struct binary_dispatcher_rhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V 
const& rhs, F&& f) + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } +}; + +template +struct binary_dispatcher_lhs; + +template +struct binary_dispatcher_lhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + if (lhs.template is()) // call binary functor + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_lhs::apply_const(lhs, rhs, std::forward(f)); + } + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + if (lhs.template is()) // call binary functor + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } + else + { + return binary_dispatcher_lhs::apply(lhs, rhs, std::forward(f)); + } + } +}; + +template +struct binary_dispatcher_lhs +{ + VARIANT_INLINE static R apply_const(V const& lhs, V const& rhs, F&& f) + { + return f(unwrapper::apply_const(lhs.template get_unchecked()), + unwrapper::apply_const(rhs.template get_unchecked())); + } + + VARIANT_INLINE static R apply(V& lhs, V& rhs, F&& f) + { + return f(unwrapper::apply(lhs.template get_unchecked()), + unwrapper::apply(rhs.template get_unchecked())); + } +}; + +template +struct binary_dispatcher; + +template +struct binary_dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) + { + if (v0.template is()) + { + if (v1.template is()) + { + return f(unwrapper::apply_const(v0.template get_unchecked()), + unwrapper::apply_const(v1.template get_unchecked())); // call binary functor + } + else + { + return binary_dispatcher_rhs::apply_const(v0, v1, std::forward(f)); + } + } + else if (v1.template is()) + { + return binary_dispatcher_lhs::apply_const(v0, v1, std::forward(f)); + } + return binary_dispatcher::apply_const(v0, v1, std::forward(f)); + } + + VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) + { + if (v0.template is()) + { + if (v1.template is()) + { + return f(unwrapper::apply(v0.template get_unchecked()), + unwrapper::apply(v1.template get_unchecked())); // call binary functor + } + else + { + return binary_dispatcher_rhs::apply(v0, v1, std::forward(f)); + } + } + else if (v1.template is()) + { + return binary_dispatcher_lhs::apply(v0, v1, std::forward(f)); + } + return binary_dispatcher::apply(v0, v1, std::forward(f)); + } +}; + +template +struct binary_dispatcher +{ + VARIANT_INLINE static R apply_const(V const& v0, V const& v1, F&& f) + { + return f(unwrapper::apply_const(v0.template get_unchecked()), + unwrapper::apply_const(v1.template get_unchecked())); // call binary functor + } + + VARIANT_INLINE static R apply(V& v0, V& v1, F&& f) + { + return f(unwrapper::apply(v0.template get_unchecked()), + unwrapper::apply(v1.template get_unchecked())); // call binary functor + } +}; + +// comparator functors +struct equal_comp +{ + template + bool operator()(T const& lhs, T const& rhs) const + { + return lhs == rhs; + } +}; + +struct less_comp +{ + template + bool operator()(T const& lhs, T const& rhs) const + { + return lhs < rhs; + } +}; + +template +class comparer +{ +public: + explicit comparer(Variant const& lhs) noexcept + : lhs_(lhs) {} + comparer& operator=(comparer const&) = delete; + // visitor + template + bool operator()(T 
const& rhs_content) const + { + T const& lhs_content = lhs_.template get_unchecked(); + return Comp()(lhs_content, rhs_content); + } + +private: + Variant const& lhs_; +}; + +// hashing visitor +struct hasher +{ + template + std::size_t operator()(const T& hashable) const + { + return std::hash{}(hashable); + } +}; + +} // namespace detail + +struct no_init +{ +}; + +template +class variant +{ + static_assert(sizeof...(Types) > 0, "Template parameter type list of variant can not be empty"); + static_assert(!detail::disjunction...>::value, "Variant can not hold reference types. Maybe use std::reference_wrapper?"); + +private: + static const std::size_t data_size = detail::static_max::value; + static const std::size_t data_align = detail::static_max::value; +public: + struct adapted_variant_tag; + using types = std::tuple; +private: + using first_type = typename std::tuple_element<0, types>::type; + using data_type = typename std::aligned_storage::type; + using helper_type = detail::variant_helper; + + std::size_t type_index; + data_type data; + +public: + VARIANT_INLINE variant() noexcept(std::is_nothrow_default_constructible::value) + : type_index(sizeof...(Types)-1) + { + static_assert(std::is_default_constructible::value, "First type in variant must be default constructible to allow default construction of variant"); + new (&data) first_type(); + } + + VARIANT_INLINE variant(no_init) noexcept + : type_index(detail::invalid_value) {} + + // http://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers + template , + typename Enable = typename std::enable_if, typename Traits::value_type>::value>::type > + VARIANT_INLINE variant(T&& val) noexcept(std::is_nothrow_constructible::value) + : type_index(Traits::index) + { + new (&data) typename Traits::target_type(std::forward(val)); + } + + VARIANT_INLINE variant(variant const& old) + : type_index(old.type_index) + { + helper_type::copy(old.type_index, &old.data, &data); + } + + VARIANT_INLINE variant(variant&& old) + noexcept(detail::conjunction...>::value) + : type_index(old.type_index) + { + helper_type::move(old.type_index, &old.data, &data); + } + +private: + VARIANT_INLINE void copy_assign(variant const& rhs) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + helper_type::copy(rhs.type_index, &rhs.data, &data); + type_index = rhs.type_index; + } + + VARIANT_INLINE void move_assign(variant&& rhs) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + helper_type::move(rhs.type_index, &rhs.data, &data); + type_index = rhs.type_index; + } + +public: + VARIANT_INLINE variant& operator=(variant&& other) + { + move_assign(std::move(other)); + return *this; + } + + VARIANT_INLINE variant& operator=(variant const& other) + { + copy_assign(other); + return *this; + } + + // conversions + // move-assign + template + VARIANT_INLINE variant& operator=(T&& rhs) noexcept + { + variant temp(std::forward(rhs)); + move_assign(std::move(temp)); + return *this; + } + + // copy-assign + template + VARIANT_INLINE variant& operator=(T const& rhs) + { + variant temp(rhs); + copy_assign(temp); + return *this; + } + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE bool is() const + { + return type_index == detail::direct_type::index; + } + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE bool is() const + { + return type_index == detail::direct_type, Types...>::index; + } + + VARIANT_INLINE bool valid() const + { 
+ return type_index != detail::invalid_value; + } + + template + VARIANT_INLINE void set(Args&&... args) + { + helper_type::destroy(type_index, &data); + type_index = detail::invalid_value; + new (&data) T(std::forward(args)...); + type_index = detail::direct_type::index; + } + + // get_unchecked() + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return *reinterpret_cast(&data); + } + +#ifdef HAS_EXCEPTIONS + // get() + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type::index) + { + return *reinterpret_cast(&data); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return *reinterpret_cast(&data); + } + +#ifdef HAS_EXCEPTIONS + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type::index) + { + return *reinterpret_cast(&data); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // get_unchecked() - T stored as recursive_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return (*reinterpret_cast*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + // get() - T stored as recursive_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return (*reinterpret_cast const*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast const*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // get_unchecked() - T stored as std::reference_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get_unchecked() + { + return (*reinterpret_cast*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + // get() - T stored as std::reference_wrapper + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T& get() + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get_unchecked() const + { + return (*reinterpret_cast const*>(&data)).get(); + } + +#ifdef HAS_EXCEPTIONS + template , Types...>::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE T const& get() const + { + if (type_index == detail::direct_type, Types...>::index) + { + return (*reinterpret_cast const*>(&data)).get(); + } + else + { + throw bad_variant_access("in get()"); + } + } +#endif + + // This function is deprecated because it returns an internal index field. + // Use which() instead. 
+ MAPBOX_VARIANT_DEPRECATED VARIANT_INLINE std::size_t get_type_index() const + { + return type_index; + } + + VARIANT_INLINE int which() const noexcept + { + return static_cast(sizeof...(Types)-type_index - 1); + } + + template ::index != detail::invalid_value)>::type* = nullptr> + VARIANT_INLINE static constexpr int which() noexcept + { + return static_cast(sizeof...(Types)-detail::direct_type::index - 1); + } + + // visitor + // unary + template ::type> + auto VARIANT_INLINE static visit(V const& v, F&& f) + -> decltype(detail::dispatcher::apply_const(v, std::forward(f))) + { + return detail::dispatcher::apply_const(v, std::forward(f)); + } + // non-const + template ::type> + auto VARIANT_INLINE static visit(V& v, F&& f) + -> decltype(detail::dispatcher::apply(v, std::forward(f))) + { + return detail::dispatcher::apply(v, std::forward(f)); + } + + // binary + // const + template ::type> + auto VARIANT_INLINE static binary_visit(V const& v0, V const& v1, F&& f) + -> decltype(detail::binary_dispatcher::apply_const(v0, v1, std::forward(f))) + { + return detail::binary_dispatcher::apply_const(v0, v1, std::forward(f)); + } + // non-const + template ::type> + auto VARIANT_INLINE static binary_visit(V& v0, V& v1, F&& f) + -> decltype(detail::binary_dispatcher::apply(v0, v1, std::forward(f))) + { + return detail::binary_dispatcher::apply(v0, v1, std::forward(f)); + } + + // match + // unary + template + auto VARIANT_INLINE match(Fs&&... fs) const + -> decltype(variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...))) + { + return variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...)); + } + // non-const + template + auto VARIANT_INLINE match(Fs&&... fs) + -> decltype(variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...))) + { + return variant::visit(*this, ::mapbox::util::make_visitor(std::forward(fs)...)); + } + + ~variant() noexcept // no-throw destructor + { + helper_type::destroy(type_index, &data); + } + + // comparison operators + // equality + VARIANT_INLINE bool operator==(variant const& rhs) const + { + assert(valid() && rhs.valid()); + if (this->which() != rhs.which()) + { + return false; + } + detail::comparer visitor(*this); + return visit(rhs, visitor); + } + + VARIANT_INLINE bool operator!=(variant const& rhs) const + { + return !(*this == rhs); + } + + // less than + VARIANT_INLINE bool operator<(variant const& rhs) const + { + assert(valid() && rhs.valid()); + if (this->which() != rhs.which()) + { + return this->which() < rhs.which(); + } + detail::comparer visitor(*this); + return visit(rhs, visitor); + } + VARIANT_INLINE bool operator>(variant const& rhs) const + { + return rhs < *this; + } + VARIANT_INLINE bool operator<=(variant const& rhs) const + { + return !(*this > rhs); + } + VARIANT_INLINE bool operator>=(variant const& rhs) const + { + return !(*this < rhs); + } +}; + +// unary visitor interface +// const +template +auto VARIANT_INLINE apply_visitor(F&& f, V const& v) -> decltype(V::visit(v, std::forward(f))) +{ + return V::visit(v, std::forward(f)); +} + +// non-const +template +auto VARIANT_INLINE apply_visitor(F&& f, V& v) -> decltype(V::visit(v, std::forward(f))) +{ + return V::visit(v, std::forward(f)); +} + +// binary visitor interface +// const +template +auto VARIANT_INLINE apply_visitor(F&& f, V const& v0, V const& v1) -> decltype(V::binary_visit(v0, v1, std::forward(f))) +{ + return V::binary_visit(v0, v1, std::forward(f)); +} + +// non-const +template +auto VARIANT_INLINE apply_visitor(F&& f, V& v0, V& v1) 
-> decltype(V::binary_visit(v0, v1, std::forward(f))) +{ + return V::binary_visit(v0, v1, std::forward(f)); +} + +// getter interface + +#ifdef HAS_EXCEPTIONS +template +auto get(T& var)->decltype(var.template get()) +{ + return var.template get(); +} +#endif + +template +ResultType& get_unchecked(T& var) +{ + return var.template get_unchecked(); +} + +#ifdef HAS_EXCEPTIONS +template +auto get(T const& var)->decltype(var.template get()) +{ + return var.template get(); +} +#endif + +template +ResultType const& get_unchecked(T const& var) +{ + return var.template get_unchecked(); +} +} // namespace util +} // namespace mapbox + +// hashable iff underlying types are hashable +namespace std { +template +struct hash< ::mapbox::util::variant> { + std::size_t operator()(const ::mapbox::util::variant& v) const noexcept + { + return ::mapbox::util::apply_visitor(::mapbox::util::detail::hasher{}, v); + } +}; +} + +#endif // MAPBOX_UTIL_VARIANT_HPP diff --git a/cpp/src/arrow/vendored/variant/variant_io.hpp b/cpp/src/arrow/vendored/variant/variant_io.hpp new file mode 100644 index 0000000000000..494d2a964e319 --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant_io.hpp @@ -0,0 +1,47 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_IO_HPP +#define MAPBOX_UTIL_VARIANT_IO_HPP + +#include + +#include "variant.hpp" + +namespace mapbox { +namespace util { + +namespace detail { +// operator<< helper +template +class printer +{ +public: + explicit printer(Out& out) + : out_(out) {} + printer& operator=(printer const&) = delete; + + // visitor + template + void operator()(T const& operand) const + { + out_ << operand; + } + +private: + Out& out_; +}; +} + +// operator<< +template +VARIANT_INLINE std::basic_ostream& +operator<<(std::basic_ostream& out, variant const& rhs) +{ + detail::printer> visitor(out); + apply_visitor(visitor, rhs); + return out; +} +} // namespace util +} // namespace mapbox + +#endif // MAPBOX_UTIL_VARIANT_IO_HPP diff --git a/cpp/src/arrow/vendored/variant/variant_visitor.hpp b/cpp/src/arrow/vendored/variant/variant_visitor.hpp new file mode 100644 index 0000000000000..60020f4dd05dc --- /dev/null +++ b/cpp/src/arrow/vendored/variant/variant_visitor.hpp @@ -0,0 +1,40 @@ +// Vendored from https://github.com/mapbox/variant at tag v1.1.5 + +#ifndef MAPBOX_UTIL_VARIANT_VISITOR_HPP +#define MAPBOX_UTIL_VARIANT_VISITOR_HPP + +namespace mapbox { +namespace util { + +template +struct visitor; + +template +struct visitor : Fn +{ + using type = Fn; + using Fn::operator(); + + visitor(Fn fn) : Fn(fn) {} +}; + +template +struct visitor : Fn, visitor +{ + using type = visitor; + using Fn::operator(); + using visitor::operator(); + + visitor(Fn fn, Fns... fns) : Fn(fn), visitor(fns...) {} +}; + +template +visitor make_visitor(Fns... 
fns) +{ + return visitor(fns...); +} + +} // namespace util +} // namespace mapbox + +#endif // MAPBOX_UTIL_VARIANT_VISITOR_HPP diff --git a/cpp/src/arrow/util/xxhash/xxhash.c b/cpp/src/arrow/vendored/xxhash/xxhash.c similarity index 100% rename from cpp/src/arrow/util/xxhash/xxhash.c rename to cpp/src/arrow/vendored/xxhash/xxhash.c diff --git a/cpp/src/arrow/util/xxhash/xxhash.h b/cpp/src/arrow/vendored/xxhash/xxhash.h similarity index 100% rename from cpp/src/arrow/util/xxhash/xxhash.h rename to cpp/src/arrow/vendored/xxhash/xxhash.h diff --git a/cpp/src/gandiva/precompiled/epoch_time_point.h b/cpp/src/gandiva/precompiled/epoch_time_point.h index dc6340d134e0a..115f019525118 100644 --- a/cpp/src/gandiva/precompiled/epoch_time_point.h +++ b/cpp/src/gandiva/precompiled/epoch_time_point.h @@ -19,7 +19,7 @@ #define GANDIVA_EPOCH_TIME_POINT_H // TODO(wesm): IR compilation does not have any include directories set -#include "../../arrow/util/date.h" +#include "../../arrow/vendored/date.h" // A point of time measured in millis since epoch. class EpochTimePoint { diff --git a/cpp/src/gandiva/to_date_holder.cc b/cpp/src/gandiva/to_date_holder.cc index 9c8562280041d..b512934e233a1 100644 --- a/cpp/src/gandiva/to_date_holder.cc +++ b/cpp/src/gandiva/to_date_holder.cc @@ -18,7 +18,7 @@ #include #include -#include "arrow/util/date.h" +#include "arrow/vendored/date.h" #include "gandiva/date_utils.h" #include "gandiva/execution_context.h" diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 4215399c0b009..8d153585c3d4e 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -64,7 +64,7 @@ using arrow::cuda::CudaDeviceManager; #define XXH_INLINE_ALL 1 #define XXH_NAMESPACE plasma_client_ -#include "arrow/util/xxhash/xxhash.h" +#include "arrow/vendored/xxhash/xxhash.h" #define XXH64_DEFAULT_SEED 0 diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index f2e3f164fa284..66d62c6257570 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -13,15 +13,7 @@ cpp/src/arrow/io/mman.h cpp/src/arrow/util/random.h cpp/src/arrow/status.cc cpp/src/arrow/status.h -cpp/src/arrow/util/string_view/string_view.hpp -cpp/src/arrow/util/variant.h -cpp/src/arrow/util/variant/optional.h -cpp/src/arrow/util/variant/recursive_wrapper.h -cpp/src/arrow/util/variant/variant_cast.h -cpp/src/arrow/util/variant/variant_io.h -cpp/src/arrow/util/variant/variant_visitor.h -cpp/src/arrow/util/xxhash/xxhash.c -cpp/src/arrow/util/xxhash/xxhash.h +cpp/src/arrow/vendored/* cpp/build-support/asan_symbolize.py cpp/build-support/cpplint.py cpp/build-support/clang_format_exclusions.txt From a236464551df9427af3f5750a1630100d086d178 Mon Sep 17 00:00:00 2001 From: Dustin Long Date: Mon, 17 Dec 2018 10:09:30 -0600 Subject: [PATCH 17/80] ARROW-3674: [Go] Implement Date32 and Date64 array types Implement both Date32 and Date64 types for arrays. Also resolves ARROW-3675. Unit tests follow the same pattern as the existing float64 and Time{32,64} tests. 
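As a usage sketch only (not part of the patch): with these builders in place, constructing a Date32 array could look like the following. The builder and array calls are the ones introduced by this change (and exercised by the new unit tests); the surrounding main function and import paths are assumed from the Go Arrow module layout and are illustrative, not authoritative.

    package main

    import (
        "fmt"

        "github.com/apache/arrow/go/arrow"
        "github.com/apache/arrow/go/arrow/array"
        "github.com/apache/arrow/go/arrow/memory"
    )

    func main() {
        pool := memory.NewGoAllocator()

        // Build a five-element Date32 array with one null slot.
        bldr := array.NewDate32Builder(pool)
        defer bldr.Release()
        bldr.AppendValues(
            []arrow.Date32{1, 2, 3, 0, 5},
            []bool{true, true, true, false, true},
        )

        arr := bldr.NewDate32Array()
        defer arr.Release()

        fmt.Println(arr)         // [1 2 3 (null) 5]
        fmt.Println(arr.Len())   // 5
        fmt.Println(arr.NullN()) // 1
    }

The Date64 builder is used the same way; both follow the Reserve/Resize/Release lifecycle of the existing Float64 and Time{32,64} builders.
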
Author: Dustin Long Closes #3170 from dustmop/date-types and squashes the following commits: 29ae27474 ARROW-3674: Date{32,64} as primitive fixed-width types, not parametric 07a261047 ARROW-3674: Implement Date32 and Date64 array types --- go/arrow/array/array.go | 4 +- go/arrow/array/numeric.gen.go | 90 ++++++++ go/arrow/array/numeric_test.go | 220 ++++++++++++++++++ go/arrow/array/numericbuilder.gen.go | 270 ++++++++++++++++++++++ go/arrow/array/numericbuilder_test.go | 220 ++++++++++++++++++ go/arrow/datatype_fixedwidth.go | 2 + go/arrow/datatype_numeric.gen.go | 16 ++ go/arrow/datatype_numeric.gen.go.tmpldata | 10 + go/arrow/numeric.tmpldata | 20 +- go/arrow/type_traits_numeric.gen.go | 98 ++++++++ 10 files changed, 947 insertions(+), 3 deletions(-) diff --git a/go/arrow/array/array.go b/go/arrow/array/array.go index b188dcd68c729..ef37aef42f602 100644 --- a/go/arrow/array/array.go +++ b/go/arrow/array/array.go @@ -180,8 +180,8 @@ func init() { arrow.STRING: func(data *Data) Interface { return NewStringData(data) }, arrow.BINARY: func(data *Data) Interface { return NewBinaryData(data) }, arrow.FIXED_SIZE_BINARY: func(data *Data) Interface { return NewFixedSizeBinaryData(data) }, - arrow.DATE32: unsupportedArrayType, - arrow.DATE64: unsupportedArrayType, + arrow.DATE32: func(data *Data) Interface { return NewDate32Data(data) }, + arrow.DATE64: func(data *Data) Interface { return NewDate64Data(data) }, arrow.TIMESTAMP: func(data *Data) Interface { return NewTimestampData(data) }, arrow.TIME32: func(data *Data) Interface { return NewTime32Data(data) }, arrow.TIME64: func(data *Data) Interface { return NewTime64Data(data) }, diff --git a/go/arrow/array/numeric.gen.go b/go/arrow/array/numeric.gen.go index 1f734c05127b4..1fb8257d940c4 100644 --- a/go/arrow/array/numeric.gen.go +++ b/go/arrow/array/numeric.gen.go @@ -609,3 +609,93 @@ func (a *Time64) setData(data *Data) { a.values = a.values[beg:end] } } + +// A type which represents an immutable sequence of arrow.Date32 values. +type Date32 struct { + array + values []arrow.Date32 +} + +func NewDate32Data(data *Data) *Date32 { + a := &Date32{} + a.refCount = 1 + a.setData(data) + return a +} + +func (a *Date32) Value(i int) arrow.Date32 { return a.values[i] } +func (a *Date32) Date32Values() []arrow.Date32 { return a.values } + +func (a *Date32) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i, v := range a.values { + if i > 0 { + fmt.Fprintf(o, " ") + } + switch { + case a.IsNull(i): + o.WriteString("(null)") + default: + fmt.Fprintf(o, "%v", v) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Date32) setData(data *Data) { + a.array.setData(data) + vals := data.buffers[1] + if vals != nil { + a.values = arrow.Date32Traits.CastFromBytes(vals.Bytes()) + beg := a.array.data.offset + end := beg + a.array.data.length + a.values = a.values[beg:end] + } +} + +// A type which represents an immutable sequence of arrow.Date64 values. 
+type Date64 struct { + array + values []arrow.Date64 +} + +func NewDate64Data(data *Data) *Date64 { + a := &Date64{} + a.refCount = 1 + a.setData(data) + return a +} + +func (a *Date64) Value(i int) arrow.Date64 { return a.values[i] } +func (a *Date64) Date64Values() []arrow.Date64 { return a.values } + +func (a *Date64) String() string { + o := new(strings.Builder) + o.WriteString("[") + for i, v := range a.values { + if i > 0 { + fmt.Fprintf(o, " ") + } + switch { + case a.IsNull(i): + o.WriteString("(null)") + default: + fmt.Fprintf(o, "%v", v) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Date64) setData(data *Data) { + a.array.setData(data) + vals := data.buffers[1] + if vals != nil { + a.values = arrow.Date64Traits.CastFromBytes(vals.Bytes()) + beg := a.array.data.offset + end := beg + a.array.data.length + a.values = a.values[beg:end] + } +} diff --git a/go/arrow/array/numeric_test.go b/go/arrow/array/numeric_test.go index 9e8267a70de6c..fc7f04addbe0d 100644 --- a/go/arrow/array/numeric_test.go +++ b/go/arrow/array/numeric_test.go @@ -394,3 +394,223 @@ func TestTime64SliceDataWithNull(t *testing.T) { t.Fatalf("got=%v, want=%v", got, want) } } + +func TestNewDate32Data(t *testing.T) { + exp := []arrow.Date32{1, 2, 4, 8, 16} + + dtype := &arrow.Date32Type{} + ad := array.NewData( + dtype, len(exp), + []*memory.Buffer{nil, memory.NewBufferBytes(arrow.Date32Traits.CastToBytes(exp))}, + nil, 0, 0, + ) + fa := array.NewDate32Data(ad) + + assert.Equal(t, len(exp), fa.Len(), "unexpected Len()") + assert.Equal(t, exp, fa.Date32Values(), "unexpected Date32Values()") +} + +func TestDate32SliceData(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 4 + ) + + var ( + vs = []arrow.Date32{1, 2, 3, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate32Builder(pool) + defer b.Release() + + for _, v := range vs { + b.Append(v) + } + + arr := b.NewArray().(*array.Date32) + defer arr.Release() + + if got, want := arr.Len(), len(vs); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date32) + defer slice.Release() + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date32Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestDate32SliceDataWithNull(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 5 + ) + + var ( + valids = []bool{true, true, true, false, true, true} + vs = []arrow.Date32{1, 2, 3, 0, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate32Builder(pool) + defer b.Release() + + b.AppendValues(vs, valids) + + arr := b.NewArray().(*array.Date32) + defer arr.Release() + + if got, want := arr.Len(), len(valids); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 1; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date32Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date32) + defer slice.Release() + + if got, want := slice.NullN(), 1; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + if got, want := slice.Len(), len(sub); 
got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date32Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestNewDate64Data(t *testing.T) { + exp := []arrow.Date64{1, 2, 4, 8, 16} + + dtype := &arrow.Date64Type{} + ad := array.NewData( + dtype, len(exp), + []*memory.Buffer{nil, memory.NewBufferBytes(arrow.Date64Traits.CastToBytes(exp))}, + nil, 0, 0, + ) + fa := array.NewDate64Data(ad) + + assert.Equal(t, len(exp), fa.Len(), "unexpected Len()") + assert.Equal(t, exp, fa.Date64Values(), "unexpected Date64Values()") +} + +func TestDate64SliceData(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 4 + ) + + var ( + vs = []arrow.Date64{1, 2, 3, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate64Builder(pool) + defer b.Release() + + for _, v := range vs { + b.Append(v) + } + + arr := b.NewArray().(*array.Date64) + defer arr.Release() + + if got, want := arr.Len(), len(vs); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date64Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date64) + defer slice.Release() + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date64Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} + +func TestDate64SliceDataWithNull(t *testing.T) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + + const ( + beg = 2 + end = 5 + ) + + var ( + valids = []bool{true, true, true, false, true, true} + vs = []arrow.Date64{1, 2, 3, 0, 4, 5} + sub = vs[beg:end] + ) + + b := array.NewDate64Builder(pool) + defer b.Release() + + b.AppendValues(vs, valids) + + arr := b.NewArray().(*array.Date64) + defer arr.Release() + + if got, want := arr.Len(), len(valids); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 1; got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := arr.Date64Values(), vs; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } + + slice := array.NewSlice(arr, beg, end).(*array.Date64) + defer slice.Release() + + if got, want := slice.NullN(), 1; got != want { + t.Errorf("got=%d, want=%d", got, want) + } + + if got, want := slice.Len(), len(sub); got != want { + t.Fatalf("got=%d, want=%d", got, want) + } + + if got, want := slice.Date64Values(), sub; !reflect.DeepEqual(got, want) { + t.Fatalf("got=%v, want=%v", got, want) + } +} diff --git a/go/arrow/array/numericbuilder.gen.go b/go/arrow/array/numericbuilder.gen.go index 3a7dc167f15aa..946c5ba74aaeb 100644 --- a/go/arrow/array/numericbuilder.gen.go +++ b/go/arrow/array/numericbuilder.gen.go @@ -1772,6 +1772,274 @@ func (b *Time64Builder) newData() (data *Data) { return } +type Date32Builder struct { + builder + + data *memory.Buffer + rawData []arrow.Date32 +} + +func NewDate32Builder(mem memory.Allocator) *Date32Builder { + return &Date32Builder{builder: builder{refCount: 1, mem: mem}} +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. 
+func (b *Date32Builder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + } +} + +func (b *Date32Builder) Append(v arrow.Date32) { + b.Reserve(1) + b.UnsafeAppend(v) +} + +func (b *Date32Builder) AppendNull() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(false) +} + +func (b *Date32Builder) UnsafeAppend(v arrow.Date32) { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + b.rawData[b.length] = v + b.length++ +} + +func (b *Date32Builder) UnsafeAppendBoolToBitmap(isValid bool) { + if isValid { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + } else { + b.nulls++ + } + b.length++ +} + +// AppendValues will append the values in the v slice. The valid slice determines which values +// in v are valid (not null). The valid slice must either be empty or be equal in length to v. If empty, +// all values in v are appended and considered valid. +func (b *Date32Builder) AppendValues(v []arrow.Date32, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + b.Reserve(len(v)) + if len(v) > 0 { + arrow.Date32Traits.Copy(b.rawData[b.length:], v) + } + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *Date32Builder) init(capacity int) { + b.builder.init(capacity) + + b.data = memory.NewResizableBuffer(b.mem) + bytesN := arrow.Date32Traits.BytesRequired(capacity) + b.data.Resize(bytesN) + b.rawData = arrow.Date32Traits.CastFromBytes(b.data.Bytes()) +} + +// Reserve ensures there is enough space for appending n elements +// by checking the capacity and calling Resize if necessary. +func (b *Date32Builder) Reserve(n int) { + b.builder.reserve(n, b.Resize) +} + +// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(), +// additional memory will be allocated. If n is smaller, the allocated memory may reduced. +func (b *Date32Builder) Resize(n int) { + nBuilder := n + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + } else { + b.builder.resize(nBuilder, b.init) + b.data.Resize(arrow.Date32Traits.BytesRequired(n)) + b.rawData = arrow.Date32Traits.CastFromBytes(b.data.Bytes()) + } +} + +// NewArray creates a Date32 array from the memory buffers used by the builder and resets the Date32Builder +// so it can be used to build a new array. +func (b *Date32Builder) NewArray() Interface { + return b.NewDate32Array() +} + +// NewDate32Array creates a Date32 array from the memory buffers used by the builder and resets the Date32Builder +// so it can be used to build a new array. 
+func (b *Date32Builder) NewDate32Array() (a *Date32) { + data := b.newData() + a = NewDate32Data(data) + data.Release() + return +} + +func (b *Date32Builder) newData() (data *Data) { + bytesRequired := arrow.Date32Traits.BytesRequired(b.length) + if bytesRequired > 0 && bytesRequired < b.data.Len() { + // trim buffers + b.data.Resize(bytesRequired) + } + data = NewData(arrow.PrimitiveTypes.Date32, b.length, []*memory.Buffer{b.nullBitmap, b.data}, nil, b.nulls, 0) + b.reset() + + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + + return +} + +type Date64Builder struct { + builder + + data *memory.Buffer + rawData []arrow.Date64 +} + +func NewDate64Builder(mem memory.Allocator) *Date64Builder { + return &Date64Builder{builder: builder{refCount: 1, mem: mem}} +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. +func (b *Date64Builder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + } +} + +func (b *Date64Builder) Append(v arrow.Date64) { + b.Reserve(1) + b.UnsafeAppend(v) +} + +func (b *Date64Builder) AppendNull() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(false) +} + +func (b *Date64Builder) UnsafeAppend(v arrow.Date64) { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + b.rawData[b.length] = v + b.length++ +} + +func (b *Date64Builder) UnsafeAppendBoolToBitmap(isValid bool) { + if isValid { + bitutil.SetBit(b.nullBitmap.Bytes(), b.length) + } else { + b.nulls++ + } + b.length++ +} + +// AppendValues will append the values in the v slice. The valid slice determines which values +// in v are valid (not null). The valid slice must either be empty or be equal in length to v. If empty, +// all values in v are appended and considered valid. +func (b *Date64Builder) AppendValues(v []arrow.Date64, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + b.Reserve(len(v)) + if len(v) > 0 { + arrow.Date64Traits.Copy(b.rawData[b.length:], v) + } + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *Date64Builder) init(capacity int) { + b.builder.init(capacity) + + b.data = memory.NewResizableBuffer(b.mem) + bytesN := arrow.Date64Traits.BytesRequired(capacity) + b.data.Resize(bytesN) + b.rawData = arrow.Date64Traits.CastFromBytes(b.data.Bytes()) +} + +// Reserve ensures there is enough space for appending n elements +// by checking the capacity and calling Resize if necessary. +func (b *Date64Builder) Reserve(n int) { + b.builder.reserve(n, b.Resize) +} + +// Resize adjusts the space allocated by b to n elements. If n is greater than b.Cap(), +// additional memory will be allocated. If n is smaller, the allocated memory may reduced. +func (b *Date64Builder) Resize(n int) { + nBuilder := n + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + } else { + b.builder.resize(nBuilder, b.init) + b.data.Resize(arrow.Date64Traits.BytesRequired(n)) + b.rawData = arrow.Date64Traits.CastFromBytes(b.data.Bytes()) + } +} + +// NewArray creates a Date64 array from the memory buffers used by the builder and resets the Date64Builder +// so it can be used to build a new array. 
+func (b *Date64Builder) NewArray() Interface { + return b.NewDate64Array() +} + +// NewDate64Array creates a Date64 array from the memory buffers used by the builder and resets the Date64Builder +// so it can be used to build a new array. +func (b *Date64Builder) NewDate64Array() (a *Date64) { + data := b.newData() + a = NewDate64Data(data) + data.Release() + return +} + +func (b *Date64Builder) newData() (data *Data) { + bytesRequired := arrow.Date64Traits.BytesRequired(b.length) + if bytesRequired > 0 && bytesRequired < b.data.Len() { + // trim buffers + b.data.Resize(bytesRequired) + } + data = NewData(arrow.PrimitiveTypes.Date64, b.length, []*memory.Buffer{b.nullBitmap, b.data}, nil, b.nulls, 0) + b.reset() + + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + + return +} + var ( _ Builder = (*Int64Builder)(nil) _ Builder = (*Uint64Builder)(nil) @@ -1786,4 +2054,6 @@ var ( _ Builder = (*TimestampBuilder)(nil) _ Builder = (*Time32Builder)(nil) _ Builder = (*Time64Builder)(nil) + _ Builder = (*Date32Builder)(nil) + _ Builder = (*Date64Builder)(nil) ) diff --git a/go/arrow/array/numericbuilder_test.go b/go/arrow/array/numericbuilder_test.go index 65f3c86c2ea35..3bb49a3af7310 100644 --- a/go/arrow/array/numericbuilder_test.go +++ b/go/arrow/array/numericbuilder_test.go @@ -362,3 +362,223 @@ func TestTime64Builder_Resize(t *testing.T) { ab.Release() } + +func TestNewDate32Builder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + ab.Append(1) + ab.Append(2) + ab.Append(3) + ab.AppendNull() + ab.Append(5) + ab.Append(6) + ab.AppendNull() + ab.Append(8) + ab.Append(9) + ab.Append(10) + + // check state of builder before NewDate32Array + assert.Equal(t, 10, ab.Len(), "unexpected Len()") + assert.Equal(t, 2, ab.NullN(), "unexpected NullN()") + + a := ab.NewDate32Array() + + // check state of builder after NewDate32Array + assert.Zero(t, ab.Len(), "unexpected ArrayBuilder.Len(), NewDate32Array did not reset state") + assert.Zero(t, ab.Cap(), "unexpected ArrayBuilder.Cap(), NewDate32Array did not reset state") + assert.Zero(t, ab.NullN(), "unexpected ArrayBuilder.NullN(), NewDate32Array did not reset state") + + // check state of array + assert.Equal(t, 2, a.NullN(), "unexpected null count") + assert.Equal(t, []arrow.Date32{1, 2, 3, 0, 5, 6, 0, 8, 9, 10}, a.Date32Values(), "unexpected Date32Values") + assert.Equal(t, []byte{0xb7}, a.NullBitmapBytes()[:1]) // 4 bytes due to minBuilderCapacity + assert.Len(t, a.Date32Values(), 10, "unexpected length of Date32Values") + + a.Release() + + ab.Append(7) + ab.Append(8) + + a = ab.NewDate32Array() + + assert.Equal(t, 0, a.NullN()) + assert.Equal(t, []arrow.Date32{7, 8}, a.Date32Values()) + assert.Len(t, a.Date32Values(), 2) + + a.Release() +} + +func TestDate32Builder_AppendValues(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + exp := []arrow.Date32{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate32Array() + assert.Equal(t, exp, a.Date32Values()) + + a.Release() + ab.Release() +} + +func TestDate32Builder_Empty(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + exp := []arrow.Date32{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate32Array() + assert.Equal(t, exp, a.Date32Values()) + a.Release() + + a = ab.NewDate32Array() 
+ assert.Zero(t, a.Len()) + a.Release() + + ab.Release() +} + +func TestDate32Builder_Resize(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate32Builder(mem) + + assert.Equal(t, 0, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + ab.Reserve(63) + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + for i := 0; i < 63; i++ { + ab.Append(0) + } + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 63, ab.Len()) + + ab.Resize(5) + assert.Equal(t, 5, ab.Len()) + + ab.Resize(32) + assert.Equal(t, 5, ab.Len()) + + ab.Release() +} + +func TestNewDate64Builder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + ab.Append(1) + ab.Append(2) + ab.Append(3) + ab.AppendNull() + ab.Append(5) + ab.Append(6) + ab.AppendNull() + ab.Append(8) + ab.Append(9) + ab.Append(10) + + // check state of builder before NewDate64Array + assert.Equal(t, 10, ab.Len(), "unexpected Len()") + assert.Equal(t, 2, ab.NullN(), "unexpected NullN()") + + a := ab.NewDate64Array() + + // check state of builder after NewDate64Array + assert.Zero(t, ab.Len(), "unexpected ArrayBuilder.Len(), NewDate64Array did not reset state") + assert.Zero(t, ab.Cap(), "unexpected ArrayBuilder.Cap(), NewDate64Array did not reset state") + assert.Zero(t, ab.NullN(), "unexpected ArrayBuilder.NullN(), NewDate64Array did not reset state") + + // check state of array + assert.Equal(t, 2, a.NullN(), "unexpected null count") + assert.Equal(t, []arrow.Date64{1, 2, 3, 0, 5, 6, 0, 8, 9, 10}, a.Date64Values(), "unexpected Date64Values") + assert.Equal(t, []byte{0xb7}, a.NullBitmapBytes()[:1]) // 4 bytes due to minBuilderCapacity + assert.Len(t, a.Date64Values(), 10, "unexpected length of Date64Values") + + a.Release() + + ab.Append(7) + ab.Append(8) + + a = ab.NewDate64Array() + + assert.Equal(t, 0, a.NullN()) + assert.Equal(t, []arrow.Date64{7, 8}, a.Date64Values()) + assert.Len(t, a.Date64Values(), 2) + + a.Release() +} + +func TestDate64Builder_AppendValues(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + exp := []arrow.Date64{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate64Array() + assert.Equal(t, exp, a.Date64Values()) + + a.Release() + ab.Release() +} + +func TestDate64Builder_Empty(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + exp := []arrow.Date64{1, 2, 3, 4} + ab.AppendValues(exp, nil) + a := ab.NewDate64Array() + assert.Equal(t, exp, a.Date64Values()) + a.Release() + + a = ab.NewDate64Array() + assert.Zero(t, a.Len()) + a.Release() + + ab.Release() +} + +func TestDate64Builder_Resize(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + ab := array.NewDate64Builder(mem) + + assert.Equal(t, 0, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + ab.Reserve(63) + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 0, ab.Len()) + + for i := 0; i < 63; i++ { + ab.Append(0) + } + assert.Equal(t, 64, ab.Cap()) + assert.Equal(t, 63, ab.Len()) + + ab.Resize(5) + assert.Equal(t, 5, ab.Len()) + + ab.Resize(32) + assert.Equal(t, 5, ab.Len()) + + ab.Release() +} diff --git a/go/arrow/datatype_fixedwidth.go b/go/arrow/datatype_fixedwidth.go index 60cc98a4b97d9..444495058a591 100644 --- a/go/arrow/datatype_fixedwidth.go +++ 
b/go/arrow/datatype_fixedwidth.go @@ -37,6 +37,8 @@ type ( Time32 int32 Time64 int64 TimeUnit int + Date32 int32 + Date64 int64 ) const ( diff --git a/go/arrow/datatype_numeric.gen.go b/go/arrow/datatype_numeric.gen.go index 2ec4c4098a4a6..9b5dc835b1ea2 100644 --- a/go/arrow/datatype_numeric.gen.go +++ b/go/arrow/datatype_numeric.gen.go @@ -78,6 +78,18 @@ func (t *Float64Type) ID() Type { return FLOAT64 } func (t *Float64Type) Name() string { return "float64" } func (t *Float64Type) BitWidth() int { return 64 } +type Date32Type struct{} + +func (t *Date32Type) ID() Type { return DATE32 } +func (t *Date32Type) Name() string { return "date32" } +func (t *Date32Type) BitWidth() int { return 32 } + +type Date64Type struct{} + +func (t *Date64Type) ID() Type { return DATE64 } +func (t *Date64Type) Name() string { return "date64" } +func (t *Date64Type) BitWidth() int { return 64 } + var ( PrimitiveTypes = struct { Int8 DataType @@ -90,6 +102,8 @@ var ( Uint64 DataType Float32 DataType Float64 DataType + Date32 DataType + Date64 DataType }{ Int8: &Int8Type{}, @@ -102,5 +116,7 @@ var ( Uint64: &Uint64Type{}, Float32: &Float32Type{}, Float64: &Float64Type{}, + Date32: &Date32Type{}, + Date64: &Date64Type{}, } ) diff --git a/go/arrow/datatype_numeric.gen.go.tmpldata b/go/arrow/datatype_numeric.gen.go.tmpldata index 415b51b2e16bd..9badc6ee2b211 100644 --- a/go/arrow/datatype_numeric.gen.go.tmpldata +++ b/go/arrow/datatype_numeric.gen.go.tmpldata @@ -48,5 +48,15 @@ "Name": "Float64", "Type": "float64", "Size": 64 + }, + { + "Name": "Date32", + "Type": "date32", + "Size": 32 + }, + { + "Name": "Date64", + "Type": "date64", + "Size": 64 } ] diff --git a/go/arrow/numeric.tmpldata b/go/arrow/numeric.tmpldata index b9e976eea0534..45452ab4468c6 100644 --- a/go/arrow/numeric.tmpldata +++ b/go/arrow/numeric.tmpldata @@ -107,5 +107,23 @@ "Opt": { "Parametric": true } + }, + { + "Name": "Date32", + "name": "date32", + "Type": "Date32", + "QualifiedType": "arrow.Date32", + "InternalType": "int32", + "Default": "0", + "Size": "4" + }, + { + "Name": "Date64", + "name": "date64", + "Type": "Date64", + "QualifiedType": "arrow.Date64", + "InternalType": "int64", + "Default": "0", + "Size": "8" } -] \ No newline at end of file +] diff --git a/go/arrow/type_traits_numeric.gen.go b/go/arrow/type_traits_numeric.gen.go index 59ed13f541a53..14fafbc57659b 100644 --- a/go/arrow/type_traits_numeric.gen.go +++ b/go/arrow/type_traits_numeric.gen.go @@ -38,6 +38,8 @@ var ( TimestampTraits timestampTraits Time32Traits time32Traits Time64Traits time64Traits + Date32Traits date32Traits + Date64Traits date64Traits ) // Int64 traits @@ -663,3 +665,99 @@ func (time64Traits) CastToBytes(b []Time64) []byte { // Copy copies src to dst. func (time64Traits) Copy(dst, src []Time64) { copy(dst, src) } + +// Date32 traits + +const ( + // Date32SizeBytes specifies the number of bytes required to store a single Date32 in memory + Date32SizeBytes = int(unsafe.Sizeof(Date32(0))) +) + +type date32Traits struct{} + +// BytesRequired returns the number of bytes required to store n elements in memory. +func (date32Traits) BytesRequired(n int) int { return Date32SizeBytes * n } + +// PutValue +func (date32Traits) PutValue(b []byte, v Date32) { + binary.LittleEndian.PutUint32(b, uint32(v)) +} + +// CastFromBytes reinterprets the slice b to a slice of type Date32. +// +// NOTE: len(b) must be a multiple of Date32SizeBytes. 
+func (date32Traits) CastFromBytes(b []byte) []Date32 { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []Date32 + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len / Date32SizeBytes + s.Cap = h.Cap / Date32SizeBytes + + return res +} + +// CastToBytes reinterprets the slice b to a slice of bytes. +func (date32Traits) CastToBytes(b []Date32) []byte { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []byte + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len * Date32SizeBytes + s.Cap = h.Cap * Date32SizeBytes + + return res +} + +// Copy copies src to dst. +func (date32Traits) Copy(dst, src []Date32) { copy(dst, src) } + +// Date64 traits + +const ( + // Date64SizeBytes specifies the number of bytes required to store a single Date64 in memory + Date64SizeBytes = int(unsafe.Sizeof(Date64(0))) +) + +type date64Traits struct{} + +// BytesRequired returns the number of bytes required to store n elements in memory. +func (date64Traits) BytesRequired(n int) int { return Date64SizeBytes * n } + +// PutValue +func (date64Traits) PutValue(b []byte, v Date64) { + binary.LittleEndian.PutUint64(b, uint64(v)) +} + +// CastFromBytes reinterprets the slice b to a slice of type Date64. +// +// NOTE: len(b) must be a multiple of Date64SizeBytes. +func (date64Traits) CastFromBytes(b []byte) []Date64 { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []Date64 + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len / Date64SizeBytes + s.Cap = h.Cap / Date64SizeBytes + + return res +} + +// CastToBytes reinterprets the slice b to a slice of bytes. +func (date64Traits) CastToBytes(b []Date64) []byte { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + var res []byte + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len * Date64SizeBytes + s.Cap = h.Cap * Date64SizeBytes + + return res +} + +// Copy copies src to dst. +func (date64Traits) Copy(dst, src []Date64) { copy(dst, src) } From c7cb1cee388bbfa890ce724f6a7a95991bd8eb1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 10:48:12 -0600 Subject: [PATCH 18/80] ARROW-3368: [Integration/CI/Python] Add dask integration test to docker-compose setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port dask integration testing and refactor the HDFS one. Multiple Python HDFS test cases are failing; not sure why.
Author: Krisztián Szűcs Closes #3086 from kszucs/ARROW-3368 and squashes the following commits: d6e98ecb8 resolve load warning of native-hadoop library 237440ab0 cleanup 0b3e1fc4c port fastparquet hdfs test c497be0d6 better error msg cf06721ef minimal hdfs config d1e9b3717 arrow- test executable prefix 5c11d72ed add hdfs config files f7681d045 download hadoop from apache mirrors c84294a56 update comment 1eef5bfa8 remove dask_integration.sh 00ef67691 two datanodes; support env vars in conftest c2bc444c2 remove outdated files 840f313b3 test optional modules 0e8b3932b add dask-integration to nightlies c3d4a9bef remove comments from docker-compose ffe0ac7ea set hadoop version to 2.6.0 1c7bf304d unset LD_LIBRARY_PATH f1af0248d run dask tests 694e7e6ef docker-compose setup for dask integration --- ci/docker_build_python.sh | 7 +- dev/dask_integration.sh | 21 -- dev/dask_integration/Dockerfile | 22 -- dev/dask_integration/dask_integration.sh | 98 ------ dev/tasks/tests.yml | 22 +- docker-compose.yml | 44 ++- .../dask/Dockerfile | 20 +- integration/dask/runtest.sh | 34 ++ integration/hdfs/Dockerfile | 78 ++-- integration/hdfs/hdfs-site.xml | 44 +++ integration/hdfs/libhdfs3.xml | 332 ------------------ integration/hdfs/runtest.sh | 12 +- python/Dockerfile | 3 +- python/pyarrow/tests/conftest.py | 41 ++- python/pyarrow/tests/test_hdfs.py | 35 +- python/testing/README.md | 42 --- .../dask_tests/test_dask_integration.py | 58 --- python/testing/functions.sh | 75 ---- python/testing/parquet_interop.py | 51 --- python/testing/set_env_common.sh | 70 ---- python/testing/setup_toolchain.sh | 64 ---- 21 files changed, 246 insertions(+), 927 deletions(-) delete mode 100755 dev/dask_integration.sh delete mode 100644 dev/dask_integration/Dockerfile delete mode 100755 dev/dask_integration/dask_integration.sh rename python/testing/test_hdfs.sh => integration/dask/Dockerfile (68%) mode change 100755 => 100644 create mode 100755 integration/dask/runtest.sh create mode 100644 integration/hdfs/hdfs-site.xml delete mode 100644 integration/hdfs/libhdfs3.xml delete mode 100644 python/testing/README.md delete mode 100644 python/testing/dask_tests/test_dask_integration.py delete mode 100644 python/testing/functions.sh delete mode 100644 python/testing/parquet_interop.py delete mode 100644 python/testing/set_env_common.sh delete mode 100644 python/testing/setup_toolchain.sh diff --git a/ci/docker_build_python.sh b/ci/docker_build_python.sh index 8ba8a1d66f1be..23d852bcb8713 100755 --- a/ci/docker_build_python.sh +++ b/ci/docker_build_python.sh @@ -26,12 +26,17 @@ export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja export PYARROW_BUILD_TYPE=${PYARROW_BUILD_TYPE:-debug} + +# Feature flags +export PYARROW_WITH_ORC=${PYARROW_WITH_ORC:-1} export PYARROW_WITH_PARQUET=${PYARROW_WITH_PARQUET:-1} export PYARROW_WITH_PLASMA=${PYARROW_WITH_PLASMA:-1} # Build pyarrow pushd ${source_dir} -python setup.py build_ext --build-temp=${build_dir} install +python setup.py build --build-temp=${build_dir} \ + install --single-version-externally-managed \ + --record=/build/python/record.txt popd diff --git a/dev/dask_integration.sh b/dev/dask_integration.sh deleted file mode 100755 index d344328b6af1e..0000000000000 --- a/dev/dask_integration.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Pass the service name to run_docker_compose.sh -# Which validates environment and runs the service -exec "$(dirname ${BASH_SOURCE})"/run_docker_compose.sh dask_integration diff --git a/dev/dask_integration/Dockerfile b/dev/dask_integration/Dockerfile deleted file mode 100644 index f0c1f03f6f93c..0000000000000 --- a/dev/dask_integration/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -FROM arrow_integration_xenial_base - -ADD . /apache-arrow -WORKDIR /apache-arrow - -CMD arrow/dev/dask_integration/dask_integration.sh diff --git a/dev/dask_integration/dask_integration.sh b/dev/dask_integration/dask_integration.sh deleted file mode 100755 index f4999c0ae447f..0000000000000 --- a/dev/dask_integration/dask_integration.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Set up environment and working directory -cd /apache-arrow - -conda activate pyarrow-dev - -# install pytables from defaults for now -conda install -y pytables - -pip install -q git+https://github.com/dask/partd --upgrade --no-deps -pip install -q git+https://github.com/dask/zict --upgrade --no-deps -pip install -q git+https://github.com/dask/distributed --upgrade --no-deps -pip install -q git+https://github.com/mrocklin/sparse --upgrade --no-deps -pip install -q git+https://github.com/dask/s3fs --upgrade --no-deps - -conda install -y -q -c conda-forge numba cython \ - bcolz \ - blosc \ - bokeh \ - boto3 \ - chest \ - cloudpickle \ - coverage \ - cytoolz \ - distributed \ - graphviz \ - h5py \ - partd \ - psutil \ - "pytest<=3.1.1" \ - scikit-image \ - scikit-learn \ - sqlalchemy \ - toolz - -pip install -q git+https://github.com/dask/fastparquet - -pip install -q \ - cachey \ - graphviz \ - moto \ - pyarrow \ - --upgrade --no-deps - -pip install -q \ - cityhash \ - flake8 \ - mmh3 \ - pandas_datareader \ - pytest-xdist \ - xxhash \ - pycodestyle - -export ARROW_BUILD_TYPE=release -export ARROW_HOME=$(pwd)/dist -export PARQUET_HOME=$(pwd)/dist -CONDA_BASE=/home/ubuntu/miniconda -export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} - -# Allow for --user Python installation inside Docker -export HOME=$(pwd) - -# Clean up and get the dask master branch from github -rm -rf dask .local -export GIT_COMMITTER_NAME="Nobody" -export GIT_COMMITTER_EMAIL="nobody@nowhere.com" -git clone https://github.com/dask/dask.git -pushd dask -pip install --user -e .[complete] -# Verify integrity of the installed dask dataframe code -py.test dask/dataframe/tests/test_dataframe.py -popd - -# Run the integration test -pushd arrow/python/testing -py.test dask_tests -popd - -pushd dask/dask/dataframe/io -py.test tests/test_parquet.py -popd diff --git a/dev/tasks/tests.yml b/dev/tasks/tests.yml index d9493b606e5a0..a0c7676ba7312 100644 --- a/dev/tasks/tests.yml +++ b/dev/tasks/tests.yml @@ -36,12 +36,14 @@ groups: - docker-lint - docker-iwyu - docker-clang-format - - docker-hdfs-integration - docker-pandas-master + - docker-hdfs-integration + - docker-dask-integration integration: - - docker-hdfs-integration - docker-pandas-master + - docker-dask-integration + - docker-hdfs-integration cpp-python: - docker-cpp @@ -239,11 +241,27 @@ tasks: ############################## Integration tests ############################ + docker-dask-integration: + platform: linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 3.6 + commands: + - docker-compose build cpp + - docker-compose build python + - docker-compose build dask-integration + - docker-compose run dask-integration + docker-hdfs-integration: platform: linux template: docker-tests/travis.linux.yml params: + environment: + PYTHON_VERSION: 3.6 commands: + - docker-compose build cpp + - docker-compose build python - docker-compose build hdfs-integration - docker-compose run hdfs-integration diff --git a/docker-compose.yml b/docker-compose.yml index d3a7990d5cc23..0a01a7cbe97bf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -222,6 +222,20 @@ services: # - "21050" # hostname: impala + pandas-master: + # Usage: + # export PYTHON_VERSION=3.6 + # docker-compose build cpp + # docker-compose build python + # docker-compose build --no-cache pandas-master + # docker-compose run pandas-master + image: arrow:pandas-master + build: + context: . 
+ dockerfile: integration/pandas/Dockerfile + shm_size: 2G + volumes: *ubuntu-volumes + hdfs-namenode: image: gelog/hadoop shm_size: 2G @@ -231,7 +245,7 @@ services: command: hdfs namenode hostname: hdfs-namenode - hdfs-datanode: + hdfs-datanode-1: image: gelog/hadoop command: hdfs datanode ports: @@ -241,6 +255,17 @@ services: links: - hdfs-namenode:hdfs-namenode + hdfs-datanode-2: + image: gelog/hadoop + command: hdfs datanode + ports: + # The host port is randomly assigned by Docker, to allow scaling + # to multiple DataNodes on the same host + - "50075" + links: + - hdfs-namenode:hdfs-namenode + + # TODO(kszucs): pass hdfs client version explicitly as a build argument hdfs-integration: # Usage: # export PYTHON_VERSION=3.6 @@ -250,7 +275,8 @@ services: # docker-compose run hdfs-integration links: - hdfs-namenode:hdfs-namenode - - hdfs-datanode:hdfs-datanode + - hdfs-datanode-1:hdfs-datanode-1 + - hdfs-datanode-2:hdfs-datanode-2 environment: - ARROW_HDFS_TEST_HOST=hdfs-namenode - ARROW_HDFS_TEST_PORT=9000 @@ -258,22 +284,20 @@ services: build: context: . dockerfile: integration/hdfs/Dockerfile + volumes: *ubuntu-volumes - pandas-master: + # TODO(kszucs): pass dask version explicitly as a build argument + dask-integration: # Usage: # export PYTHON_VERSION=3.6 # docker-compose build cpp # docker-compose build python - # docker-compose build --no-cache pandas-master - # docker-compose run pandas-master - image: arrow:pandas-master + # docker-compose build dask-integration + # docker-compose run dask-integration build: context: . - dockerfile: integration/pandas/Dockerfile - shm_size: 2G + dockerfile: integration/dask/Dockerfile volumes: *ubuntu-volumes - - # TODO(kszucs): dask-integration # TODO(kszucs): hive-integration # TODO(kszucs): spark-integration diff --git a/python/testing/test_hdfs.sh b/integration/dask/Dockerfile old mode 100755 new mode 100644 similarity index 68% rename from python/testing/test_hdfs.sh rename to integration/dask/Dockerfile index 016e54a66a671..5e054c51c561e --- a/python/testing/test_hdfs.sh +++ b/integration/dask/Dockerfile @@ -1,5 +1,3 @@ -#!/usr/bin/env bash -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,9 +15,17 @@ # specific language governing permissions and limitations # under the License. -set -ex +FROM arrow:python-3.6 + +# setup /etc/localtime +RUN DEBIAN_FRONTEND=noninteractive \ + apt-get install -y -q tzdata + +# install dask release from conda +RUN conda install -c conda-forge dask pytest=3 && \ + conda clean --all -docker build -t arrow-hdfs-test -f hdfs/Dockerfile . -bash hdfs/restart_docker_container.sh -docker exec -it arrow-hdfs /io/hdfs/run_tests.sh -docker stop arrow-hdfs +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + arrow/integration/dask/runtest.sh diff --git a/integration/dask/runtest.sh b/integration/dask/runtest.sh new file mode 100755 index 0000000000000..9a37e0a67ba9b --- /dev/null +++ b/integration/dask/runtest.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +# check that optional pyarrow modules are available +# because pytest would just skip the dask tests +python -c "import pyarrow.orc" +python -c "import pyarrow.parquet" + +# TODO(kszucs): the following tests are also uses pyarrow +# pytest -sv --pyargs dask.bytes.tests.test_s3 +# pytest -sv --pyargs dask.bytes.tests.test_hdfs +# pytest -sv --pyargs dask.bytes.tests.test_local + +pytest -v --pyargs dask.dataframe.io.tests.test_orc +pytest -v --pyargs dask.dataframe.io.tests.test_parquet +pytest -v --pyargs dask.dataframe.tests.test_dataframe diff --git a/integration/hdfs/Dockerfile b/integration/hdfs/Dockerfile index a1d3e4eb0a598..4fc266f267e76 100644 --- a/integration/hdfs/Dockerfile +++ b/integration/hdfs/Dockerfile @@ -15,63 +15,35 @@ # specific language governing permissions and limitations # under the License. -FROM gelog/hadoop +FROM arrow:python-3.6 -RUN apt-get update && \ - apt-get install -y \ - autoconf \ - automake \ - make \ - gcc \ - g++ \ - git \ - wget \ - pkg-config \ - ninja-build - -ENV CC=gcc \ - CXX=g++ \ - PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -# install dependencies -ARG PYTHON_VERSION=3.6 -ADD ci/docker_install_conda.sh \ - ci/conda_env_cpp.yml \ - ci/conda_env_python.yml \ - /arrow/ci/ -RUN arrow/ci/docker_install_conda.sh && \ - conda install -c conda-forge \ - --file arrow/ci/conda_env_cpp.yml \ - --file arrow/ci/conda_env_python.yml \ - python=$PYTHON_VERSION && \ - conda clean --all - -# installing in the previous step boost=1.60 and boost-cpp=1.67 gets installed, -# cmake finds 1.60 and parquet fails to compile -# installing it in a separate step, boost=1.60 and boost-cpp=1.64 gets -# installed, cmake finds 1.64 -# libhdfs3 needs to be pinned, see ARROW-1465 and ARROW-1445 +# installing libhdfs3, it needs to be pinned, see ARROW-1465 and ARROW-1445 RUN conda install -y -c conda-forge hdfs3 libhdfs3=2.2.31 && \ conda clean --all +# installing libhdfs (JNI) +ARG HADOOP_VERSION=2.6.5 +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_HOME=/usr/local/hadoop \ + HADOOP_OPTS=-Djava.library.path=/usr/local/hadoop/lib/native \ + PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin +RUN apt-get update -y && \ + apt-get install -y openjdk-8-jdk && \ + wget -q -O hadoop-$HADOOP_VERSION.tar.gz "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" && \ + tar -zxf /hadoop-$HADOOP_VERSION.tar.gz && \ + rm /hadoop-$HADOOP_VERSION.tar.gz && \ + mv hadoop-$HADOOP_VERSION /usr/local/hadoop +ADD integration/hdfs/hdfs-site.xml $HADOOP_HOME/etc/hadoop/ + # build cpp with tests -ENV ARROW_HDFS=ON \ +ENV CC=gcc \ + CXX=g++ \ + ARROW_ORC=ON \ + ARROW_HDFS=ON \ ARROW_PYTHON=ON \ - ARROW_BUILD_TESTS=ON \ - LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${HADOOP_HOME}/lib/native" -ADD ci/docker_build_cpp.sh /arrow/ci/ -ADD cpp /arrow/cpp -ADD format /arrow/format -ADD java/pom.xml /arrow/java/pom.xml -RUN arrow/ci/docker_build_cpp.sh - -# build python -ADD ci/docker_build_python.sh /arrow/ci/ -ADD python /arrow/python -RUN arrow/ci/docker_build_python.sh + 
ARROW_BUILD_TESTS=ON -# execute integration tests -ENV LIBHDFS3_CONF=/arrow/integration/hdfs/libhdfs3.xml -ADD integration /arrow/integration -CMD arrow/integration/hdfs/runtest.sh +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + arrow/integration/hdfs/runtest.sh diff --git a/integration/hdfs/hdfs-site.xml b/integration/hdfs/hdfs-site.xml new file mode 100644 index 0000000000000..a80b945a664b7 --- /dev/null +++ b/integration/hdfs/hdfs-site.xml @@ -0,0 +1,44 @@ + + + + + + + + + dfs.replication + 2 + + + dfs.datanode.data.dir + file:///data/dfs/data + + + dfs.namenode.name.dir + file:///data/dfs/name + + + dfs.namenode.checkpoint.dir + file:///data/dfs/namesecondary + + + dfs.namenode.datanode.registration.ip-hostname-check + false + + + dfs.default.replica + 1 + + diff --git a/integration/hdfs/libhdfs3.xml b/integration/hdfs/libhdfs3.xml deleted file mode 100644 index f929929b386da..0000000000000 --- a/integration/hdfs/libhdfs3.xml +++ /dev/null @@ -1,332 +0,0 @@ - - - - - - - - - - - - - - - rpc.client.timeout - 3600000 - - timeout interval of a RPC invocation in millisecond. default is 3600000. - - - - rpc.client.connect.tcpnodelay - true - - whether set socket TCP_NODELAY to true when connect to RPC server. default is true. - - - - - rpc.client.max.idle - 10000 - - the max idle time of a RPC connection in millisecond. default is 10000. - - - - - rpc.client.ping.interval - 10000 - - the interval which the RPC client send a heart beat to server. 0 means disable, default is 10000. - - - - - rpc.client.connect.timeout - 600000 - - the timeout interval in millisecond when the RPC client is trying to setup the connection. default is 600000. - - - - - rpc.client.connect.retry - 10 - - the max retry times if the RPC client fail to setup the connection to server. default is 10. - - - - - rpc.client.read.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to read from server. default is 3600000. - - - - - rpc.client.write.timeout - 3600000 - - the timeout interval in millisecond when the RPC client is trying to write to server. default is 3600000. - - - - - rpc.client.socket.linger.timeout - -1 - - set value to socket SO_LINGER when connect to RPC server. -1 means default OS value. default is -1. - - - - - - dfs.client.read.shortcircuit - false - - whether reading block file bypass datanode if the block and the client are on the same node. default is true. - - - - - dfs.default.replica - 1 - - the default number of replica. default is 3. - - - - - dfs.prefetchsize - 10 - - the default number of blocks which information will be prefetched. default is 10. - - - - - dfs.client.failover.max.attempts - 15 - - if multiply namenodes are configured, it is the max retry times when the dfs client try to issue a RPC call. default is 15. - - - - - dfs.default.blocksize - 134217728 - - default block size. default is 134217728. - - - - - dfs.client.log.severity - INFO - - the minimal log severity level, valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO. - - - - - - input.connect.timeout - 600000 - - the timeout interval in millisecond when the input stream is trying to setup the connection to datanode. default is 600000. - - - - - input.read.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to read from datanode. default is 3600000. 
- - - - - input.write.timeout - 3600000 - - the timeout interval in millisecond when the input stream is trying to write to datanode. default is 3600000. - - - - - input.localread.default.buffersize - 2097152 - - number of bytes of the buffer which is used to hold the data from block file and verify checksum. - it is only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576. - - - - - input.localread.blockinfo.cachesize - 1000 - - the size of block file path information cache. default is 1000. - - - - - input.read.getblockinfo.retry - 3 - - the max retry times when the client fail to get block information from namenode. default is 3. - - - - - - output.replace-datanode-on-failure - false - - whether the client add new datanode into pipeline if the number of nodes in pipeline is less the specified number of replicas. default is false. - - - - - output.default.chunksize - 512 - - the number of bytes of a chunk in pipeline. default is 512. - - - - - output.default.packetsize - 65536 - - the number of bytes of a packet in pipeline. default is 65536. - - - - - output.default.write.retry - 10 - - the max retry times when the client fail to setup the pipeline. default is 10. - - - - - output.connect.timeout - 600000 - - the timeout interval in millisecond when the output stream is trying to setup the connection to datanode. default is 600000. - - - - - output.read.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to read from datanode. default is 3600000. - - - - - output.write.timeout - 3600000 - - the timeout interval in millisecond when the output stream is trying to write to datanode. default is 3600000. - - - - - output.packetpool.size - 1024 - - the max number of packets in a file's packet pool. default is 1024. - - - - - output.close.timeout - 900000 - - the timeout interval in millisecond when close an output stream. default is 900000. - - - - - dfs.domain.socket.path - /var/lib/hadoop-hdfs/dn_socket - - Optional. This is a path to a UNIX domain socket that will be used for - communication between the DataNode and local HDFS clients. - If the string "_PORT" is present in this path, it will be replaced by the - TCP port of the DataNode. - - - - - dfs.client.use.legacy.blockreader.local - false - - Legacy short-circuit reader implementation based on HDFS-2246 is used - if this configuration parameter is true. - This is for the platforms other than Linux - where the new implementation based on HDFS-347 is not available. 
- - - - diff --git a/integration/hdfs/runtest.sh b/integration/hdfs/runtest.sh index a90eb93645369..2f090c8a81fba 100755 --- a/integration/hdfs/runtest.sh +++ b/integration/hdfs/runtest.sh @@ -20,9 +20,17 @@ set -e export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` +export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop +export LIBHDFS3_CONF=$HADOOP_CONF_DIR/hdfs-site.xml +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/ +# execute cpp tests pushd /build/cpp - debug/io-hdfs-test + debug/arrow-io-hdfs-test popd -pytest -v --pyargs pyarrow +# cannot use --pyargs with custom arguments like --hdfs or --only-hdfs, because +# pytest ignores them, see https://github.com/pytest-dev/pytest/issues/3517 +export PYARROW_TEST_ONLY_HDFS=ON + +pytest -v --pyargs pyarrow.tests.test_hdfs diff --git a/python/Dockerfile b/python/Dockerfile index 5c2ef1e30d142..a99a4206290f8 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -25,7 +25,8 @@ RUN conda install -c conda-forge \ python=$PYTHON_VERSION && \ conda clean --all -ENV ARROW_PYTHON=ON +ENV ARROW_PYTHON=ON \ + ARROW_BUILD_TESTS=OFF # build and test CMD arrow/ci/docker_build_cpp.sh && \ diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 69e8e82e2532a..3c092cfb60247 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -99,19 +99,36 @@ def pytest_configure(config): def pytest_addoption(parser): + def bool_env(name, default=None): + value = os.environ.get(name.upper()) + if value is None: + return default + value = value.lower() + if value in {'1', 'true', 'on', 'yes', 'y'}: + return True + elif value in {'0', 'false', 'off', 'no', 'n'}: + return False + else: + raise ValueError('{}={} is not parsable as boolean' + .format(name.upper(), value)) + for group in groups: - for flag in ['--{0}', '--enable-{0}']: - parser.addoption(flag.format(group), action='store_true', - default=defaults[group], - help=('Enable the {0} test group'.format(group))) - - parser.addoption('--disable-{0}'.format(group), action='store_true', - default=False, - help=('Disable the {0} test group'.format(group))) - - parser.addoption('--only-{0}'.format(group), action='store_true', - default=False, - help=('Run only the {0} test group'.format(group))) + for flag, envvar in [('--{}', 'PYARROW_TEST_{}'), + ('--enable-{}', 'PYARROW_TEST_ENABLE_{}')]: + default = bool_env(envvar.format(group), defaults[group]) + parser.addoption(flag.format(group), + action='store_true', default=default, + help=('Enable the {} test group'.format(group))) + + default = bool_env('PYARROW_TEST_DISABLE_{}'.format(group), False) + parser.addoption('--disable-{}'.format(group), + action='store_true', default=default, + help=('Disable the {} test group'.format(group))) + + default = bool_env('PYARROW_TEST_ONLY_{}'.format(group), False) + parser.addoption('--only-{}'.format(group), + action='store_true', default=default, + help=('Run only the {} test group'.format(group))) parser.addoption('--runslow', action='store_true', default=False, help='run slow tests') diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index 81b03b6fb7e4e..f218a1604a9d9 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -15,21 +15,22 @@ # specific language governing permissions and limitations # under the License. 
-from io import BytesIO -from os.path import join as pjoin import os import pickle +import pytest import random import unittest +import pandas.util.testing as pdt + +from io import BytesIO +from os.path import join as pjoin import numpy as np -import pandas.util.testing as pdt -import pytest +import pyarrow as pa +import pyarrow.tests.test_parquet as test_parquet from pyarrow.compat import guid -import pyarrow as pa -import pyarrow.tests.test_parquet as test_parquet # ---------------------------------------------------------------------- # HDFS tests @@ -406,3 +407,25 @@ def _get_hdfs_uri(path): uri = "hdfs://{}:{}{}".format(host, port, path) return uri + + +@pytest.mark.parquet +@pytest.mark.fastparquet +@pytest.mark.parametrize('client', ['libhdfs', 'libhdfs3']) +def test_fastparquet_read_with_hdfs(client): + import pyarrow.parquet as pq + fastparquet = pytest.importorskip('fastparquet') + + fs = hdfs_test_client(client) + + df = pdt.makeDataFrame() + table = pa.Table.from_pandas(df) + + path = '/tmp/testing.parquet' + with fs.open(path, 'wb') as f: + pq.write_table(table, f) + + parquet_file = fastparquet.ParquetFile(path, open_with=fs.open) + + result = parquet_file.to_pandas() + pdt.assert_frame_equal(result, df) diff --git a/python/testing/README.md b/python/testing/README.md deleted file mode 100644 index d7d0ff0bb7f47..0000000000000 --- a/python/testing/README.md +++ /dev/null @@ -1,42 +0,0 @@ - - -# Testing tools for odds and ends - -## Testing Dask integration - -Initial integration testing with Dask has been Dockerized. -To invoke the test run the following command in the `arrow` -root-directory: - -```shell -bash dev/dask_integration.sh -``` - -This script will create a `dask` directory on the same level as -`arrow`. It will clone the Dask project from Github into `dask` -and do a Python `--user` install. The Docker code will use the parent -directory of `arrow` as `$HOME` and that's where Python will -install `dask` into a `.local` directory. - -The output of the Docker session will contain the results of tests -of the Dask dataframe followed by the single integration test that -now exists for Arrow. That test creates a set of `csv`-files and then -does parallel reading of `csv`-files into a Dask dataframe. The code -for this test resides here in the `dask_test` directory. diff --git a/python/testing/dask_tests/test_dask_integration.py b/python/testing/dask_tests/test_dask_integration.py deleted file mode 100644 index 842c45f57d1f7..0000000000000 --- a/python/testing/dask_tests/test_dask_integration.py +++ /dev/null @@ -1,58 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import pytest - -from datetime import date, timedelta -import csv -from random import randint - -import pyarrow as pa - -dd = pytest.importorskip('dask.dataframe') - - -def make_datafiles(tmpdir, prefix='data', num_files=20): - rowcount = 5000 - fieldnames = ['date', 'temperature', 'dewpoint'] - start_date = date(1900, 1, 1) - for i in range(num_files): - filename = '{0}/{1}-{2}.csv'.format(tmpdir, prefix, i) - with open(filename, 'w') as outcsv: - writer = csv.DictWriter(outcsv, fieldnames) - writer.writeheader() - the_date = start_date - for _ in range(rowcount): - temperature = randint(-10, 35) - dewpoint = temperature - randint(0, 10) - writer.writerow({'date': the_date, 'temperature': temperature, - 'dewpoint': dewpoint}) - the_date += timedelta(days=1) - - -def test_dask_file_read(tmpdir): - prefix = 'data' - make_datafiles(tmpdir, prefix) - # Read all datafiles in parallel - datafiles = '{0}/{1}-*.csv'.format(tmpdir, prefix) - dask_df = dd.read_csv(datafiles) - # Convert Dask dataframe to Arrow table - table = pa.Table.from_pandas(dask_df.compute()) - # Second column (1) is temperature - dask_temp = int(1000 * dask_df['temperature'].mean().compute()) - arrow_temp = int(1000 * table[1].to_pandas().mean()) - assert dask_temp == arrow_temp diff --git a/python/testing/functions.sh b/python/testing/functions.sh deleted file mode 100644 index 983f490331ff8..0000000000000 --- a/python/testing/functions.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -use_gcc() { - export CC=gcc-4.9 - export CXX=g++-4.9 -} - -use_clang() { - export CC=clang-4.0 - export CXX=clang++-4.0 -} - -bootstrap_python_env() { - PYTHON_VERSION=$1 - CONDA_ENV_DIR=$BUILD_DIR/pyarrow-test-$PYTHON_VERSION - - conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl - conda activate $CONDA_ENV_DIR - - python --version - which python - - # faster builds, please - conda install -y -q nomkl pip numpy pandas cython -} - -build_pyarrow() { - # Other stuff pip install - pushd $ARROW_PYTHON_DIR - pip install -r requirements.txt - python setup.py build_ext --with-parquet --with-plasma \ - install --single-version-externally-managed --record=record.text - popd - - python -c "import pyarrow.parquet" - python -c "import pyarrow.plasma" - - export PYARROW_PATH=$CONDA_PREFIX/lib/python$PYTHON_VERSION/site-packages/pyarrow -} - -build_arrow() { - mkdir -p $ARROW_CPP_BUILD_DIR - pushd $ARROW_CPP_BUILD_DIR - - cmake -GNinja \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DARROW_NO_DEPRECATED_API=ON \ - -DARROW_PARQUET=ON \ - -DARROW_PYTHON=ON \ - -DARROW_PLASMA=ON \ - -DARROW_BOOST_USE_SHARED=off \ - $ARROW_CPP_DIR - - ninja - ninja install - popd -} diff --git a/python/testing/parquet_interop.py b/python/testing/parquet_interop.py deleted file mode 100644 index 6d41ba4b6a5f1..0000000000000 --- a/python/testing/parquet_interop.py +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os - -import fastparquet -import pyarrow as pa -import pyarrow.parquet as pq -import pandas.util.testing as tm - - -def hdfs_test_client(driver='libhdfs'): - host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost') - user = os.environ['ARROW_HDFS_TEST_USER'] - try: - port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 20500)) - except ValueError: - raise ValueError('Env variable ARROW_HDFS_TEST_PORT was not ' - 'an integer') - - return pa.HdfsClient(host, port, user, driver=driver) - - -def test_fastparquet_read_with_hdfs(): - fs = hdfs_test_client() - - df = tm.makeDataFrame() - table = pa.Table.from_pandas(df) - - path = '/tmp/testing.parquet' - with fs.open(path, 'wb') as f: - pq.write_table(table, f) - - parquet_file = fastparquet.ParquetFile(path, open_with=fs.open) - - result = parquet_file.to_pandas() - tm.assert_frame_equal(result, df) diff --git a/python/testing/set_env_common.sh b/python/testing/set_env_common.sh deleted file mode 100644 index 00251f92be4b4..0000000000000 --- a/python/testing/set_env_common.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -export MINICONDA=$HOME/miniconda -export CPP_TOOLCHAIN=$HOME/cpp-toolchain - -export PATH="$MINICONDA/bin:$PATH" -export CONDA_PKGS_DIRS=$HOME/.conda_packages - -export ARROW_CHECKOUT=$HOME/arrow -export BUILD_DIR=$ARROW_CHECKOUT - -export BUILD_OS_NAME=linux -export BUILD_TYPE=debug - -export ARROW_CPP_DIR=$BUILD_DIR/cpp -export ARROW_PYTHON_DIR=$BUILD_DIR/python -export ARROW_C_GLIB_DIR=$BUILD_DIR/c_glib -export ARROW_JAVA_DIR=${BUILD_DIR}/java -export ARROW_JS_DIR=${BUILD_DIR}/js -export ARROW_INTEGRATION_DIR=$BUILD_DIR/integration - -export CPP_BUILD_DIR=$BUILD_DIR/cpp-build - -export ARROW_CPP_INSTALL=$BUILD_DIR/cpp-install -export ARROW_CPP_BUILD_DIR=$BUILD_DIR/cpp-build -export ARROW_C_GLIB_INSTALL=$BUILD_DIR/c-glib-install - -export ARROW_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN -export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN - -export BOOST_ROOT=$CPP_TOOLCHAIN -export PATH=$CPP_TOOLCHAIN/bin:$PATH -export LD_LIBRARY_PATH=$CPP_TOOLCHAIN/lib:$LD_LIBRARY_PATH - -export VALGRIND="valgrind --tool=memcheck" - -export ARROW_HOME=$CPP_TOOLCHAIN -export PARQUET_HOME=$CPP_TOOLCHAIN - -# Arrow test variables - -export JAVA_HOME=/usr/lib/jvm/java-7-oracle -export HADOOP_HOME=/usr/lib/hadoop -export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` -export HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$HADOOP_HOME/lib/native" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/ - -export ARROW_HDFS_TEST_HOST=arrow-hdfs -export ARROW_HDFS_TEST_PORT=9000 -export ARROW_HDFS_TEST_USER=ubuntu -export ARROW_LIBHDFS_DIR=/usr/lib - -export LIBHDFS3_CONF=/io/hdfs/libhdfs3-hdfs-client.xml diff --git a/python/testing/setup_toolchain.sh b/python/testing/setup_toolchain.sh deleted file mode 100644 index 498206ef33a79..0000000000000 --- a/python/testing/setup_toolchain.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -e - -export PATH="$MINICONDA/bin:$PATH" -conda update -y -q conda -conda config --set auto_update_conda false -conda info -a - -conda config --set show_channel_urls True - -# Help with SSL timeouts to S3 -conda config --set remote_connect_timeout_secs 12 - -conda config --add channels https://repo.continuum.io/pkgs/free -conda config --add channels conda-forge -conda info -a - -# faster builds, please -conda install -y nomkl - -conda install --y conda-build jinja2 anaconda-client cmake curl - -# Set up C++ toolchain -conda create -y -q -p $CPP_TOOLCHAIN python=3.6 \ - jemalloc=4.4.0 \ - nomkl \ - boost-cpp \ - rapidjson \ - flatbuffers \ - gflags \ - lz4-c \ - snappy \ - zstd \ - brotli \ - zlib \ - git \ - cmake \ - curl \ - thrift-cpp \ - libhdfs3 \ - glog \ - ninja - -if [ $BUILD_OS_NAME == "osx" ]; then - brew update && brew bundle --file=python/Brewfile -fi From 5c48bdb5de7d46b0cf3a479f393224688474b940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 17 Dec 2018 10:55:20 -0600 Subject: [PATCH 19/80] ARROW-2637: [C++/Python] Build support and instructions for development on Alpine Linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build support is tested by `docker-compose run cpp-alpine` Author: Krisztián Szűcs Closes #3191 from kszucs/ARROW-2637 and squashes the following commits: ea43e08ee add bash to run the tests 348e982a0 add README instructions to build arrow on alpine linux --- cpp/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/README.md b/cpp/README.md index 010387dbd4de3..a94c4be4f2cd4 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -46,6 +46,18 @@ sudo apt-get install \ libboost-system-dev ``` +On Alpine Linux: + +```shell +apk add autoconf \ + bash \ + boost-dev \ + cmake \ + g++ \ + gcc \ + make +``` + On macOS, you can use [Homebrew][1]: ```shell From 0b78f4bdf70617279bbd3997ed79a7194cf66438 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 17 Dec 2018 15:11:25 -0600 Subject: [PATCH 20/80] ARROW-4033: [C++] Use readlink -f instead of realpath in dependency download script This documentation might be better moved to the Sphinx docs. Author: Wes McKinney Closes #3205 from wesm/ARROW-4033 and squashes the following commits: 21349f02f Use readlink -f instead of realpath --- cpp/thirdparty/README.md | 45 +++++++++++++++---------- cpp/thirdparty/download_dependencies.sh | 2 +- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/cpp/thirdparty/README.md b/cpp/thirdparty/README.md index bd1cb28d81818..0353395dfb1ff 100644 --- a/cpp/thirdparty/README.md +++ b/cpp/thirdparty/README.md @@ -29,17 +29,24 @@ offline builds. 
To set up your own specific build toolchain, here are the relevant environment variables +* brotli: `BROTLI_HOME`, can be disabled with `-DARROW_WITH_BROTLI=off` * Boost: `BOOST_ROOT` +* double-conversion: `DOUBLE_CONVERSION_HOME` * Googletest: `GTEST_HOME` (only required to build the unit tests) * gflags: `GFLAGS_HOME` (only required to build the unit tests) +* glog: `GLOG_HOME` (only required if `ARROW_USE_GLOG=ON`) * Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks) * Flatbuffers: `FLATBUFFERS_HOME` (only required for -DARROW_IPC=on, which is the default) * Hadoop: `HADOOP_HOME` (only required for the HDFS I/O extensions) * jemalloc: `JEMALLOC_HOME` -* brotli: `BROTLI_HOME`, can be disabled with `-DARROW_WITH_BROTLI=off` * lz4: `LZ4_HOME`, can be disabled with `-DARROW_WITH_LZ4=off` +* Apache ORC: `ORC_HOME` +* protobuf: `PROTOBUF_HOME` +* rapidjson: `RAPIDJSON_HOME` +* re2: `RE2_HOME` (only required to build Gandiva currently) * snappy: `SNAPPY_HOME`, can be disabled with `-DARROW_WITH_SNAPPY=off` +* thrift: `THRIFT_HOME` * zlib: `ZLIB_HOME`, can be disabled with `-DARROW_WITH_ZLIB=off` * zstd: `ZSTD_HOME`, can be disabled with `-DARROW_WITH_ZSTD=off` @@ -69,24 +76,26 @@ script: ```shell # Download tarballs into `$HOME/arrow-thirdparty-deps` -$ ./thirdparty/download_dependencies $HOME/arrow-thirdparty-deps -# some output omitted - +$ ./thirdparty/download_dependencies $HOME/arrow-thirdparty # Environment variables for offline Arrow build -export ARROW_BOOST_URL=$HOME/arrow-thirdparty-deps/boost.tar.gz -export ARROW_GTEST_URL=$HOME/arrow-thirdparty-deps/gtest.tar.gz -export ARROW_GFLAGS_URL=$HOME/arrow-thirdparty-deps/gflags.tar.gz -export ARROW_GBENCHMARK_URL=$HOME/arrow-thirdparty-deps/gbenchmark.tar.gz -export ARROW_FLATBUFFERS_URL=$HOME/arrow-thirdparty-deps/flatbuffers.tar.gz -export ARROW_RAPIDJSON_URL=$HOME/arrow-thirdparty-deps/rapidjson.tar.gz -export ARROW_SNAPPY_URL=$HOME/arrow-thirdparty-deps/snappy.tar.gz -export ARROW_BROTLI_URL=$HOME/arrow-thirdparty-deps/brotli.tar.gz -export ARROW_LZ4_URL=$HOME/arrow-thirdparty-deps/lz4.tar.gz -export ARROW_ZLIB_URL=$HOME/arrow-thirdparty-deps/zlib.tar.gz -export ARROW_ZSTD_URL=$HOME/arrow-thirdparty-deps/zstd.tar.gz -export ARROW_PROTOBUF_URL=$HOME/arrow-thirdparty-deps/protobuf.tar.gz -export ARROW_GRPC_URL=$HOME/arrow-thirdparty-deps/grpc.tar.gz -export ARROW_ORC_URL=$HOME/arrow-thirdparty-deps/orc.tar.gz +export ARROW_BOOST_URL=$HOME/arrow-thirdparty/boost-1.67.0.tar.gz +export ARROW_BROTLI_URL=$HOME/arrow-thirdparty/brotli-v0.6.0.tar.gz +export ARROW_DOUBLE_CONVERSION_URL=$HOME/arrow-thirdparty/double-conversion-v3.1.1.tar.gz +export ARROW_FLATBUFFERS_URL=$HOME/arrow-thirdparty/flatbuffers-02a7807dd8d26f5668ffbbec0360dc107bbfabd5.tar.gz +export ARROW_GBENCHMARK_URL=$HOME/arrow-thirdparty/gbenchmark-v1.4.1.tar.gz +export ARROW_GFLAGS_URL=$HOME/arrow-thirdparty/gflags-v2.2.0.tar.gz +export ARROW_GLOG_URL=$HOME/arrow-thirdparty/glog-v0.3.5.tar.gz +export ARROW_GRPC_URL=$HOME/arrow-thirdparty/grpc-v1.14.1.tar.gz +export ARROW_GTEST_URL=$HOME/arrow-thirdparty/gtest-1.8.0.tar.gz +export ARROW_LZ4_URL=$HOME/arrow-thirdparty/lz4-v1.7.5.tar.gz +export ARROW_ORC_URL=$HOME/arrow-thirdparty/orc-1.5.1.tar.gz +export ARROW_PROTOBUF_URL=$HOME/arrow-thirdparty/protobuf-v3.6.1.tar.gz +export ARROW_RAPIDJSON_URL=$HOME/arrow-thirdparty/rapidjson-v1.1.0.tar.gz +export ARROW_RE2_URL=$HOME/arrow-thirdparty/re2-2018-10-01.tar.gz +export ARROW_SNAPPY_URL=$HOME/arrow-thirdparty/snappy-1.1.3.tar.gz +export 
ARROW_THRIFT_URL=$HOME/arrow-thirdparty/thrift-0.11.0.tar.gz +export ARROW_ZLIB_URL=$HOME/arrow-thirdparty/zlib-1.2.8.tar.gz +export ARROW_ZSTD_URL=$HOME/arrow-thirdparty/zstd-v1.3.7.tar.gz ``` This can be automated by using inline source/eval: diff --git a/cpp/thirdparty/download_dependencies.sh b/cpp/thirdparty/download_dependencies.sh index de7d23ca2ef5e..f782963dd1450 100755 --- a/cpp/thirdparty/download_dependencies.sh +++ b/cpp/thirdparty/download_dependencies.sh @@ -30,7 +30,7 @@ else DESTDIR=$1 fi -DESTDIR=$(realpath "${DESTDIR}") +DESTDIR=$(readlink -f "${DESTDIR}") download_dependency() { local url=$1 From 9fcce64e6108dd911c9cfcd4121ea33e2b447c91 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 17 Dec 2018 15:23:42 -0600 Subject: [PATCH 21/80] ARROW-4026: [C++] Add *-all, *-tests, *-benchmarks modular CMake targets. Use in Travis CI This provides much more granular control over what targets are built. Before this patch `ninja arrow` would build all libraries _and_ tests if `ARROW_BUILD_TESTS=ON`. If you wanted to build the tests for a dependent target of the Arrow libraries, like Parquet or Plasma, you were forced to build the Arrow core unit tests. Now you can do ``` ninja parquet-tests ``` And it will only build the Arrow and Parquet libraries, and the tests labeled with "parquet-tests". Similarly this allows you to rebuild the libraries without necessarily having to relink all the unit tests (e.g. with `ninja arrow` or `ninja parquet`) Author: Wes McKinney Closes #3204 from wesm/ARROW-4026 and squashes the following commits: 1e41eee2d Misc fixes, add missing toolchain dependency 420282433 Add *-all, *-tests, *-benchmarks modular build targets. Use in Travis CI --- .travis.yml | 8 +-- ci/travis_before_script_cpp.sh | 4 -- ci/travis_script_gandiva_cpp.sh | 2 +- ci/travis_script_python.sh | 5 +- cpp/CMakeLists.txt | 17 ++++--- cpp/README.md | 8 ++- cpp/cmake_modules/BuildUtils.cmake | 52 +++++++++----------- cpp/cmake_modules/ThirdpartyToolchain.cmake | 30 ++++------- cpp/src/arrow/CMakeLists.txt | 19 +++++-- cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt | 5 +- cpp/src/arrow/flight/CMakeLists.txt | 1 - cpp/src/arrow/gpu/CMakeLists.txt | 13 ++++- cpp/src/arrow/ipc/CMakeLists.txt | 2 +- cpp/src/arrow/python/CMakeLists.txt | 5 +- cpp/src/gandiva/CMakeLists.txt | 15 +++--- cpp/src/gandiva/jni/CMakeLists.txt | 2 - cpp/src/gandiva/precompiled/CMakeLists.txt | 2 +- cpp/src/parquet/CMakeLists.txt | 25 ++++++---- cpp/src/parquet/arrow/CMakeLists.txt | 4 +- cpp/src/plasma/CMakeLists.txt | 6 ++- 20 files changed, 122 insertions(+), 103 deletions(-) diff --git a/.travis.yml b/.travis.yml index d22a4e7df0fea..bf0261b3fa1ea 100644 --- a/.travis.yml +++ b/.travis.yml @@ -112,9 +112,7 @@ matrix: - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_TRAVIS_GANDIVA_TESTS=1 - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - - ARROW_CPP_BUILD_TARGETS="gandiva" - # TODO(wesm): Remove this after ARROW-4026 - - ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS="gandiva" + - ARROW_CPP_BUILD_TARGETS="gandiva-all" - ARROW_TRAVIS_USE_TOOLCHAIN=1 # ARROW-3979 temporarily disabled. 
- ARROW_TRAVIS_VALGRIND=0 @@ -164,9 +162,7 @@ matrix: - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_TRAVIS_GANDIVA_TESTS=1 - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - - ARROW_CPP_BUILD_TARGETS="gandiva" - # TODO(wesm): Remove this after ARROW-4026 - - ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS="gandiva" + - ARROW_CPP_BUILD_TARGETS="gandiva-all" - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN before_script: diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 6cb7d6074f230..aa5b2a6ab084c 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -42,7 +42,6 @@ fi CMAKE_COMMON_FLAGS="\ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ --DARROW_TEST_INCLUDE_LABELS=$ARROW_TRAVIS_CPP_TEST_INCLUDE_LABELS \ -DARROW_NO_DEPRECATED_API=ON \ -DARROW_EXTRA_ERROR_CONTEXT=ON" CMAKE_LINUX_FLAGS="" @@ -102,9 +101,6 @@ if [ $ARROW_TRAVIS_GANDIVA == "1" ]; then if [ $ARROW_TRAVIS_GANDIVA_JAVA == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA_JAVA=ON" fi - if [ $ARROW_TRAVIS_GANDIVA_TESTS == "1" ]; then - CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BUILD_TESTS=ON" - fi fi if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then diff --git a/ci/travis_script_gandiva_cpp.sh b/ci/travis_script_gandiva_cpp.sh index f3c379393fe14..bc4a7a9a8f03b 100755 --- a/ci/travis_script_gandiva_cpp.sh +++ b/ci/travis_script_gandiva_cpp.sh @@ -23,7 +23,7 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh pushd $CPP_BUILD_DIR -PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva +PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva-tests popd diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index b8385c3834266..20ec57efc39e4 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -87,21 +87,20 @@ rm -rf * # XXX Can we simply reuse CMAKE_COMMON_FLAGS from travis_before_script_cpp.sh? CMAKE_COMMON_FLAGS="-DARROW_EXTRA_ERROR_CONTEXT=ON" -PYTHON_CPP_BUILD_TARGETS="arrow_python plasma" +PYTHON_CPP_BUILD_TARGETS="arrow_python-all plasma" if [ $ARROW_TRAVIS_COVERAGE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON" fi if [ $ARROW_TRAVIS_PYTHON_GANDIVA == "1" ]; then - CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=OFF" + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON" PYTHON_CPP_BUILD_TARGETS="$PYTHON_CPP_BUILD_TARGETS gandiva" fi cmake -GNinja \ $CMAKE_COMMON_FLAGS \ -DARROW_BUILD_TESTS=ON \ - -DARROW_TEST_INCLUDE_LABELS=python \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_OPTIONAL_INSTALL=ON \ -DARROW_PLASMA=on \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f563199c62470..60cbe85d10b6d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -333,10 +333,6 @@ Always OFF if building binaries" #---------------------------------------------------------------------- # Advanced developer options - set(ARROW_TEST_INCLUDE_LABELS "" CACHE STRING - "Only build unit tests having the indicated label or labels. 
\ -Pass multiple labels by dividing with semicolons") - option(ARROW_EXTRA_ERROR_CONTEXT "Compile with extra error context (line numbers, code)" OFF) @@ -466,10 +462,18 @@ endif() if(NOT ARROW_BUILD_TESTS) set(NO_TESTS 1) +else() + add_custom_target(all-tests) + add_custom_target(unittest ctest -L unittest) + add_dependencies(unittest all-tests) endif() if(NOT ARROW_BUILD_BENCHMARKS) set(NO_BENCHMARKS 1) +else() + add_custom_target(all-benchmarks) + add_custom_target(benchmark ctest -L benchmark) + add_dependencies(benchmark all-benchmarks) endif() if(NOT ARROW_BUILD_EXAMPLES) @@ -516,8 +520,6 @@ include(SetupCxxFlags) # Dependencies ############################################################ -add_custom_target(arrow_dependencies) - include(BuildUtils) enable_testing() @@ -712,6 +714,9 @@ if (ARROW_USE_GLOG) add_definitions("-DARROW_USE_GLOG") endif() +add_custom_target(arrow_dependencies) +add_dependencies(arrow_dependencies toolchain) + if (ARROW_STATIC_LINK_LIBS) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) endif() diff --git a/cpp/README.md b/cpp/README.md index a94c4be4f2cd4..5940db1f44301 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -105,14 +105,18 @@ export LC_ALL="en_US.UTF-8" ## Modular Build Targets Since there are several major parts of the C++ project, we have provided -modular CMake targets for building each component along with its dependencies, -unit tests, and benchmarks (if enabled): +modular CMake targets for building each library component, group of unit tests +and benchmarks, and their dependencies: * `make arrow` for Arrow core libraries * `make parquet` for Parquet libraries * `make gandiva` for Gandiva (LLVM expression compiler) libraries * `make plasma` for Plasma libraries, server +To build the unit tests or benchmarks, add `-tests` or `-benchmarks` to the +target name. So `make arrow-tests` will build the Arrow core unit tests. Using +the `-all` target, e.g. `parquet-all`, will build everything. + If you wish to only build and install one or more project subcomponents, we have provided the CMake option `ARROW_OPTIONAL_INSTALL` to only install targets that have been built. For example, if you only wish to build the Parquet diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 812d0c39e7fa5..7c1db679bf23e 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -308,6 +308,9 @@ endfunction() # \arg PREFIX a string to append to the name of the benchmark executable. For # example, if you have src/arrow/foo/bar-benchmark.cc, then PREFIX "foo" will # create test executable foo-bar-benchmark +# \arg LABELS the benchmark label or labels to assign the unit tests to. By +# default, benchmarks will go in the "benchmark" group. 
Custom targets for the +# group names must exist function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(options) set(one_value_args) @@ -343,20 +346,22 @@ function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(NO_COLOR "") endif() + # Add test as dependency of relevant label targets + add_dependencies(all-benchmarks ${BENCHMARK_NAME}) + foreach (TARGET ${ARG_LABELS}) + add_dependencies(${TARGET} ${BENCHMARK_NAME}) + endforeach() + if (ARG_DEPENDENCIES) add_dependencies(${BENCHMARK_NAME} ${ARG_DEPENDENCIES}) endif() if (ARG_LABELS) - set(ARG_LABELS "${ARG_LABELS}") + set(ARG_LABELS "benchmark;${ARG_LABELS}") else() set(ARG_LABELS benchmark) endif() - foreach (TEST_LABEL ${ARG_LABELS}) - add_dependencies(${TEST_LABEL} ${BENCHMARK_NAME}) - endforeach() - add_test(${BENCHMARK_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) set_property(TEST ${BENCHMARK_NAME} @@ -389,7 +394,7 @@ endfunction() # \arg LABELS the unit test label or labels to assign the unit tests # to. By default, unit tests will go in the "unittest" group, but if we have # multiple unit tests in some subgroup, you can assign a test to multiple -# groups using the syntax unittest;GROUP2;GROUP3. Custom targets for the group +# groups use the syntax unittest;GROUP2;GROUP3. Custom targets for the group # names must exist function(ADD_TEST_CASE REL_TEST_NAME) set(options NO_VALGRIND ENABLED) @@ -401,18 +406,6 @@ function(ADD_TEST_CASE REL_TEST_NAME) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() - if (NOT "${ARROW_TEST_INCLUDE_LABELS}" STREQUAL "") - set(_SKIP_TEST TRUE) - foreach (_INCLUDED_LABEL ${ARROW_TEST_INCLUDE_LABELS}) - if ("${ARG_LABELS}" MATCHES "${_INCLUDED_LABEL}") - set(_SKIP_TEST FALSE) - endif() - endforeach() - if (_SKIP_TEST) - return() - endif() - endif() - if (NO_TESTS AND NOT ARG_ENABLED) return() endif() @@ -422,12 +415,6 @@ function(ADD_TEST_CASE REL_TEST_NAME) set(TEST_NAME "${ARG_PREFIX}-${TEST_NAME}") endif() - if (ARG_LABELS) - set(ARG_LABELS "${ARG_LABELS}") - else() - set(ARG_LABELS unittest) - endif() - if (ARG_SOURCES) set(SOURCES ${ARG_SOURCES}) else() @@ -458,10 +445,6 @@ function(ADD_TEST_CASE REL_TEST_NAME) add_dependencies(${TEST_NAME} ${ARG_EXTRA_DEPENDENCIES}) endif() - foreach (TEST_LABEL ${ARG_LABELS}) - add_dependencies(${TEST_LABEL} ${TEST_NAME}) - endforeach() - if (ARROW_TEST_MEMCHECK AND NOT ARG_NO_VALGRIND) SET_PROPERTY(TARGET ${TEST_NAME} APPEND_STRING PROPERTY @@ -477,6 +460,18 @@ function(ADD_TEST_CASE REL_TEST_NAME) ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) endif() + # Add test as dependency of relevant targets + add_dependencies(all-tests ${TEST_NAME}) + foreach (TARGET ${ARG_LABELS}) + add_dependencies(${TARGET} ${TEST_NAME}) + endforeach() + + if (ARG_LABELS) + set(ARG_LABELS "unittest;${ARG_LABELS}") + else() + set(ARG_LABELS unittest) + endif() + set_property(TEST ${TEST_NAME} APPEND PROPERTY LABELS ${ARG_LABELS}) @@ -537,7 +532,6 @@ function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) add_dependencies(${EXAMPLE_NAME} ${ARG_DEPENDENCIES}) endif() - add_test(${EXAMPLE_NAME} ${EXAMPLE_PATH}) set_tests_properties(${EXAMPLE_NAME} PROPERTIES LABELS "example") endfunction() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index c007b1c225bb9..d493de75a55f5 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # 
under the License. +add_custom_target(toolchain) + # ---------------------------------------------------------------------- # Toolchain linkage options @@ -401,7 +403,7 @@ if (ARROW_BOOST_VENDORED) ${EP_LOG_OPTIONS}) set(Boost_INCLUDE_DIR "${BOOST_PREFIX}") set(Boost_INCLUDE_DIRS "${BOOST_INCLUDE_DIR}") - add_dependencies(arrow_dependencies boost_ep) + add_dependencies(toolchain boost_ep) else() if (MSVC) # disable autolinking in boost @@ -506,15 +508,14 @@ if("${DOUBLE_CONVERSION_HOME}" STREQUAL "") CMAKE_ARGS ${DOUBLE_CONVERSION_CMAKE_ARGS} BUILD_BYPRODUCTS "${DOUBLE_CONVERSION_STATIC_LIB}") set(DOUBLE_CONVERSION_VENDORED 1) + add_dependencies(toolchain double-conversion_ep) else() find_package(double-conversion REQUIRED PATHS "${DOUBLE_CONVERSION_HOME}") set(DOUBLE_CONVERSION_VENDORED 0) endif() -if (DOUBLE_CONVERSION_VENDORED) - add_dependencies(arrow_dependencies double-conversion_ep) -else() +if (NOT DOUBLE_CONVERSION_VENDORED) get_property(DOUBLE_CONVERSION_STATIC_LIB TARGET double-conversion::double-conversion PROPERTY LOCATION) get_property(DOUBLE_CONVERSION_INCLUDE_DIR TARGET double-conversion::double-conversion @@ -532,9 +533,6 @@ message(STATUS "double-conversion static library: ${DOUBLE_CONVERSION_STATIC_LIB # ---------------------------------------------------------------------- # Google gtest & gflags -add_custom_target(unittest ctest -L unittest) -add_custom_target(benchmark ctest -L benchmark) - if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) if("${GTEST_HOME}" STREQUAL "") @@ -699,6 +697,7 @@ if (ARROW_IPC) ExternalProject_Get_Property(rapidjson_ep SOURCE_DIR) set(RAPIDJSON_INCLUDE_DIR "${SOURCE_DIR}/include") set(RAPIDJSON_VENDORED 1) + add_dependencies(toolchain rapidjson_ep) else() set(RAPIDJSON_INCLUDE_DIR "${RAPIDJSON_HOME}/include") set(RAPIDJSON_VENDORED 0) @@ -706,10 +705,6 @@ if (ARROW_IPC) message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIR}") include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) - if(RAPIDJSON_VENDORED) - add_dependencies(arrow_dependencies rapidjson_ep) - endif() - ## Flatbuffers if("${FLATBUFFERS_HOME}" STREQUAL "") set(FLATBUFFERS_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers_ep-prefix/src/flatbuffers_ep-install") @@ -733,15 +728,12 @@ if (ARROW_IPC) set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_PREFIX}/include") set(FLATBUFFERS_COMPILER "${FLATBUFFERS_PREFIX}/bin/flatc") set(FLATBUFFERS_VENDORED 1) + add_dependencies(toolchain flatbuffers_ep) else() find_package(Flatbuffers REQUIRED) set(FLATBUFFERS_VENDORED 0) endif() - if(FLATBUFFERS_VENDORED) - add_dependencies(arrow_dependencies flatbuffers_ep) - endif() - message(STATUS "Flatbuffers include dir: ${FLATBUFFERS_INCLUDE_DIR}") message(STATUS "Flatbuffers compiler: ${FLATBUFFERS_COMPILER}") include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) @@ -1155,6 +1147,7 @@ if (ARROW_GANDIVA) CMAKE_ARGS ${RE2_CMAKE_ARGS} BUILD_BYPRODUCTS "${RE2_STATIC_LIB}") set (RE2_VENDORED 1) + add_dependencies(toolchain re2_ep) else () find_package (RE2 REQUIRED) set (RE2_VENDORED 0) @@ -1171,10 +1164,6 @@ if (ARROW_GANDIVA) STATIC_LIB ${RE2_STATIC_LIB}) set(RE2_LIBRARY re2_static) endif() - - if (RE2_VENDORED) - add_dependencies (arrow_dependencies re2_ep) - endif () endif () @@ -1317,6 +1306,8 @@ if (ARROW_ORC) CMAKE_ARGS ${ORC_CMAKE_ARGS} ${EP_LOG_OPTIONS}) + add_dependencies(toolchain orc_ep) + set(ORC_VENDORED 1) add_dependencies(orc_ep ${ZLIB_LIBRARY}) if (LZ4_VENDORED) @@ -1342,7 +1333,6 @@ if (ARROW_ORC) if (ORC_VENDORED) add_dependencies(orc_static orc_ep) endif() 
- endif() # ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9291addca0e1c..8dd2ac082db0a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -15,13 +15,17 @@ # specific language governing permissions and limitations # under the License. +add_custom_target(arrow-all) add_custom_target(arrow) +add_custom_target(arrow-benchmarks) +add_custom_target(arrow-tests) +add_dependencies(arrow-all arrow arrow-tests arrow-benchmarks) # Adding unit tests part of the "arrow" portion of the test suite function(ADD_ARROW_TEST REL_TEST_NAME) set(options) set(one_value_args PREFIX) - set(multi_value_args) + set(multi_value_args LABELS) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if (ARG_PREFIX) @@ -29,9 +33,16 @@ function(ADD_ARROW_TEST REL_TEST_NAME) else() set(PREFIX "arrow") endif() + + if (ARG_LABELS) + set(LABELS ${ARG_LABELS}) + else() + set(LABELS "arrow-tests") + endif() + ADD_TEST_CASE(${REL_TEST_NAME} PREFIX ${PREFIX} - LABELS "unittest;arrow" + LABELS ${LABELS} ${ARG_UNPARSED_ARGUMENTS}) endfunction() @@ -47,7 +58,7 @@ function(ADD_ARROW_BENCHMARK REL_TEST_NAME) endif() ADD_BENCHMARK(${REL_TEST_NAME} PREFIX ${PREFIX} - LABELS "benchmark;arrow" + LABELS "arrow-benchmarks" ${ARG_UNPARSED_ARGUMENTS}) endfunction() @@ -215,6 +226,8 @@ ADD_ARROW_LIB(arrow SHARED_PRIVATE_LINK_LIBS ${ARROW_SHARED_PRIVATE_LINK_LIBS} STATIC_LINK_LIBS ${ARROW_STATIC_LINK_LIBS}) +add_dependencies(arrow ${ARROW_LIBRARIES}) + if (ARROW_BUILD_STATIC AND WIN32) target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) endif() diff --git a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt index 9fd7f924d3a69..d2640a66b2f8f 100644 --- a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt @@ -16,6 +16,7 @@ # under the License. 
add_custom_target(arrow_hiveserver2) +add_custom_target(arrow_hiveserver2-tests) # Headers: top level ARROW_INSTALL_ALL_HEADERS("arrow/dbi/hiveserver2") @@ -103,9 +104,9 @@ set(ARROW_HIVESERVER2_TEST_LINK_LIBS thriftstatic) if (ARROW_BUILD_TESTS) - ADD_ARROW_TEST(hiveserver2-test + ADD_TEST_CASE(hiveserver2-test STATIC_LINK_LIBS "${ARROW_HIVESERVER2_TEST_LINK_LIBS}" - LABELS "arrow_hiveserver2" + LABELS "arrow_hiveserver2-tests" ) if (TARGET arrow-hiveserver2-test) set_property(TARGET arrow-hiveserver2-test diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index aa56269a8953e..2feaee1160b07 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -75,7 +75,6 @@ set(ARROW_FLIGHT_SRCS ADD_ARROW_LIB(arrow_flight SOURCES ${ARROW_FLIGHT_SRCS} - DEPENDENCIES arrow_dependencies SHARED_LINK_LIBS arrow_shared ${ARROW_FLIGHT_STATIC_LINK_LIBS} STATIC_LINK_LIBS arrow_static ${ARROW_FLIGHT_STATIC_LINK_LIBS}) diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 8b69c654bb1fe..2fcdf23e42ad7 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -19,6 +19,12 @@ # arrow_cuda ####################################### +add_custom_target(arrow_cuda-all) +add_custom_target(arrow_cuda) +add_custom_target(arrow_cuda-benchmarks) +add_custom_target(arrow_cuda-tests) +add_dependencies(arrow_cuda-all arrow_cuda arrow_cuda-tests arrow_cuda-benchmarks) + if (DEFINED ENV{CUDA_HOME}) set(CUDA_TOOLKIT_ROOT_DIR "$ENV{CUDA_HOME}") endif() @@ -49,6 +55,8 @@ ADD_ARROW_LIB(arrow_cuda STATIC_LINK_LIBS ${ARROW_CUDA_SHARED_LINK_LIBS} ) +add_dependencies(arrow_cuda ${ARROW_CUDA_LIBRARIES}) + foreach(LIB_TARGET ${ARROW_CUDA_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) @@ -77,9 +85,10 @@ if (ARROW_BUILD_TESTS) endif() if (ARROW_BUILD_BENCHMARKS) - cuda_add_executable(cuda-benchmark cuda-benchmark.cc) - target_link_libraries(cuda-benchmark + cuda_add_executable(arrow-cuda-benchmark cuda-benchmark.cc) + target_link_libraries(arrow-cuda-benchmark arrow_cuda_shared gtest_static ${ARROW_BENCHMARK_LINK_LIBS}) + add_dependencies(arrow_cuda-benchmarks arrow-cuda-benchmark) endif() diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index c44f7b9fe1bfe..422e72e2edae2 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -17,7 +17,7 @@ # Targets required for protocol integration testing add_custom_target(integration) -add_dependencies(arrow integration) +add_dependencies(arrow-tests integration) ####################################### # Messaging and interprocess communication diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 98c105ae623ce..cccbf09d4fb4d 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -22,7 +22,10 @@ find_package(PythonLibsNew REQUIRED) find_package(NumPy REQUIRED) +add_custom_target(arrow_python-all) add_custom_target(arrow_python) +add_custom_target(arrow_python-tests) +add_dependencies(arrow_python-all arrow_python arrow_python-tests) set(ARROW_PYTHON_SRCS arrow_to_pandas.cc @@ -130,6 +133,6 @@ if (ARROW_BUILD_TESTS) STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}" EXTRA_LINK_LIBS ${PYTHON_LIBRARIES} EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" - LABELS "arrow_python" + LABELS "arrow_python-tests" NO_VALGRIND) endif() diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 
da0d3bba69147..8052db5e8545d 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -15,12 +15,15 @@ # specific language governing permissions and limitations # under the License. -project(gandiva) - -find_package(LLVM) - # For "make gandiva" to build everything Gandiva-related +add_custom_target(gandiva-all) add_custom_target(gandiva) +add_custom_target(gandiva-tests) +add_custom_target(gandiva-benchmarks) + +add_dependencies(gandiva-all gandiva gandiva-tests gandiva-benchmarks) + +find_package(LLVM) # Set the path where the byte-code files will be installed. set(GANDIVA_BC_INSTALL_DIR @@ -80,7 +83,7 @@ endif() ADD_ARROW_LIB(gandiva SOURCES ${SRC_FILES} OUTPUTS GANDIVA_LIBRARIES - DEPENDENCIES arrow_dependencies precompiled + DEPENDENCIES precompiled EXTRA_INCLUDES $ SHARED_LINK_LIBS arrow_shared @@ -120,7 +123,7 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) set(TEST_ARGUMENTS ENABLED PREFIX "gandiva" - LABELS "unittest;gandiva" + LABELS "gandiva-tests" ${ARG_UNPARSED_ARGUMENTS}) # and uses less disk space, but in some cases we need to force static diff --git a/cpp/src/gandiva/jni/CMakeLists.txt b/cpp/src/gandiva/jni/CMakeLists.txt index ab04f536b4dd2..a07d3903a75ac 100644 --- a/cpp/src/gandiva/jni/CMakeLists.txt +++ b/cpp/src/gandiva/jni/CMakeLists.txt @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -project(gandiva_jni) - if(CMAKE_VERSION VERSION_LESS 3.11) message(FATAL_ERROR "Building the Gandiva JNI bindings requires CMake version >= 3.11") endif() diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 0792fd6421d65..2af49084bf310 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -58,7 +58,7 @@ function(add_precompiled_unit_test REL_TEST_NAME) set(TEST_NAME "gandiva-precompiled-${TEST_NAME}") add_executable(${TEST_NAME} ${REL_TEST_NAME} ${ARGN}) - add_dependencies(gandiva ${TEST_NAME}) + add_dependencies(gandiva-tests ${TEST_NAME}) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_TEST_LINK_LIBS} ${RE2_LIBRARY} diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 995c39adb7d35..4eb8f68a2ba98 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. 
+add_custom_target(parquet-all) +add_custom_target(parquet) +add_custom_target(parquet-benchmarks) +add_custom_target(parquet-tests) +add_dependencies(parquet-all parquet parquet-tests parquet-benchmarks) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/.parquetcppversion" PARQUET_VERSION) string(REPLACE "\n" "" PARQUET_VERSION "${PARQUET_VERSION}") string(REGEX MATCH "^([0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+)?)" VERSION ${PARQUET_VERSION}) @@ -22,9 +28,6 @@ if(NOT VERSION) message(FATAL_ERROR "invalid .parquetcppversion") endif() -# For "make parquet" to build everything Parquet-related -add_custom_target(parquet) - function(ADD_PARQUET_TEST REL_TEST_NAME) set(options USE_STATIC_LINKING) set(one_value_args) @@ -34,19 +37,21 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() + set(TEST_ARGUMENTS + PREFIX "parquet" + LABELS "parquet-tests") + # By default we prefer shared linking with libparquet, as it's faster # and uses less disk space, but in some cases we need to force static # linking (see rationale below). if (ARG_USE_STATIC_LINKING) ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_STATIC_TEST_LINK_LIBS} - PREFIX "parquet" - LABELS "unittest;parquet") + ${TEST_ARGUMENTS}) else() ADD_TEST_CASE(${REL_TEST_NAME} STATIC_LINK_LIBS ${PARQUET_SHARED_TEST_LINK_LIBS} - PREFIX "parquet" - LABELS "unittest;parquet") + ${TEST_ARGUMENTS}) endif() endfunction() @@ -217,6 +222,8 @@ ADD_ARROW_LIB(parquet STATIC_LINK_LIBS ${PARQUET_STATIC_LINK_LIBS} ) +add_dependencies(parquet ${PARQUET_LIBRARIES}) + # Thrift requires these definitions for some types that we use foreach(LIB_TARGET ${PARQUET_LIBRARIES}) target_compile_definitions(${LIB_TARGET} @@ -232,8 +239,6 @@ foreach(LIB_TARGET ${PARQUET_LIBRARIES}) endif() endforeach() -add_dependencies(parquet ${PARQUET_LIBRARIES}) - add_subdirectory(api) add_subdirectory(arrow) add_subdirectory(util) @@ -271,7 +276,9 @@ ADD_PARQUET_TEST(schema-test USE_STATIC_LINKING) ADD_ARROW_BENCHMARK(column-io-benchmark PREFIX "parquet" + LABELS "parquet-benchmarks" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) ADD_ARROW_BENCHMARK(encoding-benchmark PREFIX "parquet" + LABELS "parquet-benchmarks" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 89afc39a23376..f4e4f7e0b975a 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -20,9 +20,7 @@ ADD_PARQUET_TEST(arrow-reader-writer-test) ADD_BENCHMARK(reader-writer-benchmark PREFIX "parquet-arrow" + LABELS "parquet-benchmarks" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) -if (TARGET parquet-arrow-reader-writer-benchmark) - add_dependencies(parquet parquet-arrow-reader-writer-benchmark) -endif() ARROW_INSTALL_ALL_HEADERS("parquet/arrow") diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 83c201d0f45a0..d9c7dcaedeac3 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -15,7 +15,11 @@ # specific language governing permissions and limitations # under the License. 
+add_custom_target(plasma-all) add_custom_target(plasma) +add_custom_target(plasma-benchmarks) +add_custom_target(plasma-tests) +add_dependencies(plasma-all plasma plasma-tests plasma-benchmarks) # For the moment, Plasma is versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") @@ -199,7 +203,7 @@ function(ADD_PLASMA_TEST REL_TEST_NAME) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) ADD_TEST_CASE(${REL_TEST_NAME} PREFIX "plasma" - LABELS "unittest;plasma" + LABELS "plasma-tests" ${ARG_UNPARSED_ARGUMENTS}) endfunction() From e9ed591db9cb87e0086bf9fef4201cc726bd5d03 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Mon, 17 Dec 2018 15:35:38 -0600 Subject: [PATCH 22/80] ARROW-4028: [Rust] Merge parquet-rs codebase This imports parquet-rs source code into Apache Arrow Rust implementation. I include most of the source code except a few things such as `fuzz` and benchmarks. Thinking about adding them later. The module hierarchy now looks like: - arrow: all the arrow code - parquet: all the parquet code (in future, parquet-arrow integration will live here) - util: common util libraries shared between arrow and parquet (I'll try to move the utils from parquet to here in future). Author: Chao Sun Author: Chao Sun Closes #3050 from sunchao/import-parquet and squashes the following commits: 2ce98bd2a Update git submodule 2d296f8f7 ARROW-4028: Merge parquet-rs codebase --- ci/rust-build-main.bat | 3 + ci/travis_script_rust.sh | 2 + cpp/submodules/parquet-testing | 2 +- docker-compose.yml | 2 + rust/Cargo.toml | 12 + rust/benches/array_from_vec.rs | 1 - rust/benches/builder.rs | 6 +- rust/build.rs | 43 + rust/examples/read_csv.rs | 5 +- rust/rustfmt.toml | 18 + rust/src/array.rs | 8 +- rust/src/array_data.rs | 3 +- rust/src/builder.rs | 6 +- rust/src/csv/reader.rs | 10 +- rust/src/lib.rs | 6 + rust/src/mod.rs | 28 + rust/src/parquet/basic.rs | 1497 ++++++++++ rust/src/parquet/column/mod.rs | 124 + rust/src/parquet/column/page.rs | 296 ++ rust/src/parquet/column/reader.rs | 1576 ++++++++++ rust/src/parquet/column/writer.rs | 1617 +++++++++++ rust/src/parquet/compression.rs | 321 +++ rust/src/parquet/data_type.rs | 463 +++ rust/src/parquet/encodings/decoding.rs | 1403 +++++++++ rust/src/parquet/encodings/encoding.rs | 1360 +++++++++ rust/src/parquet/encodings/levels.rs | 529 ++++ rust/src/parquet/encodings/mod.rs | 21 + rust/src/parquet/encodings/rle.rs | 839 ++++++ rust/src/parquet/errors.rs | 87 + rust/src/parquet/file/metadata.rs | 736 +++++ rust/src/parquet/file/mod.rs | 88 + rust/src/parquet/file/properties.rs | 648 +++++ rust/src/parquet/file/reader.rs | 899 ++++++ rust/src/parquet/file/statistics.rs | 692 +++++ rust/src/parquet/file/writer.rs | 936 ++++++ rust/src/parquet/mod.rs | 34 + rust/src/parquet/record/api.rs | 1439 ++++++++++ rust/src/parquet/record/mod.rs | 24 + rust/src/parquet/record/reader.rs | 1464 ++++++++++ rust/src/parquet/record/triplet.rs | 561 ++++ rust/src/parquet/schema/mod.rs | 66 + rust/src/parquet/schema/parser.rs | 764 +++++ rust/src/parquet/schema/printer.rs | 467 +++ rust/src/parquet/schema/types.rs | 1830 ++++++++++++ rust/src/parquet/util/bit_packing.rs | 3658 ++++++++++++++++++++++++ rust/src/parquet/util/bit_util.rs | 1058 +++++++ rust/src/parquet/util/hash_util.rs | 160 ++ rust/src/parquet/util/io.rs | 220 ++ rust/src/parquet/util/memory.rs | 524 ++++ rust/src/parquet/util/mod.rs | 26 + rust/src/parquet/util/test_common.rs | 190 ++ rust/src/record_batch.rs | 4 +- rust/src/tensor.rs | 1 + 53 files changed, 26757 
insertions(+), 20 deletions(-) create mode 100644 rust/build.rs create mode 100644 rust/rustfmt.toml create mode 100644 rust/src/mod.rs create mode 100644 rust/src/parquet/basic.rs create mode 100644 rust/src/parquet/column/mod.rs create mode 100644 rust/src/parquet/column/page.rs create mode 100644 rust/src/parquet/column/reader.rs create mode 100644 rust/src/parquet/column/writer.rs create mode 100644 rust/src/parquet/compression.rs create mode 100644 rust/src/parquet/data_type.rs create mode 100644 rust/src/parquet/encodings/decoding.rs create mode 100644 rust/src/parquet/encodings/encoding.rs create mode 100644 rust/src/parquet/encodings/levels.rs create mode 100644 rust/src/parquet/encodings/mod.rs create mode 100644 rust/src/parquet/encodings/rle.rs create mode 100644 rust/src/parquet/errors.rs create mode 100644 rust/src/parquet/file/metadata.rs create mode 100644 rust/src/parquet/file/mod.rs create mode 100644 rust/src/parquet/file/properties.rs create mode 100644 rust/src/parquet/file/reader.rs create mode 100644 rust/src/parquet/file/statistics.rs create mode 100644 rust/src/parquet/file/writer.rs create mode 100644 rust/src/parquet/mod.rs create mode 100644 rust/src/parquet/record/api.rs create mode 100644 rust/src/parquet/record/mod.rs create mode 100644 rust/src/parquet/record/reader.rs create mode 100644 rust/src/parquet/record/triplet.rs create mode 100644 rust/src/parquet/schema/mod.rs create mode 100644 rust/src/parquet/schema/parser.rs create mode 100644 rust/src/parquet/schema/printer.rs create mode 100644 rust/src/parquet/schema/types.rs create mode 100644 rust/src/parquet/util/bit_packing.rs create mode 100644 rust/src/parquet/util/bit_util.rs create mode 100644 rust/src/parquet/util/hash_util.rs create mode 100644 rust/src/parquet/util/io.rs create mode 100644 rust/src/parquet/util/memory.rs create mode 100644 rust/src/parquet/util/mod.rs create mode 100644 rust/src/parquet/util/test_common.rs diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat index c8a51fef6ec46..e338f7e172e6e 100644 --- a/ci/rust-build-main.bat +++ b/ci/rust-build-main.bat @@ -17,6 +17,9 @@ @rem The "main" Rust build script for Windows CI +@rem Retrieve git submodules, configure env var for Parquet unit tests +git submodule update --init || exit /B +set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data pushd rust @echo =================================== diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 55cce8f354e44..4b09bc22e4c20 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -19,6 +19,8 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh + RUST_DIR=${TRAVIS_BUILD_DIR}/rust pushd $RUST_DIR diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 46ae2605c2de3..92a8e6c2efdce 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 46ae2605c2de306f5740587107dcf333a527f2d1 +Subproject commit 92a8e6c2efdce1925c605d6313994db2c94478fb diff --git a/docker-compose.yml b/docker-compose.yml index 0a01a7cbe97bf..b61511ee56dea 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -152,6 +152,8 @@ services: build: context: . 
dockerfile: rust/Dockerfile + environment: + PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data volumes: *ubuntu-volumes r: diff --git a/rust/Cargo.toml b/rust/Cargo.toml index aa23815f74085..49e8a9d9c8470 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -42,10 +42,22 @@ serde_derive = "1.0.80" serde_json = "1.0.13" rand = "0.5" csv = "1.0.0" +parquet-format = "2.5.0" +quick-error = "1.2.2" +byteorder = "1" +thrift = "0.0.4" +snap = "0.2" +brotli = "2.5" +flate2 = "1.0.2" +lz4 = "1.23" +zstd = "0.4" +chrono = "0.4" +num-bigint = "0.2" num = "0.2" [dev-dependencies] criterion = "0.2" +lazy_static = "1" [[bench]] name = "array_from_vec" diff --git a/rust/benches/array_from_vec.rs b/rust/benches/array_from_vec.rs index 669b88eaa40d9..f9357140922a6 100644 --- a/rust/benches/array_from_vec.rs +++ b/rust/benches/array_from_vec.rs @@ -17,7 +17,6 @@ #[macro_use] extern crate criterion; - use criterion::Criterion; extern crate arrow; diff --git a/rust/benches/builder.rs b/rust/benches/builder.rs index 04f8a33b5bd55..90fd75a0da390 100644 --- a/rust/benches/builder.rs +++ b/rust/benches/builder.rs @@ -19,11 +19,13 @@ extern crate arrow; extern crate criterion; extern crate rand; -use arrow::builder::*; +use std::mem::size_of; + use criterion::*; use rand::distributions::Standard; use rand::{thread_rng, Rng}; -use std::mem::size_of; + +use arrow::builder::*; // Build arrays with 512k elements. const BATCH_SIZE: usize = 8 << 10; diff --git a/rust/build.rs b/rust/build.rs new file mode 100644 index 0000000000000..b42b2a4babfec --- /dev/null +++ b/rust/build.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::process::Command; + +fn main() { + // Set Parquet version, build hash and "created by" string. + let version = env!("CARGO_PKG_VERSION"); + let mut created_by = format!("parquet-rs version {}", version); + if let Ok(git_hash) = run(Command::new("git").arg("rev-parse").arg("HEAD")) { + created_by.push_str(format!(" (build {})", git_hash).as_str()); + println!("cargo:rustc-env=PARQUET_BUILD={}", git_hash); + } + println!("cargo:rustc-env=PARQUET_VERSION={}", version); + println!("cargo:rustc-env=PARQUET_CREATED_BY={}", created_by); +} + +/// Runs command and returns either content of stdout for successful execution, +/// or an error message otherwise. 
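The `cargo:rustc-env=...` lines above make `PARQUET_VERSION`, `PARQUET_BUILD` and `PARQUET_CREATED_BY` visible to rustc when the crate itself is compiled. As a rough editorial sketch (not part of this patch), code inside that crate could read the values back with the standard `env!`/`option_env!` macros; `option_env!` is used for `PARQUET_BUILD` because the build script only emits it when the `git rev-parse` lookup succeeds, and the function name here is illustrative:

```rust
// Sketch only: assumes it lives in the crate whose build script (above)
// emitted these compile-time environment variables.
fn print_parquet_build_info() {
    // Always set by the build script.
    println!("version:    {}", env!("PARQUET_VERSION"));
    println!("created by: {}", env!("PARQUET_CREATED_BY"));
    // Only set when the git hash could be resolved at build time.
    if let Some(build) = option_env!("PARQUET_BUILD") {
        println!("build:      {}", build);
    }
}
```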
+fn run(command: &mut Command) -> Result { + println!("Running: `{:?}`", command); + match command.output() { + Ok(ref output) if output.status.success() => { + Ok(String::from_utf8_lossy(&output.stdout).trim().to_string()) + } + Ok(ref output) => Err(format!("Failed: `{:?}` ({})", command, output.status)), + Err(error) => Err(format!("Failed: `{:?}` ({})", command, error)), + } +} diff --git a/rust/examples/read_csv.rs b/rust/examples/read_csv.rs index df66a8112e5f2..147d2f9c23845 100644 --- a/rust/examples/read_csv.rs +++ b/rust/examples/read_csv.rs @@ -17,11 +17,12 @@ extern crate arrow; +use std::fs::File; +use std::sync::Arc; + use arrow::array::{BinaryArray, Float64Array}; use arrow::csv; use arrow::datatypes::{DataType, Field, Schema}; -use std::fs::File; -use std::sync::Arc; fn main() { let schema = Schema::new(vec![ diff --git a/rust/rustfmt.toml b/rust/rustfmt.toml new file mode 100644 index 0000000000000..72eeee0af1c53 --- /dev/null +++ b/rust/rustfmt.toml @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +format_doc_comments = true \ No newline at end of file diff --git a/rust/src/array.rs b/rust/src/array.rs index 11e732a1267ea..251dd35eea150 100644 --- a/rust/src/array.rs +++ b/rust/src/array.rs @@ -657,12 +657,14 @@ impl From> for StructArray { #[cfg(test)] mod tests { use super::*; + + use std::sync::Arc; + use std::thread; + use crate::array_data::ArrayData; use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field, ToByteSlice}; + use crate::datatypes::{DataType, Field}; use crate::memory; - use std::sync::Arc; - use std::thread; #[test] fn test_primitive_array_from_vec() { diff --git a/rust/src/array_data.rs b/rust/src/array_data.rs index 36a817ee579a0..9ea01a402a9cb 100644 --- a/rust/src/array_data.rs +++ b/rust/src/array_data.rs @@ -225,9 +225,10 @@ impl ArrayDataBuilder { #[cfg(test)] mod tests { + use super::*; + use std::sync::Arc; - use super::{ArrayData, DataType}; use crate::buffer::Buffer; use crate::util::bit_util; diff --git a/rust/src/builder.rs b/rust/src/builder.rs index fc781ffa50641..d5d222d006fe8 100644 --- a/rust/src/builder.rs +++ b/rust/src/builder.rs @@ -456,10 +456,10 @@ impl BinaryArrayBuilder { #[cfg(test)] mod tests { - use crate::array::Array; - use super::*; + use crate::array::Array; + #[test] fn test_builder_i32_empty() { let b = Int32BufferBuilder::new(5); @@ -825,7 +825,6 @@ mod tests { #[test] fn test_binary_array_builder() { - use crate::array::BinaryArray; let mut builder = BinaryArrayBuilder::new(20); builder.push(b'h').unwrap(); @@ -860,7 +859,6 @@ mod tests { #[test] fn test_binary_array_builder_push_string() { - use crate::array::BinaryArray; let mut builder = BinaryArrayBuilder::new(20); let var = "hello".to_owned(); diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs index 956408e4a40c3..632aa7ae7936d 100644 --- a/rust/src/csv/reader.rs +++ b/rust/src/csv/reader.rs @@ -29,16 +29,16 @@ //! use std::sync::Arc; //! //! let schema = Schema::new(vec![ -//! Field::new("city", DataType::Utf8, false), -//! Field::new("lat", DataType::Float64, false), -//! Field::new("lng", DataType::Float64, false), +//! Field::new("city", DataType::Utf8, false), +//! Field::new("lat", DataType::Float64, false), +//! Field::new("lng", DataType::Float64, false), //! ]); //! //! let file = File::open("test/data/uk_cities.csv").unwrap(); //! //! let mut csv = csv::Reader::new(file, Arc::new(schema), false, 1024, None); //! let batch = csv.next().unwrap().unwrap(); -//!``` +//! ``` use std::fs::File; use std::io::BufReader; @@ -195,8 +195,8 @@ impl Reader { #[cfg(test)] mod tests { - use super::*; + use crate::array::*; use crate::datatypes::Field; diff --git a/rust/src/lib.rs b/rust/src/lib.rs index f41d08f1427a6..d5708b10504c4 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -15,7 +15,12 @@ // specific language governing permissions and limitations // under the License. +#![feature(type_ascription)] +#![feature(rustc_private)] #![feature(specialization)] +#![feature(try_from)] +#![allow(dead_code)] +#![allow(non_camel_case_types)] pub mod array; pub mod array_data; @@ -27,6 +32,7 @@ pub mod csv; pub mod datatypes; pub mod error; pub mod memory; +pub mod parquet; pub mod record_batch; pub mod tensor; pub mod util; diff --git a/rust/src/mod.rs b/rust/src/mod.rs new file mode 100644 index 0000000000000..b9fa43ab8184b --- /dev/null +++ b/rust/src/mod.rs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod array; +pub mod array_data; +pub mod bitmap; +pub mod buffer; +pub mod builder; +pub mod csv; +pub mod datatypes; +pub mod error; +pub mod memory; +pub mod record_batch; +pub mod tensor; diff --git a/rust/src/parquet/basic.rs b/rust/src/parquet/basic.rs new file mode 100644 index 0000000000000..22e16347dc00f --- /dev/null +++ b/rust/src/parquet/basic.rs @@ -0,0 +1,1497 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains Rust mappings for Thrift definition. +//! Refer to `parquet.thrift` file to see raw definitions. + +use std::{convert, fmt, result, str}; + +use parquet_format as parquet; + +use crate::parquet::errors::ParquetError; + +// ---------------------------------------------------------------------- +// Types from the Thrift definition + +// ---------------------------------------------------------------------- +// Mirrors `parquet::Type` + +/// Types supported by Parquet. +/// These physical types are intended to be used in combination with the encodings to +/// control the on disk storage format. +/// For example INT16 is not included as a type since a good encoding of INT32 +/// would handle this. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Type { + BOOLEAN, + INT32, + INT64, + INT96, + FLOAT, + DOUBLE, + BYTE_ARRAY, + FIXED_LEN_BYTE_ARRAY, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::ConvertedType` + +/// Common types (logical types) used by frameworks when using Parquet. +/// This helps map between types in those frameworks to the base types in Parquet. +/// This is only metadata and not needed to read or write the data. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum LogicalType { + NONE, + /// A BYTE_ARRAY actually contains UTF8 encoded chars. + UTF8, + + /// A map is converted as an optional field containing a repeated key/value pair. + MAP, + + /// A key/value pair is converted into a group of two fields. + MAP_KEY_VALUE, + + /// A list is converted into an optional field containing a repeated field for its + /// values. 
+ LIST, + + /// An enum is converted into a binary field + ENUM, + + /// A decimal value. + /// This may be used to annotate binary or fixed primitive types. The + /// underlying byte array stores the unscaled value encoded as two's + /// complement using big-endian byte order (the most significant byte is the + /// zeroth element). + /// + /// This must be accompanied by a (maximum) precision and a scale in the + /// SchemaElement. The precision specifies the number of digits in the decimal + /// and the scale stores the location of the decimal point. For example 1.23 + /// would have precision 3 (3 total digits) and scale 2 (the decimal point is + /// 2 digits over). + DECIMAL, + + /// A date stored as days since Unix epoch, encoded as the INT32 physical type. + DATE, + + /// The total number of milliseconds since midnight. The value is stored as an INT32 + /// physical type. + TIME_MILLIS, + + /// The total number of microseconds since midnight. The value is stored as an INT64 + /// physical type. + TIME_MICROS, + + /// Date and time recorded as milliseconds since the Unix epoch. + /// Recorded as a physical type of INT64. + TIMESTAMP_MILLIS, + + /// Date and time recorded as microseconds since the Unix epoch. + /// The value is stored as an INT64 physical type. + TIMESTAMP_MICROS, + + /// An unsigned 8 bit integer value stored as INT32 physical type. + UINT_8, + + /// An unsigned 16 bit integer value stored as INT32 physical type. + UINT_16, + + /// An unsigned 32 bit integer value stored as INT32 physical type. + UINT_32, + + /// An unsigned 64 bit integer value stored as INT64 physical type. + UINT_64, + + /// A signed 8 bit integer value stored as INT32 physical type. + INT_8, + + /// A signed 16 bit integer value stored as INT32 physical type. + INT_16, + + /// A signed 32 bit integer value stored as INT32 physical type. + INT_32, + + /// A signed 64 bit integer value stored as INT64 physical type. + INT_64, + + /// A JSON document embedded within a single UTF8 column. + JSON, + + /// A BSON document embedded within a single BINARY column. + BSON, + + /// An interval of time. + /// + /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12. + /// This data is composed of three separate little endian unsigned integers. + /// Each stores a component of a duration of time. The first integer identifies + /// the number of months associated with the duration, the second identifies + /// the number of days associated with the duration and the third identifies + /// the number of milliseconds associated with the provided duration. + /// This duration of time is independent of any particular timezone or date. + INTERVAL, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::FieldRepetitionType` + +/// Representation of field types in schema. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Repetition { + /// Field is required (can not be null) and each record has exactly 1 value. + REQUIRED, + /// Field is optional (can be null) and each record has 0 or 1 values. + OPTIONAL, + /// Field is repeated and can contain 0 or more values. + REPEATED, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::Encoding` + +/// Encodings supported by Parquet. +/// Not all encodings are valid for all types. These enums are also used to specify the +/// encoding of definition and repetition levels. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum Encoding { + /// Default byte encoding. + /// - BOOLEAN - 1 bit per value, 0 is false; 1 is true. + /// - INT32 - 4 bytes per value, stored as little-endian. + /// - INT64 - 8 bytes per value, stored as little-endian. + /// - FLOAT - 4 bytes per value, stored as little-endian. + /// - DOUBLE - 8 bytes per value, stored as little-endian. + /// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + /// - FIXED_LEN_BYTE_ARRAY - just the bytes are stored. + PLAIN, + + /// **Deprecated** dictionary encoding. + /// + /// The values in the dictionary are encoded using PLAIN encoding. + /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and PLAIN + /// encoding is used for dictionary page. + PLAIN_DICTIONARY, + + /// Group packed run length encoding. + /// + /// Usable for definition/repetition levels encoding and boolean values. + RLE, + + /// Bit packed encoding. + /// + /// This can only be used if the data has a known max width. + /// Usable for definition/repetition levels encoding. + BIT_PACKED, + + /// Delta encoding for integers, either INT32 or INT64. + /// + /// Works best on sorted data. + DELTA_BINARY_PACKED, + + /// Encoding for byte arrays to separate the length values and the data. + /// + /// The lengths are encoded using DELTA_BINARY_PACKED encoding. + DELTA_LENGTH_BYTE_ARRAY, + + /// Incremental encoding for byte arrays. + /// + /// Prefix lengths are encoded using DELTA_BINARY_PACKED encoding. + /// Suffixes are stored using DELTA_LENGTH_BYTE_ARRAY encoding. + DELTA_BYTE_ARRAY, + + /// Dictionary encoding. + /// + /// The ids are encoded using the RLE encoding. + RLE_DICTIONARY, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::CompressionCodec` + +/// Supported compression algorithms. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Compression { + UNCOMPRESSED, + SNAPPY, + GZIP, + LZO, + BROTLI, + LZ4, + ZSTD, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::PageType` + +/// Available data pages for Parquet file format. +/// Note that some of the page types may not be supported. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum PageType { + DATA_PAGE, + INDEX_PAGE, + DICTIONARY_PAGE, + DATA_PAGE_V2, +} + +// ---------------------------------------------------------------------- +// Mirrors `parquet::ColumnOrder` + +/// Sort order for page and column statistics. +/// +/// Types are associated with sort orders and column stats are aggregated using a sort +/// order, and a sort order should be considered when comparing values with statistics +/// min/max. +/// +/// See reference in +/// https://github.com/apache/parquet-cpp/blob/master/src/parquet/types.h +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum SortOrder { + /// Signed (either value or legacy byte-wise) comparison. + SIGNED, + /// Unsigned (depending on physical type either value or byte-wise) comparison. + UNSIGNED, + /// Comparison is undefined. + UNDEFINED, +} + +/// Column order that specifies what method was used to aggregate min/max values for +/// statistics. +/// +/// If column order is undefined, then it is the legacy behaviour and all values should +/// be compared as signed values/bytes. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum ColumnOrder { + /// Column uses the order defined by its logical or physical type + /// (if there is no logical type), parquet-format 2.4.0+. 
+ TYPE_DEFINED_ORDER(SortOrder), + /// Undefined column order, means legacy behaviour before parquet-format 2.4.0. + /// Sort order is always SIGNED. + UNDEFINED, +} + +impl ColumnOrder { + /// Returns sort order for a physical/logical type. + pub fn get_sort_order(logical_type: LogicalType, physical_type: Type) -> SortOrder { + match logical_type { + // Unsigned byte-wise comparison. + LogicalType::UTF8 | LogicalType::JSON | LogicalType::BSON | LogicalType::ENUM => { + SortOrder::UNSIGNED + } + + LogicalType::INT_8 + | LogicalType::INT_16 + | LogicalType::INT_32 + | LogicalType::INT_64 => SortOrder::SIGNED, + + LogicalType::UINT_8 + | LogicalType::UINT_16 + | LogicalType::UINT_32 + | LogicalType::UINT_64 => SortOrder::UNSIGNED, + + // Signed comparison of the represented value. + LogicalType::DECIMAL => SortOrder::SIGNED, + + LogicalType::DATE => SortOrder::SIGNED, + + LogicalType::TIME_MILLIS + | LogicalType::TIME_MICROS + | LogicalType::TIMESTAMP_MILLIS + | LogicalType::TIMESTAMP_MICROS => SortOrder::SIGNED, + + LogicalType::INTERVAL => SortOrder::UNSIGNED, + + LogicalType::LIST | LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + SortOrder::UNDEFINED + } + + // Fall back to physical type. + LogicalType::NONE => Self::get_default_sort_order(physical_type), + } + } + + /// Returns default sort order based on physical type. + fn get_default_sort_order(physical_type: Type) -> SortOrder { + match physical_type { + // Order: false, true + Type::BOOLEAN => SortOrder::UNSIGNED, + Type::INT32 | Type::INT64 => SortOrder::SIGNED, + Type::INT96 => SortOrder::UNDEFINED, + // Notes to remember when comparing float/double values: + // If the min is a NaN, it should be ignored. + // If the max is a NaN, it should be ignored. + // If the min is +0, the row group may contain -0 values as well. + // If the max is -0, the row group may contain +0 values as well. + // When looking for NaN values, min and max should be ignored. + Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED, + // unsigned byte-wise comparison + Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED, + } + } + + /// Returns sort order associated with this column order. 
+ pub fn sort_order(&self) -> SortOrder { + match *self { + ColumnOrder::TYPE_DEFINED_ORDER(order) => order, + ColumnOrder::UNDEFINED => SortOrder::SIGNED, + } + } +} + +impl fmt::Display for Type { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for LogicalType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for Repetition { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for Encoding { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for Compression { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for PageType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for SortOrder { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl fmt::Display for ColumnOrder { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +// ---------------------------------------------------------------------- +// parquet::Type <=> Type conversion + +impl convert::From for Type { + fn from(value: parquet::Type) -> Self { + match value { + parquet::Type::BOOLEAN => Type::BOOLEAN, + parquet::Type::INT32 => Type::INT32, + parquet::Type::INT64 => Type::INT64, + parquet::Type::INT96 => Type::INT96, + parquet::Type::FLOAT => Type::FLOAT, + parquet::Type::DOUBLE => Type::DOUBLE, + parquet::Type::BYTE_ARRAY => Type::BYTE_ARRAY, + parquet::Type::FIXED_LEN_BYTE_ARRAY => Type::FIXED_LEN_BYTE_ARRAY, + } + } +} + +impl convert::From for parquet::Type { + fn from(value: Type) -> Self { + match value { + Type::BOOLEAN => parquet::Type::BOOLEAN, + Type::INT32 => parquet::Type::INT32, + Type::INT64 => parquet::Type::INT64, + Type::INT96 => parquet::Type::INT96, + Type::FLOAT => parquet::Type::FLOAT, + Type::DOUBLE => parquet::Type::DOUBLE, + Type::BYTE_ARRAY => parquet::Type::BYTE_ARRAY, + Type::FIXED_LEN_BYTE_ARRAY => parquet::Type::FIXED_LEN_BYTE_ARRAY, + } + } +} + +// ---------------------------------------------------------------------- +// parquet::ConvertedType <=> LogicalType conversion + +impl convert::From> for LogicalType { + fn from(option: Option) -> Self { + match option { + None => LogicalType::NONE, + Some(value) => match value { + parquet::ConvertedType::UTF8 => LogicalType::UTF8, + parquet::ConvertedType::MAP => LogicalType::MAP, + parquet::ConvertedType::MAP_KEY_VALUE => LogicalType::MAP_KEY_VALUE, + parquet::ConvertedType::LIST => LogicalType::LIST, + parquet::ConvertedType::ENUM => LogicalType::ENUM, + parquet::ConvertedType::DECIMAL => LogicalType::DECIMAL, + parquet::ConvertedType::DATE => LogicalType::DATE, + parquet::ConvertedType::TIME_MILLIS => LogicalType::TIME_MILLIS, + parquet::ConvertedType::TIME_MICROS => LogicalType::TIME_MICROS, + parquet::ConvertedType::TIMESTAMP_MILLIS => LogicalType::TIMESTAMP_MILLIS, + parquet::ConvertedType::TIMESTAMP_MICROS => LogicalType::TIMESTAMP_MICROS, + parquet::ConvertedType::UINT_8 => LogicalType::UINT_8, + parquet::ConvertedType::UINT_16 => LogicalType::UINT_16, + parquet::ConvertedType::UINT_32 => LogicalType::UINT_32, + parquet::ConvertedType::UINT_64 => LogicalType::UINT_64, + parquet::ConvertedType::INT_8 => LogicalType::INT_8, + parquet::ConvertedType::INT_16 => LogicalType::INT_16, + 
parquet::ConvertedType::INT_32 => LogicalType::INT_32, + parquet::ConvertedType::INT_64 => LogicalType::INT_64, + parquet::ConvertedType::JSON => LogicalType::JSON, + parquet::ConvertedType::BSON => LogicalType::BSON, + parquet::ConvertedType::INTERVAL => LogicalType::INTERVAL, + }, + } + } +} + +impl convert::From for Option { + fn from(value: LogicalType) -> Self { + match value { + LogicalType::NONE => None, + LogicalType::UTF8 => Some(parquet::ConvertedType::UTF8), + LogicalType::MAP => Some(parquet::ConvertedType::MAP), + LogicalType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MAP_KEY_VALUE), + LogicalType::LIST => Some(parquet::ConvertedType::LIST), + LogicalType::ENUM => Some(parquet::ConvertedType::ENUM), + LogicalType::DECIMAL => Some(parquet::ConvertedType::DECIMAL), + LogicalType::DATE => Some(parquet::ConvertedType::DATE), + LogicalType::TIME_MILLIS => Some(parquet::ConvertedType::TIME_MILLIS), + LogicalType::TIME_MICROS => Some(parquet::ConvertedType::TIME_MICROS), + LogicalType::TIMESTAMP_MILLIS => Some(parquet::ConvertedType::TIMESTAMP_MILLIS), + LogicalType::TIMESTAMP_MICROS => Some(parquet::ConvertedType::TIMESTAMP_MICROS), + LogicalType::UINT_8 => Some(parquet::ConvertedType::UINT_8), + LogicalType::UINT_16 => Some(parquet::ConvertedType::UINT_16), + LogicalType::UINT_32 => Some(parquet::ConvertedType::UINT_32), + LogicalType::UINT_64 => Some(parquet::ConvertedType::UINT_64), + LogicalType::INT_8 => Some(parquet::ConvertedType::INT_8), + LogicalType::INT_16 => Some(parquet::ConvertedType::INT_16), + LogicalType::INT_32 => Some(parquet::ConvertedType::INT_32), + LogicalType::INT_64 => Some(parquet::ConvertedType::INT_64), + LogicalType::JSON => Some(parquet::ConvertedType::JSON), + LogicalType::BSON => Some(parquet::ConvertedType::BSON), + LogicalType::INTERVAL => Some(parquet::ConvertedType::INTERVAL), + } + } +} + +// ---------------------------------------------------------------------- +// parquet::FieldRepetitionType <=> Repetition conversion + +impl convert::From for Repetition { + fn from(value: parquet::FieldRepetitionType) -> Self { + match value { + parquet::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, + parquet::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, + parquet::FieldRepetitionType::REPEATED => Repetition::REPEATED, + } + } +} + +impl convert::From for parquet::FieldRepetitionType { + fn from(value: Repetition) -> Self { + match value { + Repetition::REQUIRED => parquet::FieldRepetitionType::REQUIRED, + Repetition::OPTIONAL => parquet::FieldRepetitionType::OPTIONAL, + Repetition::REPEATED => parquet::FieldRepetitionType::REPEATED, + } + } +} + +// ---------------------------------------------------------------------- +// parquet::Encoding <=> Encoding conversion + +impl convert::From for Encoding { + fn from(value: parquet::Encoding) -> Self { + match value { + parquet::Encoding::PLAIN => Encoding::PLAIN, + parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY, + parquet::Encoding::RLE => Encoding::RLE, + parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED, + parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED, + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => Encoding::DELTA_LENGTH_BYTE_ARRAY, + parquet::Encoding::DELTA_BYTE_ARRAY => Encoding::DELTA_BYTE_ARRAY, + parquet::Encoding::RLE_DICTIONARY => Encoding::RLE_DICTIONARY, + } + } +} + +impl convert::From for parquet::Encoding { + fn from(value: Encoding) -> Self { + match value { + Encoding::PLAIN => parquet::Encoding::PLAIN, + 
Encoding::PLAIN_DICTIONARY => parquet::Encoding::PLAIN_DICTIONARY, + Encoding::RLE => parquet::Encoding::RLE, + Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED, + Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DELTA_BINARY_PACKED, + Encoding::DELTA_LENGTH_BYTE_ARRAY => parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + Encoding::DELTA_BYTE_ARRAY => parquet::Encoding::DELTA_BYTE_ARRAY, + Encoding::RLE_DICTIONARY => parquet::Encoding::RLE_DICTIONARY, + } + } +} + +// ---------------------------------------------------------------------- +// parquet::CompressionCodec <=> Compression conversion + +impl convert::From for Compression { + fn from(value: parquet::CompressionCodec) -> Self { + match value { + parquet::CompressionCodec::UNCOMPRESSED => Compression::UNCOMPRESSED, + parquet::CompressionCodec::SNAPPY => Compression::SNAPPY, + parquet::CompressionCodec::GZIP => Compression::GZIP, + parquet::CompressionCodec::LZO => Compression::LZO, + parquet::CompressionCodec::BROTLI => Compression::BROTLI, + parquet::CompressionCodec::LZ4 => Compression::LZ4, + parquet::CompressionCodec::ZSTD => Compression::ZSTD, + } + } +} + +impl convert::From for parquet::CompressionCodec { + fn from(value: Compression) -> Self { + match value { + Compression::UNCOMPRESSED => parquet::CompressionCodec::UNCOMPRESSED, + Compression::SNAPPY => parquet::CompressionCodec::SNAPPY, + Compression::GZIP => parquet::CompressionCodec::GZIP, + Compression::LZO => parquet::CompressionCodec::LZO, + Compression::BROTLI => parquet::CompressionCodec::BROTLI, + Compression::LZ4 => parquet::CompressionCodec::LZ4, + Compression::ZSTD => parquet::CompressionCodec::ZSTD, + } + } +} + +// ---------------------------------------------------------------------- +// parquet::PageType <=> PageType conversion + +impl convert::From for PageType { + fn from(value: parquet::PageType) -> Self { + match value { + parquet::PageType::DATA_PAGE => PageType::DATA_PAGE, + parquet::PageType::INDEX_PAGE => PageType::INDEX_PAGE, + parquet::PageType::DICTIONARY_PAGE => PageType::DICTIONARY_PAGE, + parquet::PageType::DATA_PAGE_V2 => PageType::DATA_PAGE_V2, + } + } +} + +impl convert::From for parquet::PageType { + fn from(value: PageType) -> Self { + match value { + PageType::DATA_PAGE => parquet::PageType::DATA_PAGE, + PageType::INDEX_PAGE => parquet::PageType::INDEX_PAGE, + PageType::DICTIONARY_PAGE => parquet::PageType::DICTIONARY_PAGE, + PageType::DATA_PAGE_V2 => parquet::PageType::DATA_PAGE_V2, + } + } +} + +// ---------------------------------------------------------------------- +// String conversions for schema parsing. 
+ +impl str::FromStr for Repetition { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result { + match s { + "REQUIRED" => Ok(Repetition::REQUIRED), + "OPTIONAL" => Ok(Repetition::OPTIONAL), + "REPEATED" => Ok(Repetition::REPEATED), + other => Err(general_err!("Invalid repetition {}", other)), + } + } +} + +impl str::FromStr for Type { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result { + match s { + "BOOLEAN" => Ok(Type::BOOLEAN), + "INT32" => Ok(Type::INT32), + "INT64" => Ok(Type::INT64), + "INT96" => Ok(Type::INT96), + "FLOAT" => Ok(Type::FLOAT), + "DOUBLE" => Ok(Type::DOUBLE), + "BYTE_ARRAY" | "BINARY" => Ok(Type::BYTE_ARRAY), + "FIXED_LEN_BYTE_ARRAY" => Ok(Type::FIXED_LEN_BYTE_ARRAY), + other => Err(general_err!("Invalid type {}", other)), + } + } +} + +impl str::FromStr for LogicalType { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result { + match s { + "NONE" => Ok(LogicalType::NONE), + "UTF8" => Ok(LogicalType::UTF8), + "MAP" => Ok(LogicalType::MAP), + "MAP_KEY_VALUE" => Ok(LogicalType::MAP_KEY_VALUE), + "LIST" => Ok(LogicalType::LIST), + "ENUM" => Ok(LogicalType::ENUM), + "DECIMAL" => Ok(LogicalType::DECIMAL), + "DATE" => Ok(LogicalType::DATE), + "TIME_MILLIS" => Ok(LogicalType::TIME_MILLIS), + "TIME_MICROS" => Ok(LogicalType::TIME_MICROS), + "TIMESTAMP_MILLIS" => Ok(LogicalType::TIMESTAMP_MILLIS), + "TIMESTAMP_MICROS" => Ok(LogicalType::TIMESTAMP_MICROS), + "UINT_8" => Ok(LogicalType::UINT_8), + "UINT_16" => Ok(LogicalType::UINT_16), + "UINT_32" => Ok(LogicalType::UINT_32), + "UINT_64" => Ok(LogicalType::UINT_64), + "INT_8" => Ok(LogicalType::INT_8), + "INT_16" => Ok(LogicalType::INT_16), + "INT_32" => Ok(LogicalType::INT_32), + "INT_64" => Ok(LogicalType::INT_64), + "JSON" => Ok(LogicalType::JSON), + "BSON" => Ok(LogicalType::BSON), + "INTERVAL" => Ok(LogicalType::INTERVAL), + other => Err(general_err!("Invalid logical type {}", other)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_display_type() { + assert_eq!(Type::BOOLEAN.to_string(), "BOOLEAN"); + assert_eq!(Type::INT32.to_string(), "INT32"); + assert_eq!(Type::INT64.to_string(), "INT64"); + assert_eq!(Type::INT96.to_string(), "INT96"); + assert_eq!(Type::FLOAT.to_string(), "FLOAT"); + assert_eq!(Type::DOUBLE.to_string(), "DOUBLE"); + assert_eq!(Type::BYTE_ARRAY.to_string(), "BYTE_ARRAY"); + assert_eq!( + Type::FIXED_LEN_BYTE_ARRAY.to_string(), + "FIXED_LEN_BYTE_ARRAY" + ); + } + + #[test] + fn test_from_type() { + assert_eq!(Type::from(parquet::Type::BOOLEAN), Type::BOOLEAN); + assert_eq!(Type::from(parquet::Type::INT32), Type::INT32); + assert_eq!(Type::from(parquet::Type::INT64), Type::INT64); + assert_eq!(Type::from(parquet::Type::INT96), Type::INT96); + assert_eq!(Type::from(parquet::Type::FLOAT), Type::FLOAT); + assert_eq!(Type::from(parquet::Type::DOUBLE), Type::DOUBLE); + assert_eq!(Type::from(parquet::Type::BYTE_ARRAY), Type::BYTE_ARRAY); + assert_eq!( + Type::from(parquet::Type::FIXED_LEN_BYTE_ARRAY), + Type::FIXED_LEN_BYTE_ARRAY + ); + } + + #[test] + fn test_into_type() { + assert_eq!(parquet::Type::BOOLEAN, Type::BOOLEAN.into()); + assert_eq!(parquet::Type::INT32, Type::INT32.into()); + assert_eq!(parquet::Type::INT64, Type::INT64.into()); + assert_eq!(parquet::Type::INT96, Type::INT96.into()); + assert_eq!(parquet::Type::FLOAT, Type::FLOAT.into()); + assert_eq!(parquet::Type::DOUBLE, Type::DOUBLE.into()); + assert_eq!(parquet::Type::BYTE_ARRAY, Type::BYTE_ARRAY.into()); + assert_eq!( + parquet::Type::FIXED_LEN_BYTE_ARRAY, + 
Type::FIXED_LEN_BYTE_ARRAY.into() + ); + } + + #[test] + fn test_from_string_into_type() { + assert_eq!( + Type::BOOLEAN.to_string().parse::().unwrap(), + Type::BOOLEAN + ); + assert_eq!( + Type::INT32.to_string().parse::().unwrap(), + Type::INT32 + ); + assert_eq!( + Type::INT64.to_string().parse::().unwrap(), + Type::INT64 + ); + assert_eq!( + Type::INT96.to_string().parse::().unwrap(), + Type::INT96 + ); + assert_eq!( + Type::FLOAT.to_string().parse::().unwrap(), + Type::FLOAT + ); + assert_eq!( + Type::DOUBLE.to_string().parse::().unwrap(), + Type::DOUBLE + ); + assert_eq!( + Type::BYTE_ARRAY.to_string().parse::().unwrap(), + Type::BYTE_ARRAY + ); + assert_eq!("BINARY".parse::().unwrap(), Type::BYTE_ARRAY); + assert_eq!( + Type::FIXED_LEN_BYTE_ARRAY + .to_string() + .parse::() + .unwrap(), + Type::FIXED_LEN_BYTE_ARRAY + ); + } + + #[test] + fn test_display_logical_type() { + assert_eq!(LogicalType::NONE.to_string(), "NONE"); + assert_eq!(LogicalType::UTF8.to_string(), "UTF8"); + assert_eq!(LogicalType::MAP.to_string(), "MAP"); + assert_eq!(LogicalType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE"); + assert_eq!(LogicalType::LIST.to_string(), "LIST"); + assert_eq!(LogicalType::ENUM.to_string(), "ENUM"); + assert_eq!(LogicalType::DECIMAL.to_string(), "DECIMAL"); + assert_eq!(LogicalType::DATE.to_string(), "DATE"); + assert_eq!(LogicalType::TIME_MILLIS.to_string(), "TIME_MILLIS"); + assert_eq!(LogicalType::DATE.to_string(), "DATE"); + assert_eq!(LogicalType::TIME_MICROS.to_string(), "TIME_MICROS"); + assert_eq!( + LogicalType::TIMESTAMP_MILLIS.to_string(), + "TIMESTAMP_MILLIS" + ); + assert_eq!( + LogicalType::TIMESTAMP_MICROS.to_string(), + "TIMESTAMP_MICROS" + ); + assert_eq!(LogicalType::UINT_8.to_string(), "UINT_8"); + assert_eq!(LogicalType::UINT_16.to_string(), "UINT_16"); + assert_eq!(LogicalType::UINT_32.to_string(), "UINT_32"); + assert_eq!(LogicalType::UINT_64.to_string(), "UINT_64"); + assert_eq!(LogicalType::INT_8.to_string(), "INT_8"); + assert_eq!(LogicalType::INT_16.to_string(), "INT_16"); + assert_eq!(LogicalType::INT_32.to_string(), "INT_32"); + assert_eq!(LogicalType::INT_64.to_string(), "INT_64"); + assert_eq!(LogicalType::JSON.to_string(), "JSON"); + assert_eq!(LogicalType::BSON.to_string(), "BSON"); + assert_eq!(LogicalType::INTERVAL.to_string(), "INTERVAL"); + } + + #[test] + fn test_from_logical_type() { + assert_eq!(LogicalType::from(None), LogicalType::NONE); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UTF8)), + LogicalType::UTF8 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::MAP)), + LogicalType::MAP + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::MAP_KEY_VALUE)), + LogicalType::MAP_KEY_VALUE + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::LIST)), + LogicalType::LIST + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::ENUM)), + LogicalType::ENUM + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::DECIMAL)), + LogicalType::DECIMAL + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::DATE)), + LogicalType::DATE + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::TIME_MILLIS)), + LogicalType::TIME_MILLIS + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::TIME_MICROS)), + LogicalType::TIME_MICROS + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::TIMESTAMP_MILLIS)), + LogicalType::TIMESTAMP_MILLIS + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::TIMESTAMP_MICROS)), + 
LogicalType::TIMESTAMP_MICROS + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UINT_8)), + LogicalType::UINT_8 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UINT_16)), + LogicalType::UINT_16 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UINT_32)), + LogicalType::UINT_32 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::UINT_64)), + LogicalType::UINT_64 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INT_8)), + LogicalType::INT_8 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INT_16)), + LogicalType::INT_16 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INT_32)), + LogicalType::INT_32 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INT_64)), + LogicalType::INT_64 + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::JSON)), + LogicalType::JSON + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::BSON)), + LogicalType::BSON + ); + assert_eq!( + LogicalType::from(Some(parquet::ConvertedType::INTERVAL)), + LogicalType::INTERVAL + ); + } + + #[test] + fn test_into_logical_type() { + let converted_type: Option = None; + assert_eq!(converted_type, LogicalType::NONE.into()); + assert_eq!(Some(parquet::ConvertedType::UTF8), LogicalType::UTF8.into()); + assert_eq!(Some(parquet::ConvertedType::MAP), LogicalType::MAP.into()); + assert_eq!( + Some(parquet::ConvertedType::MAP_KEY_VALUE), + LogicalType::MAP_KEY_VALUE.into() + ); + assert_eq!(Some(parquet::ConvertedType::LIST), LogicalType::LIST.into()); + assert_eq!(Some(parquet::ConvertedType::ENUM), LogicalType::ENUM.into()); + assert_eq!( + Some(parquet::ConvertedType::DECIMAL), + LogicalType::DECIMAL.into() + ); + assert_eq!(Some(parquet::ConvertedType::DATE), LogicalType::DATE.into()); + assert_eq!( + Some(parquet::ConvertedType::TIME_MILLIS), + LogicalType::TIME_MILLIS.into() + ); + assert_eq!( + Some(parquet::ConvertedType::TIME_MICROS), + LogicalType::TIME_MICROS.into() + ); + assert_eq!( + Some(parquet::ConvertedType::TIMESTAMP_MILLIS), + LogicalType::TIMESTAMP_MILLIS.into() + ); + assert_eq!( + Some(parquet::ConvertedType::TIMESTAMP_MICROS), + LogicalType::TIMESTAMP_MICROS.into() + ); + assert_eq!( + Some(parquet::ConvertedType::UINT_8), + LogicalType::UINT_8.into() + ); + assert_eq!( + Some(parquet::ConvertedType::UINT_16), + LogicalType::UINT_16.into() + ); + assert_eq!( + Some(parquet::ConvertedType::UINT_32), + LogicalType::UINT_32.into() + ); + assert_eq!( + Some(parquet::ConvertedType::UINT_64), + LogicalType::UINT_64.into() + ); + assert_eq!( + Some(parquet::ConvertedType::INT_8), + LogicalType::INT_8.into() + ); + assert_eq!( + Some(parquet::ConvertedType::INT_16), + LogicalType::INT_16.into() + ); + assert_eq!( + Some(parquet::ConvertedType::INT_32), + LogicalType::INT_32.into() + ); + assert_eq!( + Some(parquet::ConvertedType::INT_64), + LogicalType::INT_64.into() + ); + assert_eq!(Some(parquet::ConvertedType::JSON), LogicalType::JSON.into()); + assert_eq!(Some(parquet::ConvertedType::BSON), LogicalType::BSON.into()); + assert_eq!( + Some(parquet::ConvertedType::INTERVAL), + LogicalType::INTERVAL.into() + ); + } + + #[test] + fn test_from_string_into_logical_type() { + assert_eq!( + LogicalType::NONE + .to_string() + .parse::() + .unwrap(), + LogicalType::NONE + ); + assert_eq!( + LogicalType::UTF8 + .to_string() + .parse::() + .unwrap(), + LogicalType::UTF8 + ); + assert_eq!( + LogicalType::MAP.to_string().parse::().unwrap(), + LogicalType::MAP 
+ ); + assert_eq!( + LogicalType::MAP_KEY_VALUE + .to_string() + .parse::() + .unwrap(), + LogicalType::MAP_KEY_VALUE + ); + assert_eq!( + LogicalType::LIST + .to_string() + .parse::() + .unwrap(), + LogicalType::LIST + ); + assert_eq!( + LogicalType::ENUM + .to_string() + .parse::() + .unwrap(), + LogicalType::ENUM + ); + assert_eq!( + LogicalType::DECIMAL + .to_string() + .parse::() + .unwrap(), + LogicalType::DECIMAL + ); + assert_eq!( + LogicalType::DATE + .to_string() + .parse::() + .unwrap(), + LogicalType::DATE + ); + assert_eq!( + LogicalType::TIME_MILLIS + .to_string() + .parse::() + .unwrap(), + LogicalType::TIME_MILLIS + ); + assert_eq!( + LogicalType::TIME_MICROS + .to_string() + .parse::() + .unwrap(), + LogicalType::TIME_MICROS + ); + assert_eq!( + LogicalType::TIMESTAMP_MILLIS + .to_string() + .parse::() + .unwrap(), + LogicalType::TIMESTAMP_MILLIS + ); + assert_eq!( + LogicalType::TIMESTAMP_MICROS + .to_string() + .parse::() + .unwrap(), + LogicalType::TIMESTAMP_MICROS + ); + assert_eq!( + LogicalType::UINT_8 + .to_string() + .parse::() + .unwrap(), + LogicalType::UINT_8 + ); + assert_eq!( + LogicalType::UINT_16 + .to_string() + .parse::() + .unwrap(), + LogicalType::UINT_16 + ); + assert_eq!( + LogicalType::UINT_32 + .to_string() + .parse::() + .unwrap(), + LogicalType::UINT_32 + ); + assert_eq!( + LogicalType::UINT_64 + .to_string() + .parse::() + .unwrap(), + LogicalType::UINT_64 + ); + assert_eq!( + LogicalType::INT_8 + .to_string() + .parse::() + .unwrap(), + LogicalType::INT_8 + ); + assert_eq!( + LogicalType::INT_16 + .to_string() + .parse::() + .unwrap(), + LogicalType::INT_16 + ); + assert_eq!( + LogicalType::INT_32 + .to_string() + .parse::() + .unwrap(), + LogicalType::INT_32 + ); + assert_eq!( + LogicalType::INT_64 + .to_string() + .parse::() + .unwrap(), + LogicalType::INT_64 + ); + assert_eq!( + LogicalType::JSON + .to_string() + .parse::() + .unwrap(), + LogicalType::JSON + ); + assert_eq!( + LogicalType::BSON + .to_string() + .parse::() + .unwrap(), + LogicalType::BSON + ); + assert_eq!( + LogicalType::INTERVAL + .to_string() + .parse::() + .unwrap(), + LogicalType::INTERVAL + ); + } + + #[test] + fn test_display_repetition() { + assert_eq!(Repetition::REQUIRED.to_string(), "REQUIRED"); + assert_eq!(Repetition::OPTIONAL.to_string(), "OPTIONAL"); + assert_eq!(Repetition::REPEATED.to_string(), "REPEATED"); + } + + #[test] + fn test_from_repetition() { + assert_eq!( + Repetition::from(parquet::FieldRepetitionType::REQUIRED), + Repetition::REQUIRED + ); + assert_eq!( + Repetition::from(parquet::FieldRepetitionType::OPTIONAL), + Repetition::OPTIONAL + ); + assert_eq!( + Repetition::from(parquet::FieldRepetitionType::REPEATED), + Repetition::REPEATED + ); + } + + #[test] + fn test_into_repetition() { + assert_eq!( + parquet::FieldRepetitionType::REQUIRED, + Repetition::REQUIRED.into() + ); + assert_eq!( + parquet::FieldRepetitionType::OPTIONAL, + Repetition::OPTIONAL.into() + ); + assert_eq!( + parquet::FieldRepetitionType::REPEATED, + Repetition::REPEATED.into() + ); + } + + #[test] + fn test_from_string_into_repetition() { + assert_eq!( + Repetition::REQUIRED + .to_string() + .parse::() + .unwrap(), + Repetition::REQUIRED + ); + assert_eq!( + Repetition::OPTIONAL + .to_string() + .parse::() + .unwrap(), + Repetition::OPTIONAL + ); + assert_eq!( + Repetition::REPEATED + .to_string() + .parse::() + .unwrap(), + Repetition::REPEATED + ); + } + + #[test] + fn test_display_encoding() { + assert_eq!(Encoding::PLAIN.to_string(), "PLAIN"); + 
assert_eq!(Encoding::PLAIN_DICTIONARY.to_string(), "PLAIN_DICTIONARY"); + assert_eq!(Encoding::RLE.to_string(), "RLE"); + assert_eq!(Encoding::BIT_PACKED.to_string(), "BIT_PACKED"); + assert_eq!( + Encoding::DELTA_BINARY_PACKED.to_string(), + "DELTA_BINARY_PACKED" + ); + assert_eq!( + Encoding::DELTA_LENGTH_BYTE_ARRAY.to_string(), + "DELTA_LENGTH_BYTE_ARRAY" + ); + assert_eq!(Encoding::DELTA_BYTE_ARRAY.to_string(), "DELTA_BYTE_ARRAY"); + assert_eq!(Encoding::RLE_DICTIONARY.to_string(), "RLE_DICTIONARY"); + } + + #[test] + fn test_from_encoding() { + assert_eq!(Encoding::from(parquet::Encoding::PLAIN), Encoding::PLAIN); + assert_eq!( + Encoding::from(parquet::Encoding::PLAIN_DICTIONARY), + Encoding::PLAIN_DICTIONARY + ); + assert_eq!(Encoding::from(parquet::Encoding::RLE), Encoding::RLE); + assert_eq!( + Encoding::from(parquet::Encoding::BIT_PACKED), + Encoding::BIT_PACKED + ); + assert_eq!( + Encoding::from(parquet::Encoding::DELTA_BINARY_PACKED), + Encoding::DELTA_BINARY_PACKED + ); + assert_eq!( + Encoding::from(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY), + Encoding::DELTA_LENGTH_BYTE_ARRAY + ); + assert_eq!( + Encoding::from(parquet::Encoding::DELTA_BYTE_ARRAY), + Encoding::DELTA_BYTE_ARRAY + ); + } + + #[test] + fn test_into_encoding() { + assert_eq!(parquet::Encoding::PLAIN, Encoding::PLAIN.into()); + assert_eq!( + parquet::Encoding::PLAIN_DICTIONARY, + Encoding::PLAIN_DICTIONARY.into() + ); + assert_eq!(parquet::Encoding::RLE, Encoding::RLE.into()); + assert_eq!(parquet::Encoding::BIT_PACKED, Encoding::BIT_PACKED.into()); + assert_eq!( + parquet::Encoding::DELTA_BINARY_PACKED, + Encoding::DELTA_BINARY_PACKED.into() + ); + assert_eq!( + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + Encoding::DELTA_LENGTH_BYTE_ARRAY.into() + ); + assert_eq!( + parquet::Encoding::DELTA_BYTE_ARRAY, + Encoding::DELTA_BYTE_ARRAY.into() + ); + } + + #[test] + fn test_display_compression() { + assert_eq!(Compression::UNCOMPRESSED.to_string(), "UNCOMPRESSED"); + assert_eq!(Compression::SNAPPY.to_string(), "SNAPPY"); + assert_eq!(Compression::GZIP.to_string(), "GZIP"); + assert_eq!(Compression::LZO.to_string(), "LZO"); + assert_eq!(Compression::BROTLI.to_string(), "BROTLI"); + assert_eq!(Compression::LZ4.to_string(), "LZ4"); + assert_eq!(Compression::ZSTD.to_string(), "ZSTD"); + } + + #[test] + fn test_from_compression() { + assert_eq!( + Compression::from(parquet::CompressionCodec::UNCOMPRESSED), + Compression::UNCOMPRESSED + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::SNAPPY), + Compression::SNAPPY + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::GZIP), + Compression::GZIP + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::LZO), + Compression::LZO + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::BROTLI), + Compression::BROTLI + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::LZ4), + Compression::LZ4 + ); + assert_eq!( + Compression::from(parquet::CompressionCodec::ZSTD), + Compression::ZSTD + ); + } + + #[test] + fn test_into_compression() { + assert_eq!( + parquet::CompressionCodec::UNCOMPRESSED, + Compression::UNCOMPRESSED.into() + ); + assert_eq!( + parquet::CompressionCodec::SNAPPY, + Compression::SNAPPY.into() + ); + assert_eq!(parquet::CompressionCodec::GZIP, Compression::GZIP.into()); + assert_eq!(parquet::CompressionCodec::LZO, Compression::LZO.into()); + assert_eq!( + parquet::CompressionCodec::BROTLI, + Compression::BROTLI.into() + ); + assert_eq!(parquet::CompressionCodec::LZ4, Compression::LZ4.into()); + 
assert_eq!(parquet::CompressionCodec::ZSTD, Compression::ZSTD.into()); + } + + #[test] + fn test_display_page_type() { + assert_eq!(PageType::DATA_PAGE.to_string(), "DATA_PAGE"); + assert_eq!(PageType::INDEX_PAGE.to_string(), "INDEX_PAGE"); + assert_eq!(PageType::DICTIONARY_PAGE.to_string(), "DICTIONARY_PAGE"); + assert_eq!(PageType::DATA_PAGE_V2.to_string(), "DATA_PAGE_V2"); + } + + #[test] + fn test_from_page_type() { + assert_eq!( + PageType::from(parquet::PageType::DATA_PAGE), + PageType::DATA_PAGE + ); + assert_eq!( + PageType::from(parquet::PageType::INDEX_PAGE), + PageType::INDEX_PAGE + ); + assert_eq!( + PageType::from(parquet::PageType::DICTIONARY_PAGE), + PageType::DICTIONARY_PAGE + ); + assert_eq!( + PageType::from(parquet::PageType::DATA_PAGE_V2), + PageType::DATA_PAGE_V2 + ); + } + + #[test] + fn test_into_page_type() { + assert_eq!(parquet::PageType::DATA_PAGE, PageType::DATA_PAGE.into()); + assert_eq!(parquet::PageType::INDEX_PAGE, PageType::INDEX_PAGE.into()); + assert_eq!( + parquet::PageType::DICTIONARY_PAGE, + PageType::DICTIONARY_PAGE.into() + ); + assert_eq!( + parquet::PageType::DATA_PAGE_V2, + PageType::DATA_PAGE_V2.into() + ); + } + + #[test] + fn test_display_sort_order() { + assert_eq!(SortOrder::SIGNED.to_string(), "SIGNED"); + assert_eq!(SortOrder::UNSIGNED.to_string(), "UNSIGNED"); + assert_eq!(SortOrder::UNDEFINED.to_string(), "UNDEFINED"); + } + + #[test] + fn test_display_column_order() { + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).to_string(), + "TYPE_DEFINED_ORDER(SIGNED)" + ); + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).to_string(), + "TYPE_DEFINED_ORDER(UNSIGNED)" + ); + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).to_string(), + "TYPE_DEFINED_ORDER(UNDEFINED)" + ); + assert_eq!(ColumnOrder::UNDEFINED.to_string(), "UNDEFINED"); + } + + #[test] + fn test_column_order_get_sort_order() { + // Helper to check the order in a list of values. + // Only logical type is checked. + fn check_sort_order(types: Vec, expected_order: SortOrder) { + for tpe in types { + assert_eq!( + ColumnOrder::get_sort_order(tpe, Type::BYTE_ARRAY), + expected_order + ); + } + } + + // Unsigned comparison (physical type does not matter) + let unsigned = vec![ + LogicalType::UTF8, + LogicalType::JSON, + LogicalType::BSON, + LogicalType::ENUM, + LogicalType::UINT_8, + LogicalType::UINT_16, + LogicalType::UINT_32, + LogicalType::UINT_64, + LogicalType::INTERVAL, + ]; + check_sort_order(unsigned, SortOrder::UNSIGNED); + + // Signed comparison (physical type does not matter) + let signed = vec![ + LogicalType::INT_8, + LogicalType::INT_16, + LogicalType::INT_32, + LogicalType::INT_64, + LogicalType::DECIMAL, + LogicalType::DATE, + LogicalType::TIME_MILLIS, + LogicalType::TIME_MICROS, + LogicalType::TIMESTAMP_MILLIS, + LogicalType::TIMESTAMP_MICROS, + ]; + check_sort_order(signed, SortOrder::SIGNED); + + // Undefined comparison + let undefined = vec![ + LogicalType::LIST, + LogicalType::MAP, + LogicalType::MAP_KEY_VALUE, + ]; + check_sort_order(undefined, SortOrder::UNDEFINED); + + // Check None logical type + // This should return a sort order for byte array type. 
+ check_sort_order(vec![LogicalType::NONE], SortOrder::UNSIGNED); + } + + #[test] + fn test_column_order_get_default_sort_order() { + // Comparison based on physical type + assert_eq!( + ColumnOrder::get_default_sort_order(Type::BOOLEAN), + SortOrder::UNSIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::INT32), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::INT64), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::INT96), + SortOrder::UNDEFINED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::FLOAT), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::DOUBLE), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::BYTE_ARRAY), + SortOrder::UNSIGNED + ); + assert_eq!( + ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY), + SortOrder::UNSIGNED + ); + } + + #[test] + fn test_column_order_sort_order() { + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED).sort_order(), + SortOrder::SIGNED + ); + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED).sort_order(), + SortOrder::UNSIGNED + ); + assert_eq!( + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED).sort_order(), + SortOrder::UNDEFINED + ); + assert_eq!(ColumnOrder::UNDEFINED.sort_order(), SortOrder::SIGNED); + } +} diff --git a/rust/src/parquet/column/mod.rs b/rust/src/parquet/column/mod.rs new file mode 100644 index 0000000000000..09c4bde51f771 --- /dev/null +++ b/rust/src/parquet/column/mod.rs @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Low level column reader and writer APIs. +//! +//! This API is designed for reading and writing column values, definition and repetition +//! levels directly. +//! +//! # Example of writing and reading data +//! +//! Data has the following format: +//! ```text +//! +---------------+ +//! | values| +//! +---------------+ +//! |[1, 2] | +//! |[3, null, null]| +//! +---------------+ +//! ``` +//! +//! The example uses column writer and reader APIs to write raw values, definition and +//! repetition levels and read them to verify write/read correctness. +//! +//! ```rust +//! use std::{fs, path::Path, rc::Rc}; +//! +//! use arrow::parquet::{ +//! column::{reader::ColumnReader, writer::ColumnWriter}, +//! file::{ +//! properties::WriterProperties, +//! reader::{FileReader, SerializedFileReader}, +//! writer::{FileWriter, SerializedFileWriter}, +//! }, +//! schema::parser::parse_message_type, +//! }; +//! +//! let path = Path::new("target/debug/examples/column_sample.parquet"); +//! +//! // Writing data using column writer API. +//! +//! let message_type = " +//! message schema { +//! optional group values (LIST) { +//! 
repeated group list { +//! optional INT32 element; +//! } +//! } +//! } +//! "; +//! let schema = Rc::new(parse_message_type(message_type).unwrap()); +//! let props = Rc::new(WriterProperties::builder().build()); +//! let file = fs::File::create(path).unwrap(); +//! let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); +//! let mut row_group_writer = writer.next_row_group().unwrap(); +//! while let Some(mut col_writer) = row_group_writer.next_column().unwrap() { +//! match col_writer { +//! // You can also use `get_typed_column_writer` method to extract typed writer. +//! ColumnWriter::Int32ColumnWriter(ref mut typed_writer) => { +//! typed_writer +//! .write_batch(&[1, 2, 3], Some(&[3, 3, 3, 2, 2]), Some(&[0, 1, 0, 1, 1])) +//! .unwrap(); +//! } +//! _ => {} +//! } +//! row_group_writer.close_column(col_writer).unwrap(); +//! } +//! writer.close_row_group(row_group_writer).unwrap(); +//! writer.close().unwrap(); +//! +//! // Reading data using column reader API. +//! +//! let file = fs::File::open(path).unwrap(); +//! let reader = SerializedFileReader::new(file).unwrap(); +//! let metadata = reader.metadata(); +//! +//! let mut res = Ok((0, 0)); +//! let mut values = vec![0; 8]; +//! let mut def_levels = vec![0; 8]; +//! let mut rep_levels = vec![0; 8]; +//! +//! for i in 0..metadata.num_row_groups() { +//! let row_group_reader = reader.get_row_group(i).unwrap(); +//! let row_group_metadata = metadata.row_group(i); +//! +//! for j in 0..row_group_metadata.num_columns() { +//! let mut column_reader = row_group_reader.get_column_reader(j).unwrap(); +//! match column_reader { +//! // You can also use `get_typed_column_reader` method to extract typed reader. +//! ColumnReader::Int32ColumnReader(ref mut typed_reader) => { +//! res = typed_reader.read_batch( +//! 8, // batch size +//! Some(&mut def_levels), +//! Some(&mut rep_levels), +//! &mut values, +//! ); +//! } +//! _ => {} +//! } +//! } +//! } +//! +//! assert_eq!(res, Ok((3, 5))); +//! assert_eq!(values, vec![1, 2, 3, 0, 0, 0, 0, 0]); +//! assert_eq!(def_levels, vec![3, 3, 3, 2, 2, 0, 0, 0]); +//! assert_eq!(rep_levels, vec![0, 1, 0, 1, 1, 0, 0, 0]); +//! ``` + +pub mod page; +pub mod reader; +pub mod writer; diff --git a/rust/src/parquet/column/page.rs b/rust/src/parquet/column/page.rs new file mode 100644 index 0000000000000..115037cba0bd5 --- /dev/null +++ b/rust/src/parquet/column/page.rs @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains Parquet Page definitions and page reader interface. 
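To make the relationship between the `Page` enum and the `PageReader` trait defined in this file more concrete, here is a minimal sketch that drains a page reader and tallies values. It is an illustration only, not part of the patch: the `count_values` helper is hypothetical, and the `Box<PageReader>` is assumed to be obtained elsewhere, however the file reader exposes it.

    use crate::parquet::column::page::PageReader;
    use crate::parquet::errors::Result;

    // Illustrative helper: iterate pages until the reader is exhausted.
    fn count_values(mut pages: Box<PageReader>) -> Result<u32> {
        let mut total = 0;
        while let Some(page) = pages.get_next_page()? {
            // `page_type`, `encoding` and `num_values` are the accessors on `Page` below.
            println!("{} page, {} encoded", page.page_type(), page.encoding());
            total += page.num_values();
        }
        Ok(total)
    }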
+ +use crate::parquet::basic::{Encoding, PageType}; +use crate::parquet::errors::Result; +use crate::parquet::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; +use crate::parquet::util::memory::ByteBufferPtr; + +/// Parquet Page definition. +/// +/// List of supported pages. +/// These are 1-to-1 mapped from the equivalent Thrift definitions, except `buf` which +/// used to store uncompressed bytes of the page. +pub enum Page { + DataPage { + buf: ByteBufferPtr, + num_values: u32, + encoding: Encoding, + def_level_encoding: Encoding, + rep_level_encoding: Encoding, + statistics: Option, + }, + DataPageV2 { + buf: ByteBufferPtr, + num_values: u32, + encoding: Encoding, + num_nulls: u32, + num_rows: u32, + def_levels_byte_len: u32, + rep_levels_byte_len: u32, + is_compressed: bool, + statistics: Option, + }, + DictionaryPage { + buf: ByteBufferPtr, + num_values: u32, + encoding: Encoding, + is_sorted: bool, + }, +} + +impl Page { + /// Returns [`PageType`](`::basic::PageType`) for this page. + pub fn page_type(&self) -> PageType { + match self { + &Page::DataPage { .. } => PageType::DATA_PAGE, + &Page::DataPageV2 { .. } => PageType::DATA_PAGE_V2, + &Page::DictionaryPage { .. } => PageType::DICTIONARY_PAGE, + } + } + + /// Returns internal byte buffer reference for this page. + pub fn buffer(&self) -> &ByteBufferPtr { + match self { + &Page::DataPage { ref buf, .. } => &buf, + &Page::DataPageV2 { ref buf, .. } => &buf, + &Page::DictionaryPage { ref buf, .. } => &buf, + } + } + + /// Returns number of values in this page. + pub fn num_values(&self) -> u32 { + match self { + &Page::DataPage { num_values, .. } => num_values, + &Page::DataPageV2 { num_values, .. } => num_values, + &Page::DictionaryPage { num_values, .. } => num_values, + } + } + + /// Returns this page [`Encoding`](`::basic::Encoding`). + pub fn encoding(&self) -> Encoding { + match self { + &Page::DataPage { encoding, .. } => encoding, + &Page::DataPageV2 { encoding, .. } => encoding, + &Page::DictionaryPage { encoding, .. } => encoding, + } + } + + /// Returns optional [`Statistics`](`::file::metadata::Statistics`). + pub fn statistics(&self) -> Option<&Statistics> { + match self { + &Page::DataPage { ref statistics, .. } => statistics.as_ref(), + &Page::DataPageV2 { ref statistics, .. } => statistics.as_ref(), + &Page::DictionaryPage { .. } => None, + } + } +} + +/// Helper struct to represent pages with potentially compressed buffer (data page v1) or +/// compressed and concatenated buffer (def levels + rep levels + compressed values for +/// data page v2). +/// +/// The difference with `Page` is that `Page` buffer is always uncompressed. +pub struct CompressedPage { + compressed_page: Page, + uncompressed_size: usize, +} + +impl CompressedPage { + /// Creates `CompressedPage` from a page with potentially compressed buffer and + /// uncompressed size. + pub fn new(compressed_page: Page, uncompressed_size: usize) -> Self { + Self { + compressed_page, + uncompressed_size, + } + } + + /// Returns page type. + pub fn page_type(&self) -> PageType { + self.compressed_page.page_type() + } + + /// Returns underlying page with potentially compressed buffer. + pub fn compressed_page(&self) -> &Page { + &self.compressed_page + } + + /// Returns uncompressed size in bytes. + pub fn uncompressed_size(&self) -> usize { + self.uncompressed_size + } + + /// Returns compressed size in bytes. + /// + /// Note that it is assumed that buffer is compressed, but it may not be. 
In this
+    /// case compressed size will be equal to uncompressed size.
+    pub fn compressed_size(&self) -> usize {
+        self.compressed_page.buffer().len()
+    }
+
+    /// Number of values in page.
+    pub fn num_values(&self) -> u32 {
+        self.compressed_page.num_values()
+    }
+
+    /// Returns encoding for values in page.
+    pub fn encoding(&self) -> Encoding {
+        self.compressed_page.encoding()
+    }
+
+    /// Returns slice of compressed buffer in the page.
+    pub fn data(&self) -> &[u8] {
+        self.compressed_page.buffer().data()
+    }
+}
+
+/// Contains page write metrics.
+pub struct PageWriteSpec {
+    pub page_type: PageType,
+    pub uncompressed_size: usize,
+    pub compressed_size: usize,
+    pub num_values: u32,
+    pub offset: u64,
+    pub bytes_written: u64,
+}
+
+impl PageWriteSpec {
+    /// Creates new spec with default page write metrics.
+    pub fn new() -> Self {
+        Self {
+            page_type: PageType::DATA_PAGE,
+            uncompressed_size: 0,
+            compressed_size: 0,
+            num_values: 0,
+            offset: 0,
+            bytes_written: 0,
+        }
+    }
+}
+
+/// API for reading pages from a column chunk.
+/// This offers an iterator-like API to get the next page.
+pub trait PageReader {
+    /// Gets the next page in the column chunk associated with this reader.
+    /// Returns `None` if there are no pages left.
+    fn get_next_page(&mut self) -> Result<Option<Page>>;
+}
+
+/// API for writing pages in a column chunk.
+///
+/// It is reasonable to assume that all pages will be written in the correct order, e.g.
+/// dictionary page followed by data pages, or a set of data pages, etc.
+pub trait PageWriter {
+    /// Writes a page into the output stream/sink.
+    /// Returns `PageWriteSpec` that contains information about written page metrics,
+    /// including number of bytes, size, number of values, offset, etc.
+    ///
+    /// This method is called for every compressed page we write into underlying buffer,
+    /// either data page or dictionary page.
+    fn write_page(&mut self, page: CompressedPage) -> Result<PageWriteSpec>;
+
+    /// Writes column chunk metadata into the output stream/sink.
+    ///
+    /// This method is called once before page writer is closed, normally when writes are
+    /// finalised in column writer.
+    fn write_metadata(&mut self, metadata: &ColumnChunkMetaData) -> Result<()>;
+
+    /// Closes resources and flushes underlying sink.
+    /// Page writer should not be used after this method is called.
+ fn close(&mut self) -> Result<()>; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_page() { + let data_page = Page::DataPage { + buf: ByteBufferPtr::new(vec![0, 1, 2]), + num_values: 10, + encoding: Encoding::PLAIN, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + }; + assert_eq!(data_page.page_type(), PageType::DATA_PAGE); + assert_eq!(data_page.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(data_page.num_values(), 10); + assert_eq!(data_page.encoding(), Encoding::PLAIN); + assert_eq!( + data_page.statistics(), + Some(&Statistics::int32(Some(1), Some(2), None, 1, true)) + ); + + let data_page_v2 = Page::DataPageV2 { + buf: ByteBufferPtr::new(vec![0, 1, 2]), + num_values: 10, + encoding: Encoding::PLAIN, + num_nulls: 5, + num_rows: 20, + def_levels_byte_len: 30, + rep_levels_byte_len: 40, + is_compressed: false, + statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + }; + assert_eq!(data_page_v2.page_type(), PageType::DATA_PAGE_V2); + assert_eq!(data_page_v2.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(data_page_v2.num_values(), 10); + assert_eq!(data_page_v2.encoding(), Encoding::PLAIN); + assert_eq!( + data_page_v2.statistics(), + Some(&Statistics::int32(Some(1), Some(2), None, 1, true)) + ); + + let dict_page = Page::DictionaryPage { + buf: ByteBufferPtr::new(vec![0, 1, 2]), + num_values: 10, + encoding: Encoding::PLAIN, + is_sorted: false, + }; + assert_eq!(dict_page.page_type(), PageType::DICTIONARY_PAGE); + assert_eq!(dict_page.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(dict_page.num_values(), 10); + assert_eq!(dict_page.encoding(), Encoding::PLAIN); + assert_eq!(dict_page.statistics(), None); + } + + #[test] + fn test_compressed_page() { + let data_page = Page::DataPage { + buf: ByteBufferPtr::new(vec![0, 1, 2]), + num_values: 10, + encoding: Encoding::PLAIN, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), + }; + + let cpage = CompressedPage::new(data_page, 5); + + assert_eq!(cpage.page_type(), PageType::DATA_PAGE); + assert_eq!(cpage.uncompressed_size(), 5); + assert_eq!(cpage.compressed_size(), 3); + assert_eq!(cpage.num_values(), 10); + assert_eq!(cpage.encoding(), Encoding::PLAIN); + assert_eq!(cpage.data(), &[0, 1, 2]); + } +} diff --git a/rust/src/parquet/column/reader.rs b/rust/src/parquet/column/reader.rs new file mode 100644 index 0000000000000..f3dde31ab9a14 --- /dev/null +++ b/rust/src/parquet/column/reader.rs @@ -0,0 +1,1576 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains column reader API. 
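The module-level example in `column/mod.rs` above already shows a full write/read round trip. As a smaller sketch of only the reader side (an illustration, not part of the patch: the `read_all_int32` helper is hypothetical, and `col_descr`/`pages` are assumed to come from file/row-group metadata), the API defined below could be used roughly like this for a required, non-repeated INT32 column:

    use crate::parquet::column::page::PageReader;
    use crate::parquet::column::reader::{get_column_reader, get_typed_column_reader};
    use crate::parquet::data_type::Int32Type;
    use crate::parquet::errors::Result;
    use crate::parquet::schema::types::ColumnDescPtr;

    // Illustrative helper: drain one INT32 column through the typed reader.
    fn read_all_int32(col_descr: ColumnDescPtr, pages: Box<PageReader>) -> Result<Vec<i32>> {
        let reader = get_column_reader(col_descr, pages);
        // Safe only because the physical type is known to be INT32; see the NOTE on
        // `get_typed_column_reader`.
        let mut typed = get_typed_column_reader::<Int32Type>(reader);
        let mut buf = vec![0i32; 1024];
        let mut out = Vec::new();
        loop {
            // The column is required and non-repeated, so no def/rep level slices are passed.
            let (read, _levels) = typed.read_batch(1024, None, None, &mut buf)?;
            if read == 0 {
                break;
            }
            out.extend_from_slice(&buf[..read]);
        }
        Ok(out)
    }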
+
+use std::{
+    cmp::{max, min},
+    collections::HashMap,
+    mem,
+};
+
+use super::page::{Page, PageReader};
+use crate::parquet::basic::*;
+use crate::parquet::data_type::*;
+use crate::parquet::encodings::{
+    decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder},
+    levels::LevelDecoder,
+};
+use crate::parquet::errors::{ParquetError, Result};
+use crate::parquet::schema::types::ColumnDescPtr;
+use crate::parquet::util::memory::ByteBufferPtr;
+
+/// Column reader for a Parquet type.
+pub enum ColumnReader {
+    BoolColumnReader(ColumnReaderImpl<BoolType>),
+    Int32ColumnReader(ColumnReaderImpl<Int32Type>),
+    Int64ColumnReader(ColumnReaderImpl<Int64Type>),
+    Int96ColumnReader(ColumnReaderImpl<Int96Type>),
+    FloatColumnReader(ColumnReaderImpl<FloatType>),
+    DoubleColumnReader(ColumnReaderImpl<DoubleType>),
+    ByteArrayColumnReader(ColumnReaderImpl<ByteArrayType>),
+    FixedLenByteArrayColumnReader(ColumnReaderImpl<FixedLenByteArrayType>),
+}
+
+/// Gets a specific column reader corresponding to column descriptor `col_descr`. The
+/// column reader will read from pages in `col_page_reader`.
+pub fn get_column_reader(
+    col_descr: ColumnDescPtr,
+    col_page_reader: Box<PageReader>,
+) -> ColumnReader {
+    match col_descr.physical_type() {
+        Type::BOOLEAN => {
+            ColumnReader::BoolColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader))
+        }
+        Type::INT32 => {
+            ColumnReader::Int32ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader))
+        }
+        Type::INT64 => {
+            ColumnReader::Int64ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader))
+        }
+        Type::INT96 => {
+            ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader))
+        }
+        Type::FLOAT => {
+            ColumnReader::FloatColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader))
+        }
+        Type::DOUBLE => {
+            ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader))
+        }
+        Type::BYTE_ARRAY => {
+            ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader))
+        }
+        Type::FIXED_LEN_BYTE_ARRAY => ColumnReader::FixedLenByteArrayColumnReader(
+            ColumnReaderImpl::new(col_descr, col_page_reader),
+        ),
+    }
+}
+
+/// Gets a typed column reader for the specific type `T`, by "up-casting" `col_reader` of
+/// non-generic type to a generic column reader type `ColumnReaderImpl<T>`.
+///
+/// NOTE: the caller MUST guarantee that the actual enum value for `col_reader` matches
+/// the type `T`. Otherwise, disastrous consequence could happen.
+pub fn get_typed_column_reader<T: DataType>(col_reader: ColumnReader) -> ColumnReaderImpl<T> {
+    match col_reader {
+        ColumnReader::BoolColumnReader(r) => unsafe { mem::transmute(r) },
+        ColumnReader::Int32ColumnReader(r) => unsafe { mem::transmute(r) },
+        ColumnReader::Int64ColumnReader(r) => unsafe { mem::transmute(r) },
+        ColumnReader::Int96ColumnReader(r) => unsafe { mem::transmute(r) },
+        ColumnReader::FloatColumnReader(r) => unsafe { mem::transmute(r) },
+        ColumnReader::DoubleColumnReader(r) => unsafe { mem::transmute(r) },
+        ColumnReader::ByteArrayColumnReader(r) => unsafe { mem::transmute(r) },
+        ColumnReader::FixedLenByteArrayColumnReader(r) => unsafe { mem::transmute(r) },
+    }
+}
+
+/// Typed value reader for a particular primitive column.
+pub struct ColumnReaderImpl<T: DataType> {
+    descr: ColumnDescPtr,
+    def_level_decoder: Option<LevelDecoder>,
+    rep_level_decoder: Option<LevelDecoder>,
+    page_reader: Box<PageReader>,
+    current_encoding: Option<Encoding>,
+
+    // The total number of values stored in the data page.
+    num_buffered_values: u32,
+
+    // The number of values from the current data page that has been decoded into memory
+    // so far.
+    num_decoded_values: u32,
+
+    // Cache of decoders for existing encodings
+    decoders: HashMap<Encoding, Box<Decoder<T>>>,
+}
+
+impl<T: DataType> ColumnReaderImpl<T> {
+    /// Creates new column reader based on column descriptor and page reader.
+    pub fn new(descr: ColumnDescPtr, page_reader: Box<PageReader>) -> Self {
+        Self {
+            descr,
+            def_level_decoder: None,
+            rep_level_decoder: None,
+            page_reader,
+            current_encoding: None,
+            num_buffered_values: 0,
+            num_decoded_values: 0,
+            decoders: HashMap::new(),
+        }
+    }
+
+    /// Reads a batch of values of at most `batch_size`.
+    ///
+    /// This will try to read from the row group, and fills up at most `batch_size` values
+    /// for `def_levels`, `rep_levels` and `values`. It will stop either when the row group
+    /// is depleted, when `batch_size` values have been read, or when there is no space left
+    /// in the input slices (values/definition levels/repetition levels).
+    ///
+    /// Note that in case the field being read is not required, `values` could contain fewer
+    /// values than `def_levels`. Also note that this will skip reading def / rep levels if
+    /// the field is required / not repeated, respectively.
+    ///
+    /// If `def_levels` or `rep_levels` is `None`, this will also skip reading the
+    /// respective levels. This is useful when the caller of this function knows in advance
+    /// that the field is required and non-repeated, and therefore can avoid allocating
+    /// memory for the levels data. Note that if the field has definition levels, but the
+    /// caller provides `None`, there might be inconsistency between levels/values (see
+    /// comments below).
+    ///
+    /// Returns a tuple where the first element is the actual number of values read,
+    /// and the second element is the actual number of levels read.
+    #[inline]
+    pub fn read_batch(
+        &mut self,
+        batch_size: usize,
+        mut def_levels: Option<&mut [i16]>,
+        mut rep_levels: Option<&mut [i16]>,
+        values: &mut [T::T],
+    ) -> Result<(usize, usize)> {
+        let mut values_read = 0;
+        let mut levels_read = 0;
+
+        // Compute the smallest batch size we can read based on provided slices
+        let mut batch_size = min(batch_size, values.len());
+        if let Some(ref levels) = def_levels {
+            batch_size = min(batch_size, levels.len());
+        }
+        if let Some(ref levels) = rep_levels {
+            batch_size = min(batch_size, levels.len());
+        }
+
+        // Read exhaustively all pages until we read all batch_size values/levels
+        // or there are no more values/levels to read.
+        while max(values_read, levels_read) < batch_size {
+            if !self.has_next()?
{ + break; + } + + // Batch size for the current iteration + let iter_batch_size = { + // Compute approximate value based on values decoded so far + let mut adjusted_size = min( + batch_size, + (self.num_buffered_values - self.num_decoded_values) as usize, + ); + + // Adjust batch size by taking into account how much space is left in values + // slice or levels slices (if available) + adjusted_size = min(adjusted_size, values.len() - values_read); + if let Some(ref levels) = def_levels { + adjusted_size = min(adjusted_size, levels.len() - levels_read); + } + if let Some(ref levels) = rep_levels { + adjusted_size = min(adjusted_size, levels.len() - levels_read); + } + + adjusted_size + }; + + let mut values_to_read = 0; + let mut num_def_levels = 0; + let mut num_rep_levels = 0; + + // If the field is required and non-repeated, there are no definition levels + if self.descr.max_def_level() > 0 && def_levels.as_ref().is_some() { + if let Some(ref mut levels) = def_levels { + num_def_levels = self + .read_def_levels(&mut levels[levels_read..levels_read + iter_batch_size])?; + for i in levels_read..levels_read + num_def_levels { + if levels[i] == self.descr.max_def_level() { + values_to_read += 1; + } + } + } + } else { + // If max definition level == 0, then it is REQUIRED field, read all values. + // If definition levels are not provided, we still read all values. + values_to_read = iter_batch_size; + } + + if self.descr.max_rep_level() > 0 && rep_levels.is_some() { + if let Some(ref mut levels) = rep_levels { + num_rep_levels = self + .read_rep_levels(&mut levels[levels_read..levels_read + iter_batch_size])?; + + // If definition levels are defined, check that rep levels == def levels + if def_levels.is_some() { + assert_eq!( + num_def_levels, num_rep_levels, + "Number of decoded rep / def levels did not match" + ); + } + } + } + + // At this point we have read values, definition and repetition levels. + // If both definition and repetition levels are defined, their counts + // should be equal. Values count is always less or equal to definition levels. + // + // Note that if field is not required, but no definition levels are provided, + // we would read values of batch size and (if provided, of course) repetition + // levels of batch size - [!] they will not be synced, because only definition + // levels enforce number of non-null values to read. + + let curr_values_read = + self.read_values(&mut values[values_read..values_read + values_to_read])?; + + // Update all "return" counters and internal state. + + // This is to account for when def or rep levels are not provided + let curr_levels_read = max(num_def_levels, num_rep_levels); + self.num_decoded_values += max(curr_levels_read, curr_values_read) as u32; + levels_read += curr_levels_read; + values_read += curr_values_read; + } + + Ok((values_read, levels_read)) + } + + /// Reads a new page and set up the decoders for levels, values or dictionary. + /// Returns false if there's no page left. + fn read_new_page(&mut self) -> Result { + #[allow(while_true)] + while true { + match self.page_reader.get_next_page()? { + // No more page to read + None => return Ok(false), + Some(current_page) => { + match current_page { + // 1. Dictionary page: configure dictionary for this page. + p @ Page::DictionaryPage { .. } => { + self.configure_dictionary(p)?; + continue; + } + // 2. 
Data page v1 + Page::DataPage { + buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics: _, + } => { + self.num_buffered_values = num_values; + self.num_decoded_values = 0; + + let mut buffer_ptr = buf; + + if self.descr.max_rep_level() > 0 { + let mut rep_decoder = LevelDecoder::v1( + rep_level_encoding, + self.descr.max_rep_level(), + ); + let total_bytes = rep_decoder + .set_data(self.num_buffered_values as usize, buffer_ptr.all()); + buffer_ptr = buffer_ptr.start_from(total_bytes); + self.rep_level_decoder = Some(rep_decoder); + } + + if self.descr.max_def_level() > 0 { + let mut def_decoder = LevelDecoder::v1( + def_level_encoding, + self.descr.max_def_level(), + ); + let total_bytes = def_decoder + .set_data(self.num_buffered_values as usize, buffer_ptr.all()); + buffer_ptr = buffer_ptr.start_from(total_bytes); + self.def_level_decoder = Some(def_decoder); + } + + // Data page v1 does not have offset, all content of buffer should be passed + self.set_current_page_encoding( + encoding, + &buffer_ptr, + 0, + num_values as usize, + )?; + return Ok(true); + } + // 3. Data page v2 + Page::DataPageV2 { + buf, + num_values, + encoding, + num_nulls: _, + num_rows: _, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed: _, + statistics: _, + } => { + self.num_buffered_values = num_values; + self.num_decoded_values = 0; + + let mut offset = 0; + + // DataPage v2 only supports RLE encoding for repetition levels + if self.descr.max_rep_level() > 0 { + let mut rep_decoder = LevelDecoder::v2(self.descr.max_rep_level()); + let bytes_read = rep_decoder.set_data_range( + self.num_buffered_values as usize, + &buf, + offset, + rep_levels_byte_len as usize, + ); + offset += bytes_read; + self.rep_level_decoder = Some(rep_decoder); + } + + // DataPage v2 only supports RLE encoding for definition levels + if self.descr.max_def_level() > 0 { + let mut def_decoder = LevelDecoder::v2(self.descr.max_def_level()); + let bytes_read = def_decoder.set_data_range( + self.num_buffered_values as usize, + &buf, + offset, + def_levels_byte_len as usize, + ); + offset += bytes_read; + self.def_level_decoder = Some(def_decoder); + } + + self.set_current_page_encoding( + encoding, + &buf, + offset, + num_values as usize, + )?; + return Ok(true); + } + }; + } + } + } + + Ok(true) + } + + /// Resolves and updates encoding and set decoder for the current page + fn set_current_page_encoding( + &mut self, + mut encoding: Encoding, + buffer_ptr: &ByteBufferPtr, + offset: usize, + len: usize, + ) -> Result<()> { + if encoding == Encoding::PLAIN_DICTIONARY { + encoding = Encoding::RLE_DICTIONARY; + } + + let decoder = if encoding == Encoding::RLE_DICTIONARY { + self.decoders + .get_mut(&encoding) + .expect("Decoder for dict should have been set") + } else { + // Search cache for data page decoder + if !self.decoders.contains_key(&encoding) { + // Initialize decoder for this page + let data_decoder = get_decoder::(self.descr.clone(), encoding)?; + self.decoders.insert(encoding, data_decoder); + } + self.decoders.get_mut(&encoding).unwrap() + }; + + decoder.set_data(buffer_ptr.start_from(offset), len as usize)?; + self.current_encoding = Some(encoding); + Ok(()) + } + + #[inline] + fn has_next(&mut self) -> Result { + if self.num_buffered_values == 0 || self.num_buffered_values == self.num_decoded_values { + // TODO: should we return false if read_new_page() = true and + // num_buffered_values = 0? + if !self.read_new_page()? 
{ + Ok(false) + } else { + Ok(self.num_buffered_values != 0) + } + } else { + Ok(true) + } + } + + #[inline] + fn read_rep_levels(&mut self, buffer: &mut [i16]) -> Result { + let level_decoder = self + .rep_level_decoder + .as_mut() + .expect("rep_level_decoder be set"); + level_decoder.get(buffer) + } + + #[inline] + fn read_def_levels(&mut self, buffer: &mut [i16]) -> Result { + let level_decoder = self + .def_level_decoder + .as_mut() + .expect("def_level_decoder be set"); + level_decoder.get(buffer) + } + + #[inline] + fn read_values(&mut self, buffer: &mut [T::T]) -> Result { + let encoding = self + .current_encoding + .expect("current_encoding should be set"); + let current_decoder = self + .decoders + .get_mut(&encoding) + .expect(format!("decoder for encoding {} should be set", encoding).as_str()); + current_decoder.get(buffer) + } + + #[inline] + fn configure_dictionary(&mut self, page: Page) -> Result { + let mut encoding = page.encoding(); + if encoding == Encoding::PLAIN || encoding == Encoding::PLAIN_DICTIONARY { + encoding = Encoding::RLE_DICTIONARY + } + + if self.decoders.contains_key(&encoding) { + return Err(general_err!("Column cannot have more than one dictionary")); + } + + if encoding == Encoding::RLE_DICTIONARY { + let mut dictionary = PlainDecoder::::new(self.descr.type_length()); + let num_values = page.num_values(); + dictionary.set_data(page.buffer().clone(), num_values as usize)?; + + let mut decoder = DictDecoder::new(); + decoder.set_dict(Box::new(dictionary))?; + self.decoders.insert(encoding, Box::new(decoder)); + Ok(true) + } else { + Err(nyi_err!( + "Invalid/Unsupported encoding type for dictionary: {}", + encoding + )) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use rand::distributions::range::SampleRange; + use std::{collections::VecDeque, rc::Rc, vec::IntoIter}; + + use crate::parquet::basic::Type as PhysicalType; + use crate::parquet::column::page::Page; + use crate::parquet::encodings::{ + encoding::{get_encoder, DictEncoder, Encoder}, + levels::{max_buffer_size, LevelEncoder}, + }; + use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::parquet::util::{ + memory::{ByteBufferPtr, MemTracker, MemTrackerPtr}, + test_common::random_numbers_range, + }; + + const NUM_LEVELS: usize = 128; + const NUM_PAGES: usize = 2; + const MAX_DEF_LEVEL: i16 = 5; + const MAX_REP_LEVEL: i16 = 5; + + // Macro to generate test cases + macro_rules! test { + // branch for generating i32 cases + ($test_func:ident, i32, $func:ident, $def_level:expr, $rep_level:expr, + $num_pages:expr, $num_levels:expr, $batch_size:expr, $min:expr, $max:expr) => { + test_internal!( + $test_func, + Int32Type, + get_test_int32_type, + $func, + $def_level, + $rep_level, + $num_pages, + $num_levels, + $batch_size, + $min, + $max + ); + }; + // branch for generating i64 cases + ($test_func:ident, i64, $func:ident, $def_level:expr, $rep_level:expr, + $num_pages:expr, $num_levels:expr, $batch_size:expr, $min:expr, $max:expr) => { + test_internal!( + $test_func, + Int64Type, + get_test_int64_type, + $func, + $def_level, + $rep_level, + $num_pages, + $num_levels, + $batch_size, + $min, + $max + ); + }; + } + + macro_rules! 
test_internal { + ($test_func:ident, $ty:ident, $pty:ident, $func:ident, $def_level:expr, + $rep_level:expr, $num_pages:expr, $num_levels:expr, $batch_size:expr, + $min:expr, $max:expr) => { + #[test] + fn $test_func() { + let desc = Rc::new(ColumnDescriptor::new( + Rc::new($pty()), + None, + $def_level, + $rep_level, + ColumnPath::new(Vec::new()), + )); + let mut tester = ColumnReaderTester::<$ty>::new(); + tester.$func(desc, $num_pages, $num_levels, $batch_size, $min, $max); + } + }; + } + + test!( + test_read_plain_v1_int32, + i32, + plain_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i32::MIN, + ::std::i32::MAX + ); + test!( + test_read_plain_v2_int32, + i32, + plain_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i32::MIN, + ::std::i32::MAX + ); + + test!( + test_read_plain_v1_int32_uneven, + i32, + plain_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 17, + ::std::i32::MIN, + ::std::i32::MAX + ); + test!( + test_read_plain_v2_int32_uneven, + i32, + plain_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 17, + ::std::i32::MIN, + ::std::i32::MAX + ); + + test!( + test_read_plain_v1_int32_multi_page, + i32, + plain_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 512, + ::std::i32::MIN, + ::std::i32::MAX + ); + test!( + test_read_plain_v2_int32_multi_page, + i32, + plain_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 512, + ::std::i32::MIN, + ::std::i32::MAX + ); + + // test cases when column descriptor has MAX_DEF_LEVEL = 0 and MAX_REP_LEVEL = 0 + test!( + test_read_plain_v1_int32_required_non_repeated, + i32, + plain_v1, + 0, + 0, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i32::MIN, + ::std::i32::MAX + ); + test!( + test_read_plain_v2_int32_required_non_repeated, + i32, + plain_v2, + 0, + 0, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i32::MIN, + ::std::i32::MAX + ); + + test!( + test_read_plain_v1_int64, + i64, + plain_v1, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i64::MIN, + ::std::i64::MAX + ); + test!( + test_read_plain_v2_int64, + i64, + plain_v2, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i64::MIN, + ::std::i64::MAX + ); + + test!( + test_read_plain_v1_int64_uneven, + i64, + plain_v1, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 17, + ::std::i64::MIN, + ::std::i64::MAX + ); + test!( + test_read_plain_v2_int64_uneven, + i64, + plain_v2, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 17, + ::std::i64::MIN, + ::std::i64::MAX + ); + + test!( + test_read_plain_v1_int64_multi_page, + i64, + plain_v1, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 512, + ::std::i64::MIN, + ::std::i64::MAX + ); + test!( + test_read_plain_v2_int64_multi_page, + i64, + plain_v2, + 1, + 1, + NUM_PAGES, + NUM_LEVELS, + 512, + ::std::i64::MIN, + ::std::i64::MAX + ); + + // test cases when column descriptor has MAX_DEF_LEVEL = 0 and MAX_REP_LEVEL = 0 + test!( + test_read_plain_v1_int64_required_non_repeated, + i64, + plain_v1, + 0, + 0, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i64::MIN, + ::std::i64::MAX + ); + test!( + test_read_plain_v2_int64_required_non_repeated, + i64, + plain_v2, + 0, + 0, + NUM_PAGES, + NUM_LEVELS, + 16, + ::std::i64::MIN, + ::std::i64::MAX + ); + + test!( + test_read_dict_v1_int32_small, + i32, + dict_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + 2, + 2, + 16, + 0, + 3 + ); + test!( + test_read_dict_v2_int32_small, + i32, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + 2, + 2, + 16, + 0, + 3 + ); + + test!( + test_read_dict_v1_int32, + i32, + dict_v1, + 
MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + 0, + 3 + ); + test!( + test_read_dict_v2_int32, + i32, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + 0, + 3 + ); + + test!( + test_read_dict_v1_int32_uneven, + i32, + dict_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 17, + 0, + 3 + ); + test!( + test_read_dict_v2_int32_uneven, + i32, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 17, + 0, + 3 + ); + + test!( + test_read_dict_v1_int32_multi_page, + i32, + dict_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 512, + 0, + 3 + ); + test!( + test_read_dict_v2_int32_multi_page, + i32, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 512, + 0, + 3 + ); + + test!( + test_read_dict_v1_int64, + i64, + dict_v1, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + 0, + 3 + ); + test!( + test_read_dict_v2_int64, + i64, + dict_v2, + MAX_DEF_LEVEL, + MAX_REP_LEVEL, + NUM_PAGES, + NUM_LEVELS, + 16, + 0, + 3 + ); + + #[test] + fn test_read_batch_values_only() { + test_read_batch_int32(16, &mut vec![0; 10], None, None); // < batch_size + test_read_batch_int32(16, &mut vec![0; 16], None, None); // == batch_size + test_read_batch_int32(16, &mut vec![0; 51], None, None); // > batch_size + } + + #[test] + fn test_read_batch_values_def_levels() { + test_read_batch_int32(16, &mut vec![0; 10], Some(&mut vec![0; 10]), None); + test_read_batch_int32(16, &mut vec![0; 16], Some(&mut vec![0; 16]), None); + test_read_batch_int32(16, &mut vec![0; 51], Some(&mut vec![0; 51]), None); + } + + #[test] + fn test_read_batch_values_rep_levels() { + test_read_batch_int32(16, &mut vec![0; 10], None, Some(&mut vec![0; 10])); + test_read_batch_int32(16, &mut vec![0; 16], None, Some(&mut vec![0; 16])); + test_read_batch_int32(16, &mut vec![0; 51], None, Some(&mut vec![0; 51])); + } + + #[test] + fn test_read_batch_different_buf_sizes() { + test_read_batch_int32( + 16, + &mut vec![0; 8], + Some(&mut vec![0; 9]), + Some(&mut vec![0; 7]), + ); + test_read_batch_int32( + 16, + &mut vec![0; 1], + Some(&mut vec![0; 9]), + Some(&mut vec![0; 3]), + ); + } + + #[test] + fn test_read_batch_values_def_rep_levels() { + test_read_batch_int32( + 128, + &mut vec![0; 128], + Some(&mut vec![0; 128]), + Some(&mut vec![0; 128]), + ); + } + + #[test] + fn test_read_batch_adjust_after_buffering_page() { + // This test covers scenario when buffering new page results in setting number + // of decoded values to 0, resulting on reading `batch_size` of values, but it is + // larger than we can insert into slice (affects values and levels). + // + // Note: values are chosen to reproduce the issue. 
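        // Editorial note, not part of the patch: concretely, with 2 pages of 4 levels
        // each, a batch size of 5 and slices of length 7, the first iteration drains
        // page 1 (4 values). Buffering page 2 resets num_decoded_values to 0, so the
        // next iteration would again target 4 values while only 7 - 4 = 3 slots remain
        // in the slices; without clamping to the remaining slice space this indexes
        // out of bounds.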
+ // + let primitive_type = get_test_int32_type(); + let desc = Rc::new(ColumnDescriptor::new( + Rc::new(primitive_type), + None, + 1, + 1, + ColumnPath::new(Vec::new()), + )); + + let num_pages = 2; + let num_levels = 4; + let batch_size = 5; + let values = &mut vec![0; 7]; + let def_levels = &mut vec![0; 7]; + let rep_levels = &mut vec![0; 7]; + + let mut tester = ColumnReaderTester::::new(); + tester.test_read_batch( + desc, + Encoding::RLE_DICTIONARY, + num_pages, + num_levels, + batch_size, + ::std::i32::MIN, + ::std::i32::MAX, + values, + Some(def_levels), + Some(rep_levels), + false, + ); + } + + // ---------------------------------------------------------------------- + // Helper methods to make pages and test + // + // # Overview + // + // Most of the test functionality is implemented in `ColumnReaderTester`, which + // provides some general data page test methods: + // - `test_read_batch_general` + // - `test_read_batch` + // + // There are also some high level wrappers that are part of `ColumnReaderTester`: + // - `plain_v1` -> call `test_read_batch_general` with data page v1 and plain encoding + // - `plain_v2` -> call `test_read_batch_general` with data page v2 and plain encoding + // - `dict_v1` -> call `test_read_batch_general` with data page v1 + dictionary page + // - `dict_v2` -> call `test_read_batch_general` with data page v2 + dictionary page + // + // And even higher level wrappers that simplify testing of almost the same test cases: + // - `get_test_int32_type`, provides dummy schema type + // - `get_test_int64_type`, provides dummy schema type + // - `test_read_batch_int32`, wrapper for `read_batch` tests, since they are basically + // the same, just different def/rep levels and batch size. + // + // # Page assembly + // + // Page construction and generation of values, definition and repetition levels happens + // in `make_pages` function. + // All values are randomly generated based on provided min/max, levels are calculated + // based on provided max level for column descriptor (which is basically either int32 + // or int64 type in tests) and `levels_per_page` variable. + // + // We use `DataPageBuilder` and its implementation `DataPageBuilderImpl` to actually + // turn values, definition and repetition levels into data pages (either v1 or v2). + // + // Those data pages are then stored as part of `TestPageReader` (we just pass vector + // of generated pages directly), which implements `PageReader` interface. + // + // # Comparison + // + // This allows us to pass test page reader into column reader, so we can test + // functionality of column reader - see `test_read_batch`, where we create column + // reader -> typed column reader, buffer values in `read_batch` method and compare + // output with generated data. + + // Returns dummy Parquet `Type` for primitive field, because most of our tests use + // INT32 physical type. + fn get_test_int32_type() -> SchemaType { + SchemaType::primitive_type_builder("a", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .with_length(-1) + .build() + .expect("build() should be OK") + } + + // Returns dummy Parquet `Type` for INT64 physical type. + fn get_test_int64_type() -> SchemaType { + SchemaType::primitive_type_builder("a", PhysicalType::INT64) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_64) + .with_length(-1) + .build() + .expect("build() should be OK") + } + + // Tests `read_batch()` functionality for INT32. 
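Not part of the patch: before the INT32 wrapper below, a minimal sketch of the read path
these tests exercise, with the generic parameters that the plain-text patch formatting
drops (e.g. `Int32Type`, the boxed `PageReader`) filled back in by assumption;
`example_drain_int32_column` and its `pages` argument are illustrative names only.

    fn example_drain_int32_column(desc: ColumnDescPtr, pages: Vec<Page>) {
        // Wrap the pages in the in-memory reader defined further down in this module.
        let reader = get_column_reader(desc, Box::new(TestPageReader::new(pages)));
        let mut typed = get_typed_column_reader::<Int32Type>(reader);

        let mut values = vec![0i32; 64];
        let mut def_levels = vec![0i16; 64];
        let mut rep_levels = vec![0i16; 64];
        loop {
            let (values_read, levels_read) = typed
                .read_batch(
                    64,
                    Some(&mut def_levels[..]),
                    Some(&mut rep_levels[..]),
                    &mut values,
                )
                .expect("read_batch() should be OK");
            if values_read == 0 && levels_read == 0 {
                break; // column exhausted
            }
            // For a nullable column, values[..values_read] can be shorter than
            // def_levels[..levels_read]; only levels equal to max_def_level carry a value.
        }
    }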
+ // + // This is a high level wrapper on `ColumnReaderTester` that allows us to specify some + // boilerplate code for setting up definition/repetition levels and column descriptor. + fn test_read_batch_int32( + batch_size: usize, + values: &mut [i32], + def_levels: Option<&mut [i16]>, + rep_levels: Option<&mut [i16]>, + ) { + let primitive_type = get_test_int32_type(); + // make field is required based on provided slices of levels + let max_def_level = if def_levels.is_some() { + MAX_DEF_LEVEL + } else { + 0 + }; + let max_rep_level = if def_levels.is_some() { + MAX_REP_LEVEL + } else { + 0 + }; + + let desc = Rc::new(ColumnDescriptor::new( + Rc::new(primitive_type), + None, + max_def_level, + max_rep_level, + ColumnPath::new(Vec::new()), + )); + let mut tester = ColumnReaderTester::::new(); + tester.test_read_batch( + desc, + Encoding::RLE_DICTIONARY, + NUM_PAGES, + NUM_LEVELS, + batch_size, + ::std::i32::MIN, + ::std::i32::MAX, + values, + def_levels, + rep_levels, + false, + ); + } + + struct ColumnReaderTester + where + T::T: PartialOrd + SampleRange + Copy, + { + rep_levels: Vec, + def_levels: Vec, + values: Vec, + } + + impl ColumnReaderTester + where + T::T: PartialOrd + SampleRange + Copy, + { + pub fn new() -> Self { + Self { + rep_levels: Vec::new(), + def_levels: Vec::new(), + values: Vec::new(), + } + } + + // Method to generate and test data pages v1 + fn plain_v1( + &mut self, + desc: ColumnDescPtr, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + ) { + self.test_read_batch_general( + desc, + Encoding::PLAIN, + num_pages, + num_levels, + batch_size, + min, + max, + false, + ); + } + + // Method to generate and test data pages v2 + fn plain_v2( + &mut self, + desc: ColumnDescPtr, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + ) { + self.test_read_batch_general( + desc, + Encoding::PLAIN, + num_pages, + num_levels, + batch_size, + min, + max, + true, + ); + } + + // Method to generate and test dictionary page + data pages v1 + fn dict_v1( + &mut self, + desc: ColumnDescPtr, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + ) { + self.test_read_batch_general( + desc, + Encoding::RLE_DICTIONARY, + num_pages, + num_levels, + batch_size, + min, + max, + false, + ); + } + + // Method to generate and test dictionary page + data pages v2 + fn dict_v2( + &mut self, + desc: ColumnDescPtr, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + ) { + self.test_read_batch_general( + desc, + Encoding::RLE_DICTIONARY, + num_pages, + num_levels, + batch_size, + min, + max, + true, + ); + } + + // Helper function for the general case of `read_batch()` where `values`, + // `def_levels` and `rep_levels` are always provided with enough space. + fn test_read_batch_general( + &mut self, + desc: ColumnDescPtr, + encoding: Encoding, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + use_v2: bool, + ) { + let mut def_levels = vec![0; num_levels * num_pages]; + let mut rep_levels = vec![0; num_levels * num_pages]; + let mut values = vec![T::T::default(); num_levels * num_pages]; + self.test_read_batch( + desc, + encoding, + num_pages, + num_levels, + batch_size, + min, + max, + &mut values, + Some(&mut def_levels), + Some(&mut rep_levels), + use_v2, + ); + } + + // Helper function to test `read_batch()` method with custom buffers for values, + // definition and repetition levels. 
+ fn test_read_batch( + &mut self, + desc: ColumnDescPtr, + encoding: Encoding, + num_pages: usize, + num_levels: usize, + batch_size: usize, + min: T::T, + max: T::T, + values: &mut [T::T], + mut def_levels: Option<&mut [i16]>, + mut rep_levels: Option<&mut [i16]>, + use_v2: bool, + ) { + let mut pages = VecDeque::new(); + make_pages::( + desc.clone(), + encoding, + num_pages, + num_levels, + min, + max, + &mut self.def_levels, + &mut self.rep_levels, + &mut self.values, + &mut pages, + use_v2, + ); + let max_def_level = desc.max_def_level(); + let page_reader = TestPageReader::new(Vec::from(pages)); + let column_reader: ColumnReader = get_column_reader(desc, Box::new(page_reader)); + let mut typed_column_reader = get_typed_column_reader::(column_reader); + + let mut curr_values_read = 0; + let mut curr_levels_read = 0; + let mut done = false; + while !done { + let actual_def_levels = match &mut def_levels { + Some(ref mut vec) => Some(&mut vec[curr_levels_read..]), + None => None, + }; + let actual_rep_levels = match rep_levels { + Some(ref mut vec) => Some(&mut vec[curr_levels_read..]), + None => None, + }; + + let (values_read, levels_read) = typed_column_reader + .read_batch( + batch_size, + actual_def_levels, + actual_rep_levels, + &mut values[curr_values_read..], + ) + .expect("read_batch() should be OK"); + + if values_read == 0 && levels_read == 0 { + done = true; + } + + curr_values_read += values_read; + curr_levels_read += levels_read; + } + + assert!( + values.len() >= curr_values_read, + "values.len() >= values_read" + ); + assert_eq!( + &values[0..curr_values_read], + &self.values[0..curr_values_read], + "values content doesn't match" + ); + + if let Some(ref levels) = def_levels { + assert!( + levels.len() >= curr_levels_read, + "def_levels.len() >= levels_read" + ); + assert_eq!( + &levels[0..curr_levels_read], + &self.def_levels[0..curr_levels_read], + "definition levels content doesn't match" + ); + } + + if let Some(ref levels) = rep_levels { + assert!( + levels.len() >= curr_levels_read, + "rep_levels.len() >= levels_read" + ); + assert_eq!( + &levels[0..curr_levels_read], + &self.rep_levels[0..curr_levels_read], + "repetition levels content doesn't match" + ); + } + + if def_levels.is_none() && rep_levels.is_none() { + assert!( + curr_levels_read == 0, + "expected to read 0 levels, found {}", + curr_levels_read + ); + } else if def_levels.is_some() && max_def_level > 0 { + assert!( + curr_levels_read >= curr_values_read, + "expected levels read to be greater than values read" + ); + } + } + } + + struct TestPageReader { + pages: IntoIter, + } + + impl TestPageReader { + pub fn new(pages: Vec) -> Self { + Self { + pages: pages.into_iter(), + } + } + } + + impl PageReader for TestPageReader { + fn get_next_page(&mut self) -> Result> { + Ok(self.pages.next()) + } + } + + // ---------------------------------------------------------------------- + // Utility functions for generating testing pages + + trait DataPageBuilder { + fn add_rep_levels(&mut self, max_level: i16, rep_levels: &[i16]); + fn add_def_levels(&mut self, max_level: i16, def_levels: &[i16]); + fn add_values(&mut self, encoding: Encoding, values: &[T::T]); + fn add_indices(&mut self, indices: ByteBufferPtr); + fn consume(self) -> Page; + } + + /// A utility struct for building data pages (v1 or v2). 
Callers must call: + /// - add_rep_levels() + /// - add_def_levels() + /// - add_values() for normal data page / add_indices() for dictionary data page + /// - consume() + /// in order to populate and obtain a data page. + struct DataPageBuilderImpl { + desc: ColumnDescPtr, + encoding: Option, + mem_tracker: MemTrackerPtr, + num_values: u32, + buffer: Vec, + rep_levels_byte_len: u32, + def_levels_byte_len: u32, + datapage_v2: bool, + } + + impl DataPageBuilderImpl { + // `num_values` is the number of non-null values to put in the data page. + // `datapage_v2` flag is used to indicate if the generated data page should use V2 + // format or not. + fn new(desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) -> Self { + DataPageBuilderImpl { + desc, + encoding: None, + mem_tracker: Rc::new(MemTracker::new()), + num_values, + buffer: vec![], + rep_levels_byte_len: 0, + def_levels_byte_len: 0, + datapage_v2, + } + } + + // Adds levels to the buffer and return number of encoded bytes + fn add_levels(&mut self, max_level: i16, levels: &[i16]) -> u32 { + let size = max_buffer_size(Encoding::RLE, max_level, levels.len()); + let mut level_encoder = LevelEncoder::v1(Encoding::RLE, max_level, vec![0; size]); + level_encoder.put(levels).expect("put() should be OK"); + let encoded_levels = level_encoder.consume().expect("consume() should be OK"); + // Actual encoded bytes (without length offset) + let encoded_bytes = &encoded_levels[mem::size_of::()..]; + if self.datapage_v2 { + // Level encoder always initializes with offset of i32, where it stores length of + // encoded data; for data page v2 we explicitly store length, therefore we should + // skip i32 bytes. + self.buffer.extend_from_slice(encoded_bytes); + } else { + self.buffer.extend_from_slice(encoded_levels.as_slice()); + } + encoded_bytes.len() as u32 + } + } + + impl DataPageBuilder for DataPageBuilderImpl { + fn add_rep_levels(&mut self, max_levels: i16, rep_levels: &[i16]) { + self.num_values = rep_levels.len() as u32; + self.rep_levels_byte_len = self.add_levels(max_levels, rep_levels); + } + + fn add_def_levels(&mut self, max_levels: i16, def_levels: &[i16]) { + assert!( + self.num_values == def_levels.len() as u32, + "Must call `add_rep_levels() first!`" + ); + + self.def_levels_byte_len = self.add_levels(max_levels, def_levels); + } + + fn add_values(&mut self, encoding: Encoding, values: &[T::T]) { + assert!( + self.num_values >= values.len() as u32, + "num_values: {}, values.len(): {}", + self.num_values, + values.len() + ); + self.encoding = Some(encoding); + let mut encoder: Box> = + get_encoder::(self.desc.clone(), encoding, self.mem_tracker.clone()) + .expect("get_encoder() should be OK"); + encoder.put(values).expect("put() should be OK"); + let encoded_values = encoder + .flush_buffer() + .expect("consume_buffer() should be OK"); + self.buffer.extend_from_slice(encoded_values.data()); + } + + fn add_indices(&mut self, indices: ByteBufferPtr) { + self.encoding = Some(Encoding::RLE_DICTIONARY); + self.buffer.extend_from_slice(indices.data()); + } + + fn consume(self) -> Page { + if self.datapage_v2 { + Page::DataPageV2 { + buf: ByteBufferPtr::new(self.buffer), + num_values: self.num_values, + encoding: self.encoding.unwrap(), + num_nulls: 0, // set to dummy value - don't need this when reading data page + num_rows: self.num_values, // also don't need this when reading data page + def_levels_byte_len: self.def_levels_byte_len, + rep_levels_byte_len: self.rep_levels_byte_len, + is_compressed: false, + statistics: None, // set to 
None, we do not need statistics for tests + } + } else { + Page::DataPage { + buf: ByteBufferPtr::new(self.buffer), + num_values: self.num_values, + encoding: self.encoding.unwrap(), + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: None, // set to None, we do not need statistics for tests + } + } + } + } + + fn make_pages( + desc: ColumnDescPtr, + encoding: Encoding, + num_pages: usize, + levels_per_page: usize, + min: T::T, + max: T::T, + def_levels: &mut Vec, + rep_levels: &mut Vec, + values: &mut Vec, + pages: &mut VecDeque, + use_v2: bool, + ) where + T::T: PartialOrd + SampleRange + Copy, + { + let mut num_values = 0; + let max_def_level = desc.max_def_level(); + let max_rep_level = desc.max_rep_level(); + + let mem_tracker = Rc::new(MemTracker::new()); + let mut dict_encoder = DictEncoder::::new(desc.clone(), mem_tracker); + + for i in 0..num_pages { + let mut num_values_cur_page = 0; + let level_range = i * levels_per_page..(i + 1) * levels_per_page; + + if max_def_level > 0 { + random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels); + for dl in &def_levels[level_range.clone()] { + if *dl == max_def_level { + num_values_cur_page += 1; + } + } + } else { + num_values_cur_page = levels_per_page; + } + if max_rep_level > 0 { + random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels); + } + random_numbers_range(num_values_cur_page, min, max, values); + + // Generate the current page + + let mut pb = DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2); + if max_rep_level > 0 { + pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]); + } + if max_def_level > 0 { + pb.add_def_levels(max_def_level, &def_levels[level_range]); + } + + let value_range = num_values..num_values + num_values_cur_page; + match encoding { + Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { + let _ = dict_encoder.put(&values[value_range.clone()]); + let indices = dict_encoder + .write_indices() + .expect("write_indices() should be OK"); + pb.add_indices(indices); + } + Encoding::PLAIN => { + pb.add_values::(encoding, &values[value_range]); + } + enc @ _ => panic!("Unexpected encoding {}", enc), + } + + let data_page = pb.consume(); + pages.push_back(data_page); + num_values += num_values_cur_page; + } + + if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY { + let dict = dict_encoder + .write_dict() + .expect("write_dict() should be OK"); + let dict_page = Page::DictionaryPage { + buf: dict, + num_values: dict_encoder.num_entries() as u32, + encoding: Encoding::RLE_DICTIONARY, + is_sorted: false, + }; + pages.push_front(dict_page); + } + } +} diff --git a/rust/src/parquet/column/writer.rs b/rust/src/parquet/column/writer.rs new file mode 100644 index 0000000000000..4798d9ad17927 --- /dev/null +++ b/rust/src/parquet/column/writer.rs @@ -0,0 +1,1617 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains column writer API. + +use std::{cmp, collections::VecDeque, mem, rc::Rc}; + +use crate::parquet::basic::{Compression, Encoding, PageType, Type}; +use crate::parquet::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter}; +use crate::parquet::compression::{create_codec, Codec}; +use crate::parquet::data_type::*; +use crate::parquet::encodings::{ + encoding::{get_encoder, DictEncoder, Encoder}, + levels::{max_buffer_size, LevelEncoder}, +}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::{ + metadata::ColumnChunkMetaData, + properties::{WriterProperties, WriterPropertiesPtr, WriterVersion}, +}; +use crate::parquet::schema::types::ColumnDescPtr; +use crate::parquet::util::memory::{ByteBufferPtr, MemTracker}; + +/// Column writer for a Parquet type. +pub enum ColumnWriter { + BoolColumnWriter(ColumnWriterImpl), + Int32ColumnWriter(ColumnWriterImpl), + Int64ColumnWriter(ColumnWriterImpl), + Int96ColumnWriter(ColumnWriterImpl), + FloatColumnWriter(ColumnWriterImpl), + DoubleColumnWriter(ColumnWriterImpl), + ByteArrayColumnWriter(ColumnWriterImpl), + FixedLenByteArrayColumnWriter(ColumnWriterImpl), +} + +/// Gets a specific column writer corresponding to column descriptor `descr`. +pub fn get_column_writer( + descr: ColumnDescPtr, + props: WriterPropertiesPtr, + page_writer: Box, +) -> ColumnWriter { + match descr.physical_type() { + Type::BOOLEAN => { + ColumnWriter::BoolColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT32 => { + ColumnWriter::Int32ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT64 => { + ColumnWriter::Int64ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT96 => { + ColumnWriter::Int96ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::FLOAT => { + ColumnWriter::FloatColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::DOUBLE => { + ColumnWriter::DoubleColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::BYTE_ARRAY => { + ColumnWriter::ByteArrayColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::FIXED_LEN_BYTE_ARRAY => ColumnWriter::FixedLenByteArrayColumnWriter( + ColumnWriterImpl::new(descr, props, page_writer), + ), + } +} + +/// Gets a typed column writer for the specific type `T`, by "up-casting" `col_writer` of +/// non-generic type to a generic column writer type `ColumnWriterImpl`. +/// +/// NOTE: the caller MUST guarantee that the actual enum value for `col_writer` matches +/// the type `T`. Otherwise, disastrous consequence could happen. 
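Not part of the patch: a sketch of the intended call pattern around the up-cast helper
below, with the elided type parameters (e.g. `ColumnWriterImpl<Int32Type>`,
`Box<PageWriter>`) reconstructed by assumption; `example_write_int32_column` is an
illustrative name only.

    fn example_write_int32_column(
        descr: ColumnDescPtr,
        props: WriterPropertiesPtr,
        page_writer: Box<PageWriter>,
    ) -> Result<(u64, u64, ColumnChunkMetaData)> {
        // `descr` must describe an INT32 column; otherwise the enum variant returned by
        // get_column_writer would not match Int32Type and the transmute below is unsound.
        let writer = get_column_writer(descr, props, page_writer);
        let mut typed = get_typed_column_writer::<Int32Type>(writer);

        // Required, non-repeated column: no definition or repetition levels are needed.
        typed.write_batch(&[1, 2, 3, 4], None, None)?;
        typed.close()
    }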
+pub fn get_typed_column_writer(col_writer: ColumnWriter) -> ColumnWriterImpl { + match col_writer { + ColumnWriter::BoolColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::Int32ColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::Int64ColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::Int96ColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::FloatColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::DoubleColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::ByteArrayColumnWriter(r) => unsafe { mem::transmute(r) }, + ColumnWriter::FixedLenByteArrayColumnWriter(r) => unsafe { mem::transmute(r) }, + } +} + +/// Typed column writer for a primitive column. +pub struct ColumnWriterImpl { + // Column writer properties + descr: ColumnDescPtr, + props: WriterPropertiesPtr, + page_writer: Box, + has_dictionary: bool, + dict_encoder: Option>, + encoder: Box>, + codec: Compression, + compressor: Option>, + // Metrics per page + num_buffered_values: u32, + num_buffered_encoded_values: u32, + num_buffered_rows: u32, + // Metrics per column writer + total_bytes_written: u64, + total_rows_written: u64, + total_uncompressed_size: u64, + total_compressed_size: u64, + total_num_values: u64, + dictionary_page_offset: Option, + data_page_offset: Option, + // Reused buffers + def_levels_sink: Vec, + rep_levels_sink: Vec, + data_pages: VecDeque, +} + +impl ColumnWriterImpl { + pub fn new( + descr: ColumnDescPtr, + props: WriterPropertiesPtr, + page_writer: Box, + ) -> Self { + let codec = props.compression(descr.path()); + let compressor = create_codec(codec).unwrap(); + + // Optionally set dictionary encoder. + let dict_encoder = + if props.dictionary_enabled(descr.path()) && Self::has_dictionary_support(&props) { + Some(DictEncoder::new(descr.clone(), Rc::new(MemTracker::new()))) + } else { + None + }; + + // Whether or not this column writer has a dictionary encoding. + let has_dictionary = dict_encoder.is_some(); + + // Set either main encoder or fallback encoder. + let fallback_encoder = get_encoder( + descr.clone(), + props + .encoding(descr.path()) + .unwrap_or(Self::fallback_encoding(&props)), + Rc::new(MemTracker::new()), + ) + .unwrap(); + + Self { + descr, + props, + page_writer, + has_dictionary, + dict_encoder, + encoder: fallback_encoder, + codec, + compressor, + num_buffered_values: 0, + num_buffered_encoded_values: 0, + num_buffered_rows: 0, + total_bytes_written: 0, + total_rows_written: 0, + total_uncompressed_size: 0, + total_compressed_size: 0, + total_num_values: 0, + dictionary_page_offset: None, + data_page_offset: None, + def_levels_sink: vec![], + rep_levels_sink: vec![], + data_pages: VecDeque::new(), + } + } + + /// Writes batch of values, definition levels and repetition levels. + /// Returns number of values processed (written). + /// + /// If definition and repetition levels are provided, we write fully those levels and + /// select how many values to write (this number will be returned), since number of + /// actual written values may be smaller than provided values. + /// + /// If only values are provided, then all values are written and the length of + /// of the values buffer is returned. + /// + /// Definition and/or repetition levels can be omitted, if values are + /// non-nullable and/or non-repeated. + pub fn write_batch( + &mut self, + values: &[T::T], + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + ) -> Result { + // We check for DataPage limits only after we have inserted the values. 
If a user + // writes a large number of values, the DataPage size can be well above the limit. + // + // The purpose of this chunking is to bound this. Even if a user writes large number + // of values, the chunking will ensure that we add data page at a reasonable pagesize + // limit. + + // TODO: find out why we don't account for size of levels when we estimate page size. + + // Find out the minimal length to prevent index out of bound errors. + let mut min_len = values.len(); + if let Some(levels) = def_levels { + min_len = cmp::min(min_len, levels.len()); + } + if let Some(levels) = rep_levels { + min_len = cmp::min(min_len, levels.len()); + } + + // Find out number of batches to process. + let write_batch_size = self.props.write_batch_size(); + let num_batches = min_len / write_batch_size; + + let mut values_offset = 0; + let mut levels_offset = 0; + + for _ in 0..num_batches { + values_offset += self.write_mini_batch( + &values[values_offset..values_offset + write_batch_size], + def_levels.map(|lv| &lv[levels_offset..levels_offset + write_batch_size]), + rep_levels.map(|lv| &lv[levels_offset..levels_offset + write_batch_size]), + )?; + levels_offset += write_batch_size; + } + + values_offset += self.write_mini_batch( + &values[values_offset..], + def_levels.map(|lv| &lv[levels_offset..]), + rep_levels.map(|lv| &lv[levels_offset..]), + )?; + + // Return total number of values processed. + Ok(values_offset) + } + + /// Returns total number of bytes written by this column writer so far. + /// This value is also returned when column writer is closed. + pub fn get_total_bytes_written(&self) -> u64 { + self.total_bytes_written + } + + /// Returns total number of rows written by this column writer so far. + /// This value is also returned when column writer is closed. + pub fn get_total_rows_written(&self) -> u64 { + self.total_rows_written + } + + /// Finalises writes and closes the column writer. + /// Returns total bytes written, total rows written and column chunk metadata. + pub fn close(mut self) -> Result<(u64, u64, ColumnChunkMetaData)> { + if self.dict_encoder.is_some() { + self.write_dictionary_page()?; + } + self.flush_data_pages()?; + let metadata = self.write_column_metadata()?; + self.dict_encoder = None; + self.page_writer.close()?; + + Ok((self.total_bytes_written, self.total_rows_written, metadata)) + } + + /// Writes mini batch of values, definition and repetition levels. + /// This allows fine-grained processing of values and maintaining a reasonable + /// page size. + fn write_mini_batch( + &mut self, + values: &[T::T], + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + ) -> Result { + let num_values; + let mut values_to_write = 0; + + // Check if number of definition levels is the same as number of repetition levels. + if def_levels.is_some() && rep_levels.is_some() { + let def = def_levels.unwrap(); + let rep = rep_levels.unwrap(); + if def.len() != rep.len() { + return Err(general_err!( + "Inconsistent length of definition and repetition levels: {} != {}", + def.len(), + rep.len() + )); + } + } + + // Process definition levels and determine how many values to write. 
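        // Editorial note, not part of the patch, a worked example of the branch below:
        // with max_def_level = 1 and def_levels = [1, 0, 1], num_values is 3 but
        // values_to_write is 2, because only levels equal to the max definition level
        // have a backing entry in `values`.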
+ if self.descr.max_def_level() > 0 { + if def_levels.is_none() { + return Err(general_err!( + "Definition levels are required, because max definition level = {}", + self.descr.max_def_level() + )); + } + + let levels = def_levels.unwrap(); + num_values = levels.len(); + for &level in levels { + values_to_write += (level == self.descr.max_def_level()) as usize; + } + + self.write_definition_levels(levels); + } else { + values_to_write = values.len(); + num_values = values_to_write; + } + + // Process repetition levels and determine how many rows we are about to process. + if self.descr.max_rep_level() > 0 { + // A row could contain more than one value. + if rep_levels.is_none() { + return Err(general_err!( + "Repetition levels are required, because max repetition level = {}", + self.descr.max_rep_level() + )); + } + + // Count the occasions where we start a new row + let levels = rep_levels.unwrap(); + for &level in levels { + self.num_buffered_rows += (level == 0) as u32 + } + + self.write_repetition_levels(levels); + } else { + // Each value is exactly one row. + // Equals to the number of values, we count nulls as well. + self.num_buffered_rows += num_values as u32; + } + + // Check that we have enough values to write. + if values.len() < values_to_write { + return Err(general_err!( + "Expected to write {} values, but have only {}", + values_to_write, + values.len() + )); + } + + // TODO: update page statistics + + self.write_values(&values[0..values_to_write])?; + + self.num_buffered_values += num_values as u32; + self.num_buffered_encoded_values += values_to_write as u32; + + if self.should_add_data_page() { + self.add_data_page()?; + } + + if self.should_dict_fallback() { + self.dict_fallback()?; + } + + Ok(values_to_write) + } + + #[inline] + fn write_definition_levels(&mut self, def_levels: &[i16]) { + self.def_levels_sink.extend_from_slice(def_levels); + } + + #[inline] + fn write_repetition_levels(&mut self, rep_levels: &[i16]) { + self.rep_levels_sink.extend_from_slice(rep_levels); + } + + #[inline] + fn write_values(&mut self, values: &[T::T]) -> Result<()> { + match self.dict_encoder { + Some(ref mut encoder) => encoder.put(values), + None => self.encoder.put(values), + } + } + + /// Returns true if we need to fall back to non-dictionary encoding. + /// + /// We can only fall back if dictionary encoder is set and we have exceeded dictionary + /// size. + #[inline] + fn should_dict_fallback(&self) -> bool { + match self.dict_encoder { + Some(ref encoder) => { + encoder.dict_encoded_size() >= self.props.dictionary_pagesize_limit() + } + None => false, + } + } + + /// Returns true if there is enough data for a data page, false otherwise. + #[inline] + fn should_add_data_page(&self) -> bool { + self.encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit() + } + + /// Performs dictionary fallback. + /// Prepares and writes dictionary and all data pages into page writer. + fn dict_fallback(&mut self) -> Result<()> { + // At this point we know that we need to fall back. + self.write_dictionary_page()?; + self.flush_data_pages()?; + self.dict_encoder = None; + Ok(()) + } + + /// Adds data page. + /// Data page is either buffered in case of dictionary encoding or written directly. 
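    // Editorial note, not part of the patch, summarising the two branches below: a v1
    // page is laid out as [rep levels | def levels | values] and compressed as a whole,
    // whereas a v2 page keeps the RLE-encoded levels uncompressed up front and
    // compresses only the values section (hence the explicit level byte lengths and
    // `is_compressed`).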
+ fn add_data_page(&mut self) -> Result<()> { + // Extract encoded values + let value_bytes = match self.dict_encoder { + Some(ref mut encoder) => encoder.write_indices()?, + None => self.encoder.flush_buffer()?, + }; + + // Select encoding based on current encoder and writer version (v1 or v2). + let encoding = if self.dict_encoder.is_some() { + self.props.dictionary_data_page_encoding() + } else { + self.encoder.encoding() + }; + + let max_def_level = self.descr.max_def_level(); + let max_rep_level = self.descr.max_rep_level(); + + let compressed_page = match self.props.writer_version() { + WriterVersion::PARQUET_1_0 => { + let mut buffer = vec![]; + + if max_rep_level > 0 { + buffer.extend_from_slice( + &self.encode_levels_v1( + Encoding::RLE, + &self.rep_levels_sink[..], + max_rep_level, + )?[..], + ); + } + + if max_def_level > 0 { + buffer.extend_from_slice( + &self.encode_levels_v1( + Encoding::RLE, + &self.def_levels_sink[..], + max_def_level, + )?[..], + ); + } + + buffer.extend_from_slice(value_bytes.data()); + let uncompressed_size = buffer.len(); + + if let Some(ref mut cmpr) = self.compressor { + let mut compressed_buf = Vec::with_capacity(value_bytes.data().len()); + cmpr.compress(&buffer[..], &mut compressed_buf)?; + buffer = compressed_buf; + } + + let data_page = Page::DataPage { + buf: ByteBufferPtr::new(buffer), + num_values: self.num_buffered_values, + encoding, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + // TODO: process statistics + statistics: None, + }; + + CompressedPage::new(data_page, uncompressed_size) + } + WriterVersion::PARQUET_2_0 => { + let mut rep_levels_byte_len = 0; + let mut def_levels_byte_len = 0; + let mut buffer = vec![]; + + if max_rep_level > 0 { + let levels = self.encode_levels_v2(&self.rep_levels_sink[..], max_rep_level)?; + rep_levels_byte_len = levels.len(); + buffer.extend_from_slice(&levels[..]); + } + + if max_def_level > 0 { + let levels = self.encode_levels_v2(&self.def_levels_sink[..], max_def_level)?; + def_levels_byte_len = levels.len(); + buffer.extend_from_slice(&levels[..]); + } + + let uncompressed_size = + rep_levels_byte_len + def_levels_byte_len + value_bytes.len(); + + // Data Page v2 compresses values only. + match self.compressor { + Some(ref mut cmpr) => { + let mut compressed_buf = Vec::with_capacity(value_bytes.data().len()); + cmpr.compress(value_bytes.data(), &mut compressed_buf)?; + buffer.extend_from_slice(&compressed_buf[..]); + } + None => { + buffer.extend_from_slice(value_bytes.data()); + } + } + + let data_page = Page::DataPageV2 { + buf: ByteBufferPtr::new(buffer), + num_values: self.num_buffered_values, + encoding, + num_nulls: self.num_buffered_values - self.num_buffered_encoded_values, + num_rows: self.num_buffered_rows, + def_levels_byte_len: def_levels_byte_len as u32, + rep_levels_byte_len: rep_levels_byte_len as u32, + is_compressed: self.compressor.is_some(), + // TODO: process statistics + statistics: None, + }; + + CompressedPage::new(data_page, uncompressed_size) + } + }; + + // Check if we need to buffer data page or flush it to the sink directly. + if self.dict_encoder.is_some() { + self.data_pages.push_back(compressed_page); + } else { + self.write_data_page(compressed_page)?; + } + + // Update total number of rows. + self.total_rows_written += self.num_buffered_rows as u64; + + // Reset state. 
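        // Editorial note, not part of the patch: everything cleared below is per-page
        // state; column-level totals such as total_rows_written (updated just above) and
        // the size metrics maintained in update_metrics_for_page() are left untouched.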
+ self.rep_levels_sink.clear(); + self.def_levels_sink.clear(); + self.num_buffered_values = 0; + self.num_buffered_encoded_values = 0; + self.num_buffered_rows = 0; + + Ok(()) + } + + /// Finalises any outstanding data pages and flushes buffered data pages from + /// dictionary encoding into underlying sink. + #[inline] + fn flush_data_pages(&mut self) -> Result<()> { + // Write all outstanding data to a new page. + if self.num_buffered_values > 0 { + self.add_data_page()?; + } + + while let Some(page) = self.data_pages.pop_front() { + self.write_data_page(page)?; + } + + Ok(()) + } + + /// Assembles and writes column chunk metadata. + fn write_column_metadata(&mut self) -> Result { + let total_compressed_size = self.total_compressed_size as i64; + let total_uncompressed_size = self.total_uncompressed_size as i64; + let num_values = self.total_num_values as i64; + let dict_page_offset = self.dictionary_page_offset.map(|v| v as i64); + // If data page offset is not set, then no pages have been written + let data_page_offset = self.data_page_offset.unwrap_or(0) as i64; + + let file_offset; + let mut encodings = Vec::new(); + + if self.has_dictionary { + assert!(dict_page_offset.is_some(), "Dictionary offset is not set"); + file_offset = dict_page_offset.unwrap() + total_compressed_size; + // NOTE: This should be in sync with writing dictionary pages. + encodings.push(self.props.dictionary_page_encoding()); + encodings.push(self.props.dictionary_data_page_encoding()); + // Fallback to alternative encoding, add it to the list. + if self.dict_encoder.is_none() { + encodings.push(self.encoder.encoding()); + } + } else { + file_offset = data_page_offset + total_compressed_size; + encodings.push(self.encoder.encoding()); + } + // We use only RLE level encoding for data page v1 and data page v2. + encodings.push(Encoding::RLE); + + let metadata = ColumnChunkMetaData::builder(self.descr.clone()) + .set_compression(self.codec) + .set_encodings(encodings) + .set_file_offset(file_offset) + .set_total_compressed_size(total_compressed_size) + .set_total_uncompressed_size(total_uncompressed_size) + .set_num_values(num_values) + .set_data_page_offset(data_page_offset) + .set_dictionary_page_offset(dict_page_offset) + .build()?; + + self.page_writer.write_metadata(&metadata)?; + + Ok(metadata) + } + + /// Encodes definition or repetition levels for Data Page v1. + #[inline] + fn encode_levels_v1( + &self, + encoding: Encoding, + levels: &[i16], + max_level: i16, + ) -> Result> { + let size = max_buffer_size(encoding, max_level, levels.len()); + let mut encoder = LevelEncoder::v1(encoding, max_level, vec![0; size]); + encoder.put(&levels)?; + encoder.consume() + } + + /// Encodes definition or repetition levels for Data Page v2. + /// Encoding is always RLE. + #[inline] + fn encode_levels_v2(&self, levels: &[i16], max_level: i16) -> Result> { + let size = max_buffer_size(Encoding::RLE, max_level, levels.len()); + let mut encoder = LevelEncoder::v2(max_level, vec![0; size]); + encoder.put(&levels)?; + encoder.consume() + } + + /// Writes compressed data page into underlying sink and updates global metrics. + #[inline] + fn write_data_page(&mut self, page: CompressedPage) -> Result<()> { + let page_spec = self.page_writer.write_page(page)?; + self.update_metrics_for_page(page_spec); + Ok(()) + } + + /// Writes dictionary page into underlying sink. 
+ #[inline] + fn write_dictionary_page(&mut self) -> Result<()> { + if self.dict_encoder.is_none() { + return Err(general_err!("Dictionary encoder is not set")); + } + + let compressed_page = { + let encoder = self.dict_encoder.as_ref().unwrap(); + let is_sorted = encoder.is_sorted(); + let num_values = encoder.num_entries(); + let mut values_buf = encoder.write_dict()?; + let uncompressed_size = values_buf.len(); + + if let Some(ref mut cmpr) = self.compressor { + let mut output_buf = Vec::with_capacity(uncompressed_size); + cmpr.compress(values_buf.data(), &mut output_buf)?; + values_buf = ByteBufferPtr::new(output_buf); + } + + let dict_page = Page::DictionaryPage { + buf: values_buf, + num_values: num_values as u32, + encoding: self.props.dictionary_page_encoding(), + is_sorted, + }; + CompressedPage::new(dict_page, uncompressed_size) + }; + + let page_spec = self.page_writer.write_page(compressed_page)?; + self.update_metrics_for_page(page_spec); + Ok(()) + } + + /// Updates column writer metrics with each page metadata. + #[inline] + fn update_metrics_for_page(&mut self, page_spec: PageWriteSpec) { + self.total_uncompressed_size += page_spec.uncompressed_size as u64; + self.total_compressed_size += page_spec.compressed_size as u64; + self.total_num_values += page_spec.num_values as u64; + self.total_bytes_written += page_spec.bytes_written; + + match page_spec.page_type { + PageType::DATA_PAGE | PageType::DATA_PAGE_V2 => { + if self.data_page_offset.is_none() { + self.data_page_offset = Some(page_spec.offset); + } + } + PageType::DICTIONARY_PAGE => { + assert!( + self.dictionary_page_offset.is_none(), + "Dictionary offset is already set" + ); + self.dictionary_page_offset = Some(page_spec.offset); + } + _ => {} + } + } + + /// Returns reference to the underlying page writer. + /// This method is intended to use in tests only. + fn get_page_writer_ref(&self) -> &Box { + &self.page_writer + } +} + +// ---------------------------------------------------------------------- +// Encoding support for column writer. +// This mirrors parquet-mr default encodings for writes. See: +// https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV1ValuesWriterFactory.java +// https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV2ValuesWriterFactory.java + +/// Trait to define default encoding for types, including whether or not the type +/// supports dictionary encoding. +trait EncodingWriteSupport { + /// Returns encoding for a column when no other encoding is provided in writer + /// properties. + fn fallback_encoding(props: &WriterProperties) -> Encoding; + + /// Returns true if dictionary is supported for column writer, false otherwise. + fn has_dictionary_support(props: &WriterProperties) -> bool; +} + +// Basic implementation, always falls back to PLAIN and supports dictionary. 
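Not part of the patch: the blanket impl that follows keeps PLAIN with dictionary support,
and the specialised impls after it (whose type parameters the plain-text formatting drops)
override the PARQUET_2_0 fallbacks and dictionary flags for BOOLEAN, INT32, INT64,
BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY, matching the encoding tests further down. A small
sketch of how that surfaces to a caller; `set_writer_version` is assumed from the
WriterProperties builder API, the rest mirrors the tests below.

    fn example_v2_no_dict_props() -> WriterPropertiesPtr {
        // With dictionary encoding disabled under the 2.0 writer, an INT32 column falls
        // back to DELTA_BINARY_PACKED data pages (plus RLE for levels), as asserted in
        // test_column_writer_default_encoding_support_int32 below.
        Rc::new(
            WriterProperties::builder()
                .set_writer_version(WriterVersion::PARQUET_2_0) // assumed builder method
                .set_dictionary_enabled(false)
                .build(),
        )
    }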
+impl EncodingWriteSupport for ColumnWriterImpl { + default fn fallback_encoding(_props: &WriterProperties) -> Encoding { + Encoding::PLAIN + } + + default fn has_dictionary_support(_props: &WriterProperties) -> bool { + true + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::RLE, + } + } + + // Boolean column does not support dictionary encoding and should fall back to + // whatever fallback encoding is defined. + fn has_dictionary_support(_props: &WriterProperties) -> bool { + false + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BINARY_PACKED, + } + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BINARY_PACKED, + } + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY, + } + } +} + +impl EncodingWriteSupport for ColumnWriterImpl { + fn fallback_encoding(props: &WriterProperties) -> Encoding { + match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY, + } + } + + fn has_dictionary_support(props: &WriterProperties) -> bool { + match props.writer_version() { + // Dictionary encoding was not enabled in PARQUET 1.0 + WriterVersion::PARQUET_1_0 => false, + WriterVersion::PARQUET_2_0 => true, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::error::Error; + + use rand::distributions::range::SampleRange; + + use crate::parquet::column::{ + page::PageReader, + reader::{get_column_reader, get_typed_column_reader, ColumnReaderImpl}, + }; + use crate::parquet::file::{ + properties::WriterProperties, reader::SerializedPageReader, writer::SerializedPageWriter, + }; + use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::parquet::util::{ + io::{FileSink, FileSource}, + test_common::{get_temp_file, random_numbers_range}, + }; + + #[test] + fn test_column_writer_inconsistent_def_rep_length() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 1, 1, props); + let res = writer.write_batch(&[1, 2, 3, 4], Some(&[1, 1, 1]), Some(&[0, 0])); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Inconsistent length of definition and repetition levels: 3 != 2" + ); + } + } + + #[test] + fn test_column_writer_invalid_def_levels() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 1, 0, props); + let res = writer.write_batch(&[1, 2, 3, 4], None, None); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Definition levels are required, because max definition level = 1" + ); + } + } + + 
#[test] + fn test_column_writer_invalid_rep_levels() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 0, 1, props); + let res = writer.write_batch(&[1, 2, 3, 4], None, None); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Repetition levels are required, because max repetition level = 1" + ); + } + } + + #[test] + fn test_column_writer_not_enough_values_to_write() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 1, 0, props); + let res = writer.write_batch(&[1, 2], Some(&[1, 1, 1, 1]), None); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Expected to write 4 values, but have only 2" + ); + } + } + + #[test] + #[should_panic(expected = "Dictionary offset is already set")] + fn test_column_writer_write_only_one_dictionary_page() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + // First page should be correctly written. + let res = writer.write_dictionary_page(); + assert!(res.is_ok()); + writer.write_dictionary_page().unwrap(); + } + + #[test] + fn test_column_writer_error_when_writing_disabled_dictionary() { + let page_writer = get_test_page_writer(); + let props = Rc::new( + WriterProperties::builder() + .set_dictionary_enabled(false) + .build(), + ); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + let res = writer.write_dictionary_page(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "Dictionary encoder is not set"); + } + } + + #[test] + fn test_column_writer_boolean_type_does_not_support_dictionary() { + let page_writer = get_test_page_writer(); + let props = Rc::new( + WriterProperties::builder() + .set_dictionary_enabled(true) + .build(), + ); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer + .write_batch(&[true, false, true, false], None, None) + .unwrap(); + + let (bytes_written, rows_written, metadata) = writer.close().unwrap(); + // PlainEncoder uses bit writer to write boolean values, which all fit into 1 byte. 
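        // Editorial note, not part of the patch: BOOLEAN has no dictionary support, so
        // the writer stays on its PLAIN fallback, and RLE is always appended for the
        // level encoding, hence encodings == [PLAIN, RLE] and no dictionary page offset.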
+ assert_eq!(bytes_written, 1); + assert_eq!(rows_written, 4); + assert_eq!(metadata.encodings(), &vec![Encoding::PLAIN, Encoding::RLE]); + assert_eq!(metadata.num_values(), 4); // just values + assert_eq!(metadata.dictionary_page_offset(), None); + } + + #[test] + fn test_column_writer_default_encoding_support_bool() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[true, false], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[true, false], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[true, false], + None, + &[Encoding::RLE, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[true, false], + None, + &[Encoding::RLE, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_int32() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[1, 2], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[1, 2], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[1, 2], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[1, 2], + None, + &[Encoding::DELTA_BINARY_PACKED, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_int64() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[1, 2], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[1, 2], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[1, 2], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[1, 2], + None, + &[Encoding::DELTA_BINARY_PACKED, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_int96() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[Int96::from(vec![1, 2, 3])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[Int96::from(vec![1, 2, 3])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[Int96::from(vec![1, 2, 3])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[Int96::from(vec![1, 2, 3])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_float() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[1.0, 2.0], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[1.0, 2.0], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[1.0, 2.0], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + 
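        // Reading the expected-encodings lists in these checks: when the dictionary path is
        // taken, the chunk is expected to report PLAIN (dictionary page values), RLE_DICTIONARY
        // (data page indices) and RLE (definition/repetition levels); when dictionary encoding
        // is off or unsupported, it reports the fallback data encoding plus RLE.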
check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[1.0, 2.0], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_double() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[1.0, 2.0], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[1.0, 2.0], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[1.0, 2.0], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[1.0, 2.0], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_byte_array() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[ByteArray::from(vec![1u8])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[ByteArray::from(vec![1u8])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::DELTA_BYTE_ARRAY, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_default_encoding_support_fixed_len_byte_array() { + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + true, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_1_0, + false, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::PLAIN, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + true, + &[ByteArray::from(vec![1u8])], + Some(0), + &[Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE], + ); + check_encoding_write_support::( + WriterVersion::PARQUET_2_0, + false, + &[ByteArray::from(vec![1u8])], + None, + &[Encoding::DELTA_BYTE_ARRAY, Encoding::RLE], + ); + } + + #[test] + fn test_column_writer_check_metadata() { + let page_writer = get_test_page_writer(); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + + let (bytes_written, rows_written, metadata) = writer.close().unwrap(); + assert_eq!(bytes_written, 20); + assert_eq!(rows_written, 4); + assert_eq!( + metadata.encodings(), + &vec![Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::RLE] + ); + assert_eq!(metadata.num_values(), 8); // dictionary + value indexes + assert_eq!(metadata.compressed_size(), 20); + assert_eq!(metadata.uncompressed_size(), 20); + assert_eq!(metadata.data_page_offset(), 0); + assert_eq!(metadata.dictionary_page_offset(), Some(0)); + } + + #[test] + fn test_column_writer_empty_column_roundtrip() { + let props = WriterProperties::builder().build(); + column_roundtrip::("test_col_writer_rnd_1", props, &[], None, None); + } + + #[test] + fn test_column_writer_non_nullable_values_roundtrip() { + let props = WriterProperties::builder().build(); + column_roundtrip_random::( + "test_col_writer_rnd_2", + props, + 1024, + 
::std::i32::MIN, + ::std::i32::MAX, + 0, + 0, + ); + } + + #[test] + fn test_column_writer_nullable_non_repeated_values_roundtrip() { + let props = WriterProperties::builder().build(); + column_roundtrip_random::( + "test_column_writer_nullable_non_repeated_values_roundtrip", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 0, + ); + } + + #[test] + fn test_column_writer_nullable_repeated_values_roundtrip() { + let props = WriterProperties::builder().build(); + column_roundtrip_random::( + "test_col_writer_rnd_3", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_dictionary_fallback_small_data_page() { + let props = WriterProperties::builder() + .set_dictionary_pagesize_limit(32) + .set_data_pagesize_limit(32) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_4", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_small_write_batch_size() { + for i in vec![1, 2, 5, 10, 11, 1023] { + let props = WriterProperties::builder().set_write_batch_size(i).build(); + + column_roundtrip_random::( + "test_col_writer_rnd_5", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + } + + #[test] + fn test_column_writer_dictionary_disabled_v1() { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_1_0) + .set_dictionary_enabled(false) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_6", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_dictionary_disabled_v2() { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_2_0) + .set_dictionary_enabled(false) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_7", + props, + 1024, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_compression_v1() { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_1_0) + .set_compression(Compression::SNAPPY) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_8", + props, + 2048, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + #[test] + fn test_column_writer_compression_v2() { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_2_0) + .set_compression(Compression::SNAPPY) + .build(); + column_roundtrip_random::( + "test_col_writer_rnd_9", + props, + 2048, + ::std::i32::MIN, + ::std::i32::MAX, + 10, + 10, + ); + } + + /// Performs write-read roundtrip with randomly generated values and levels. + /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write + /// for a column. + fn column_roundtrip_random<'a, T: DataType>( + file_name: &'a str, + props: WriterProperties, + max_size: usize, + min_value: T::T, + max_value: T::T, + max_def_level: i16, + max_rep_level: i16, + ) where + T::T: PartialOrd + SampleRange + Copy, + { + let mut num_values: usize = 0; + + let mut buf: Vec = Vec::new(); + let def_levels = if max_def_level > 0 { + random_numbers_range(max_size, 0, max_def_level + 1, &mut buf); + for &dl in &buf[..] 
{ + if dl == max_def_level { + num_values += 1; + } + } + Some(&buf[..]) + } else { + num_values = max_size; + None + }; + + let mut buf: Vec = Vec::new(); + let rep_levels = if max_rep_level > 0 { + random_numbers_range(max_size, 0, max_rep_level + 1, &mut buf); + Some(&buf[..]) + } else { + None + }; + + let mut values: Vec = Vec::new(); + random_numbers_range(num_values, min_value, max_value, &mut values); + + column_roundtrip::(file_name, props, &values[..], def_levels, rep_levels); + } + + /// Performs write-read roundtrip and asserts written values and levels. + fn column_roundtrip<'a, T: DataType>( + file_name: &'a str, + props: WriterProperties, + values: &[T::T], + def_levels: Option<&[i16]>, + rep_levels: Option<&[i16]>, + ) { + let file = get_temp_file(file_name, &[]); + let sink = FileSink::new(&file); + let page_writer = Box::new(SerializedPageWriter::new(sink)); + + let max_def_level = match def_levels { + Some(buf) => *buf.iter().max().unwrap_or(&0i16), + None => 0i16, + }; + + let max_rep_level = match rep_levels { + Some(buf) => *buf.iter().max().unwrap_or(&0i16), + None => 0i16, + }; + + let mut max_batch_size = values.len(); + if let Some(levels) = def_levels { + max_batch_size = cmp::max(max_batch_size, levels.len()); + } + if let Some(levels) = rep_levels { + max_batch_size = cmp::max(max_batch_size, levels.len()); + } + + let mut writer = + get_test_column_writer::(page_writer, max_def_level, max_rep_level, Rc::new(props)); + + let values_written = writer.write_batch(values, def_levels, rep_levels).unwrap(); + assert_eq!(values_written, values.len()); + let (bytes_written, rows_written, column_metadata) = writer.close().unwrap(); + + let source = FileSource::new(&file, 0, bytes_written as usize); + let page_reader = Box::new( + SerializedPageReader::new( + source, + column_metadata.num_values(), + column_metadata.compression(), + T::get_physical_type(), + ) + .unwrap(), + ); + let reader = get_test_column_reader::(page_reader, max_def_level, max_rep_level); + + let mut actual_values = vec![T::T::default(); max_batch_size]; + let mut actual_def_levels = match def_levels { + Some(_) => Some(vec![0i16; max_batch_size]), + None => None, + }; + let mut actual_rep_levels = match rep_levels { + Some(_) => Some(vec![0i16; max_batch_size]), + None => None, + }; + + let (values_read, levels_read) = read_fully( + reader, + max_batch_size, + actual_def_levels.as_mut(), + actual_rep_levels.as_mut(), + actual_values.as_mut_slice(), + ); + + // Assert values, definition and repetition levels. + + assert_eq!(&actual_values[..values_read], values); + match actual_def_levels { + Some(ref vec) => assert_eq!(Some(&vec[..levels_read]), def_levels), + None => assert_eq!(None, def_levels), + } + match actual_rep_levels { + Some(ref vec) => assert_eq!(Some(&vec[..levels_read]), rep_levels), + None => assert_eq!(None, rep_levels), + } + + // Assert written rows. + + if let Some(levels) = actual_rep_levels { + let mut actual_rows_written = 0; + for l in levels { + if l == 0 { + actual_rows_written += 1; + } + } + assert_eq!(actual_rows_written, rows_written); + } else if actual_def_levels.is_some() { + assert_eq!(levels_read as u64, rows_written); + } else { + assert_eq!(values_read as u64, rows_written); + } + } + + /// Performs write of provided values and returns column metadata of those values. + /// Used to test encoding support for column writer. 
+ fn column_write_and_get_metadata( + props: WriterProperties, + values: &[T::T], + ) -> ColumnChunkMetaData { + let page_writer = get_test_page_writer(); + let props = Rc::new(props); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + writer.write_batch(values, None, None).unwrap(); + let (_, _, metadata) = writer.close().unwrap(); + metadata + } + + // Function to use in tests for EncodingWriteSupport. This checks that dictionary + // offset and encodings to make sure that column writer uses provided by trait + // encodings. + fn check_encoding_write_support( + version: WriterVersion, + dict_enabled: bool, + data: &[T::T], + dictionary_page_offset: Option, + encodings: &[Encoding], + ) { + let props = WriterProperties::builder() + .set_writer_version(version) + .set_dictionary_enabled(dict_enabled) + .build(); + let meta = column_write_and_get_metadata::(props, data); + assert_eq!(meta.dictionary_page_offset(), dictionary_page_offset); + assert_eq!(meta.encodings(), &encodings); + } + + /// Reads one batch of data, considering that batch is large enough to capture all of + /// the values and levels. + fn read_fully( + mut reader: ColumnReaderImpl, + batch_size: usize, + mut def_levels: Option<&mut Vec>, + mut rep_levels: Option<&mut Vec>, + values: &mut [T::T], + ) -> (usize, usize) { + let actual_def_levels = match &mut def_levels { + Some(ref mut vec) => Some(&mut vec[..]), + None => None, + }; + let actual_rep_levels = match rep_levels { + Some(ref mut vec) => Some(&mut vec[..]), + None => None, + }; + reader + .read_batch(batch_size, actual_def_levels, actual_rep_levels, values) + .unwrap() + } + + /// Returns column writer. + fn get_test_column_writer( + page_writer: Box, + max_def_level: i16, + max_rep_level: i16, + props: WriterPropertiesPtr, + ) -> ColumnWriterImpl { + let descr = Rc::new(get_test_column_descr::(max_def_level, max_rep_level)); + let column_writer = get_column_writer(descr, props, page_writer); + get_typed_column_writer::(column_writer) + } + + /// Returns column reader. + fn get_test_column_reader( + page_reader: Box, + max_def_level: i16, + max_rep_level: i16, + ) -> ColumnReaderImpl { + let descr = Rc::new(get_test_column_descr::(max_def_level, max_rep_level)); + let column_reader = get_column_reader(descr, page_reader); + get_typed_column_reader::(column_reader) + } + + /// Returns descriptor for primitive column. + fn get_test_column_descr( + max_def_level: i16, + max_rep_level: i16, + ) -> ColumnDescriptor { + let path = ColumnPath::from("col"); + let tpe = SchemaType::primitive_type_builder("col", T::get_physical_type()) + // length is set for "encoding support" tests for FIXED_LEN_BYTE_ARRAY type, + // it should be no-op for other types + .with_length(1) + .build() + .unwrap(); + ColumnDescriptor::new(Rc::new(tpe), None, max_def_level, max_rep_level, path) + } + + /// Returns page writer that collects pages without serializing them. 
+ fn get_test_page_writer() -> Box { + Box::new(TestPageWriter {}) + } + + struct TestPageWriter {} + + impl PageWriter for TestPageWriter { + fn write_page(&mut self, page: CompressedPage) -> Result { + let mut res = PageWriteSpec::new(); + res.page_type = page.page_type(); + res.uncompressed_size = page.uncompressed_size(); + res.compressed_size = page.compressed_size(); + res.num_values = page.num_values(); + res.offset = 0; + res.bytes_written = page.data().len() as u64; + Ok(res) + } + + fn write_metadata(&mut self, _metadata: &ColumnChunkMetaData) -> Result<()> { + Ok(()) + } + + fn close(&mut self) -> Result<()> { + Ok(()) + } + } +} diff --git a/rust/src/parquet/compression.rs b/rust/src/parquet/compression.rs new file mode 100644 index 0000000000000..3690cca032361 --- /dev/null +++ b/rust/src/parquet/compression.rs @@ -0,0 +1,321 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains codec interface and supported codec implementations. +//! +//! See [`Compression`](`::basic::Compression`) enum for all available compression +//! algorithms. +//! +//! # Example +//! +//! ```rust +//! use arrow::parquet::{basic::Compression, compression::create_codec}; +//! +//! let mut codec = match create_codec(Compression::SNAPPY) { +//! Ok(Some(codec)) => codec, +//! _ => panic!(), +//! }; +//! +//! let data = vec![b'p', b'a', b'r', b'q', b'u', b'e', b't']; +//! let mut compressed = vec![]; +//! codec.compress(&data[..], &mut compressed).unwrap(); +//! +//! let mut output = vec![]; +//! codec.decompress(&compressed[..], &mut output).unwrap(); +//! +//! assert_eq!(output, data); +//! ``` + +use std::io::{self, Read, Write}; + +use brotli; +use flate2::{read, write, Compression}; +use lz4; +use snap::{decompress_len, max_compress_len, Decoder, Encoder}; +use zstd; + +use crate::parquet::basic::Compression as CodecType; +use crate::parquet::errors::{ParquetError, Result}; + +/// Parquet compression codec interface. +pub trait Codec { + /// Compresses data stored in slice `input_buf` and writes the compressed result + /// to `output_buf`. + /// Note that you'll need to call `clear()` before reusing the same `output_buf` across + /// different `compress` calls. + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()>; + + /// Decompresses data stored in slice `input_buf` and writes output to `output_buf`. + /// Returns the total number of bytes written. + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result; +} + +/// Given the compression type `codec`, returns a codec used to compress and decompress +/// bytes for the compression type. +/// This returns `None` if the codec type is `UNCOMPRESSED`. 
+pub fn create_codec(codec: CodecType) -> Result>> { + match codec { + CodecType::BROTLI => Ok(Some(Box::new(BrotliCodec::new()))), + CodecType::GZIP => Ok(Some(Box::new(GZipCodec::new()))), + CodecType::SNAPPY => Ok(Some(Box::new(SnappyCodec::new()))), + CodecType::LZ4 => Ok(Some(Box::new(LZ4Codec::new()))), + CodecType::ZSTD => Ok(Some(Box::new(ZSTDCodec::new()))), + CodecType::UNCOMPRESSED => Ok(None), + _ => Err(nyi_err!("The codec type {} is not supported yet", codec)), + } +} + +/// Codec for Snappy compression format. +pub struct SnappyCodec { + decoder: Decoder, + encoder: Encoder, +} + +impl SnappyCodec { + /// Creates new Snappy compression codec. + fn new() -> Self { + Self { + decoder: Decoder::new(), + encoder: Encoder::new(), + } + } +} + +impl Codec for SnappyCodec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + let len = decompress_len(input_buf)?; + output_buf.resize(len, 0); + self.decoder + .decompress(input_buf, output_buf) + .map_err(|e| e.into()) + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let required_len = max_compress_len(input_buf.len()); + if output_buf.len() < required_len { + output_buf.resize(required_len, 0); + } + let n = self.encoder.compress(input_buf, &mut output_buf[..])?; + output_buf.truncate(n); + Ok(()) + } +} + +/// Codec for GZIP compression algorithm. +pub struct GZipCodec {} + +impl GZipCodec { + /// Creates new GZIP compression codec. + fn new() -> Self { + Self {} + } +} + +impl Codec for GZipCodec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + let mut decoder = read::GzDecoder::new(input_buf); + decoder.read_to_end(output_buf).map_err(|e| e.into()) + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let mut encoder = write::GzEncoder::new(output_buf, Compression::default()); + encoder.write_all(input_buf)?; + encoder.try_finish().map_err(|e| e.into()) + } +} + +const BROTLI_DEFAULT_BUFFER_SIZE: usize = 4096; +const BROTLI_DEFAULT_COMPRESSION_QUALITY: u32 = 1; // supported levels 0-9 +const BROTLI_DEFAULT_LG_WINDOW_SIZE: u32 = 22; // recommended between 20-22 + +/// Codec for Brotli compression algorithm. +pub struct BrotliCodec {} + +impl BrotliCodec { + /// Creates new Brotli compression codec. + fn new() -> Self { + Self {} + } +} + +impl Codec for BrotliCodec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + brotli::Decompressor::new(input_buf, BROTLI_DEFAULT_BUFFER_SIZE) + .read_to_end(output_buf) + .map_err(|e| e.into()) + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let mut encoder = brotli::CompressorWriter::new( + output_buf, + BROTLI_DEFAULT_BUFFER_SIZE, + BROTLI_DEFAULT_COMPRESSION_QUALITY, + BROTLI_DEFAULT_LG_WINDOW_SIZE, + ); + encoder.write_all(&input_buf[..])?; + encoder.flush().map_err(|e| e.into()) + } +} + +const LZ4_BUFFER_SIZE: usize = 4096; + +/// Codec for LZ4 compression algorithm. +pub struct LZ4Codec {} + +impl LZ4Codec { + /// Creates new LZ4 compression codec. 
+ fn new() -> Self { + Self {} + } +} + +impl Codec for LZ4Codec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + let mut decoder = lz4::Decoder::new(input_buf)?; + let mut buffer: [u8; LZ4_BUFFER_SIZE] = [0; LZ4_BUFFER_SIZE]; + let mut total_len = 0; + loop { + let len = decoder.read(&mut buffer)?; + if len == 0 { + break; + } + total_len += len; + output_buf.write_all(&buffer[0..len])?; + } + Ok(total_len) + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let mut encoder = lz4::EncoderBuilder::new().build(output_buf)?; + let mut from = 0; + loop { + let to = ::std::cmp::min(from + LZ4_BUFFER_SIZE, input_buf.len()); + encoder.write_all(&input_buf[from..to])?; + from += LZ4_BUFFER_SIZE; + if from >= input_buf.len() { + break; + } + } + encoder.finish().1.map_err(|e| e.into()) + } +} + +/// Codec for Zstandard compression algorithm. +pub struct ZSTDCodec {} + +impl ZSTDCodec { + /// Creates new Zstandard compression codec. + fn new() -> Self { + Self {} + } +} + +/// Compression level (1-21) for ZSTD. Choose 1 here for better compression speed. +const ZSTD_COMPRESSION_LEVEL: i32 = 1; + +impl Codec for ZSTDCodec { + fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result { + let mut decoder = zstd::Decoder::new(input_buf)?; + match io::copy(&mut decoder, output_buf) { + Ok(n) => Ok(n as usize), + Err(e) => Err(e.into()), + } + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let mut encoder = zstd::Encoder::new(output_buf, ZSTD_COMPRESSION_LEVEL)?; + encoder.write_all(&input_buf[..])?; + match encoder.finish() { + Ok(_) => Ok(()), + Err(e) => Err(e.into()), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::util::test_common::*; + + fn test_roundtrip(c: CodecType, data: &Vec) { + let mut c1 = create_codec(c).unwrap().unwrap(); + let mut c2 = create_codec(c).unwrap().unwrap(); + + // Compress with c1 + let mut compressed = Vec::new(); + let mut decompressed = Vec::new(); + c1.compress(data.as_slice(), &mut compressed) + .expect("Error when compressing"); + + // Decompress with c2 + let mut decompressed_size = c2 + .decompress(compressed.as_slice(), &mut decompressed) + .expect("Error when decompressing"); + assert_eq!(data.len(), decompressed_size); + decompressed.truncate(decompressed_size); + assert_eq!(*data, decompressed); + + compressed.clear(); + + // Compress with c2 + c2.compress(data.as_slice(), &mut compressed) + .expect("Error when compressing"); + + // Decompress with c1 + decompressed_size = c1 + .decompress(compressed.as_slice(), &mut decompressed) + .expect("Error when decompressing"); + assert_eq!(data.len(), decompressed_size); + decompressed.truncate(decompressed_size); + assert_eq!(*data, decompressed); + } + + fn test_codec(c: CodecType) { + let sizes = vec![100, 10000, 100000]; + for size in sizes { + let mut data = random_bytes(size); + test_roundtrip(c, &mut data); + } + } + + #[test] + fn test_codec_snappy() { + test_codec(CodecType::SNAPPY); + } + + #[test] + fn test_codec_gzip() { + test_codec(CodecType::GZIP); + } + + #[test] + fn test_codec_brotli() { + test_codec(CodecType::BROTLI); + } + + #[test] + fn test_codec_lz4() { + test_codec(CodecType::LZ4); + } + + #[test] + fn test_codec_zstd() { + test_codec(CodecType::ZSTD); + } + +} diff --git a/rust/src/parquet/data_type.rs b/rust/src/parquet/data_type.rs new file mode 100644 index 0000000000000..26bdebd71bc8b --- /dev/null +++ b/rust/src/parquet/data_type.rs 
@@ -0,0 +1,463 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Data types that connect Parquet physical types with their Rust-specific +//! representations. + +use std::mem; + +use byteorder::{BigEndian, ByteOrder}; + +use crate::parquet::basic::Type; +use crate::parquet::util::memory::{ByteBuffer, ByteBufferPtr}; + +/// Rust representation for logical type INT96, value is backed by an array of `u32`. +/// The type only takes 12 bytes, without extra padding. +#[derive(Clone, Debug)] +pub struct Int96 { + value: Option<[u32; 3]>, +} + +impl Int96 { + /// Creates new INT96 type struct with no data set. + pub fn new() -> Self { + Self { value: None } + } + + /// Returns underlying data as slice of [`u32`]. + pub fn data(&self) -> &[u32] { + assert!(self.value.is_some()); + self.value.as_ref().unwrap() + } + + /// Sets data for this INT96 type. + pub fn set_data(&mut self, elem0: u32, elem1: u32, elem2: u32) { + self.value = Some([elem0, elem1, elem2]); + } +} + +impl Default for Int96 { + fn default() -> Self { + Self { value: None } + } +} + +impl PartialEq for Int96 { + fn eq(&self, other: &Int96) -> bool { + self.data() == other.data() + } +} + +impl From> for Int96 { + fn from(buf: Vec) -> Self { + assert_eq!(buf.len(), 3); + let mut result = Self::new(); + result.set_data(buf[0], buf[1], buf[2]); + result + } +} + +/// Rust representation for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY Parquet physical types. +/// Value is backed by a byte buffer. +#[derive(Clone, Debug)] +pub struct ByteArray { + data: Option, +} + +impl ByteArray { + /// Creates new byte array with no data set. + pub fn new() -> Self { + ByteArray { data: None } + } + + /// Gets length of the underlying byte buffer. + pub fn len(&self) -> usize { + assert!(self.data.is_some()); + self.data.as_ref().unwrap().len() + } + + /// Returns slice of data. + pub fn data(&self) -> &[u8] { + assert!(self.data.is_some()); + self.data.as_ref().unwrap().as_ref() + } + + /// Set data from another byte buffer. + pub fn set_data(&mut self, data: ByteBufferPtr) { + self.data = Some(data); + } + + /// Returns `ByteArray` instance with slice of values for a data. 
+ pub fn slice(&self, start: usize, len: usize) -> Self { + assert!(self.data.is_some()); + Self::from(self.data.as_ref().unwrap().range(start, len)) + } +} + +impl From> for ByteArray { + fn from(buf: Vec) -> ByteArray { + Self { + data: Some(ByteBufferPtr::new(buf)), + } + } +} + +impl<'a> From<&'a str> for ByteArray { + fn from(s: &'a str) -> ByteArray { + let mut v = Vec::new(); + v.extend_from_slice(s.as_bytes()); + Self { + data: Some(ByteBufferPtr::new(v)), + } + } +} + +impl From for ByteArray { + fn from(ptr: ByteBufferPtr) -> ByteArray { + Self { data: Some(ptr) } + } +} + +impl From for ByteArray { + fn from(mut buf: ByteBuffer) -> ByteArray { + Self { + data: Some(buf.consume()), + } + } +} + +impl Default for ByteArray { + fn default() -> Self { + ByteArray { data: None } + } +} + +impl PartialEq for ByteArray { + fn eq(&self, other: &ByteArray) -> bool { + self.data() == other.data() + } +} + +/// Rust representation for Decimal values. +/// +/// This is not a representation of Parquet physical type, but rather a wrapper for +/// DECIMAL logical type, and serves as container for raw parts of decimal values: +/// unscaled value in bytes, precision and scale. +#[derive(Clone, Debug)] +pub enum Decimal { + /// Decimal backed by `i32`. + Int32 { + value: [u8; 4], + precision: i32, + scale: i32, + }, + /// Decimal backed by `i64`. + Int64 { + value: [u8; 8], + precision: i32, + scale: i32, + }, + /// Decimal backed by byte array. + Bytes { + value: ByteArray, + precision: i32, + scale: i32, + }, +} + +impl Decimal { + /// Creates new decimal value from `i32`. + pub fn from_i32(value: i32, precision: i32, scale: i32) -> Self { + let mut bytes = [0; 4]; + BigEndian::write_i32(&mut bytes, value); + Decimal::Int32 { + value: bytes, + precision, + scale, + } + } + + /// Creates new decimal value from `i64`. + pub fn from_i64(value: i64, precision: i32, scale: i32) -> Self { + let mut bytes = [0; 8]; + BigEndian::write_i64(&mut bytes, value); + Decimal::Int64 { + value: bytes, + precision, + scale, + } + } + + /// Creates new decimal value from `ByteArray`. + pub fn from_bytes(value: ByteArray, precision: i32, scale: i32) -> Self { + Decimal::Bytes { + value, + precision, + scale, + } + } + + /// Returns bytes of unscaled value. + pub fn data(&self) -> &[u8] { + match *self { + Decimal::Int32 { ref value, .. } => value, + Decimal::Int64 { ref value, .. } => value, + Decimal::Bytes { ref value, .. } => value.data(), + } + } + + /// Returns decimal precision. + pub fn precision(&self) -> i32 { + match *self { + Decimal::Int32 { precision, .. } => precision, + Decimal::Int64 { precision, .. } => precision, + Decimal::Bytes { precision, .. } => precision, + } + } + + /// Returns decimal scale. + pub fn scale(&self) -> i32 { + match *self { + Decimal::Int32 { scale, .. } => scale, + Decimal::Int64 { scale, .. } => scale, + Decimal::Bytes { scale, .. } => scale, + } + } +} + +impl Default for Decimal { + fn default() -> Self { + Self::from_i32(0, 0, 0) + } +} + +impl PartialEq for Decimal { + fn eq(&self, other: &Decimal) -> bool { + self.precision() == other.precision() + && self.scale() == other.scale() + && self.data() == other.data() + } +} + +/// Converts an instance of data type to a slice of bytes as `u8`. +pub trait AsBytes { + /// Returns slice of bytes for this data type. + fn as_bytes(&self) -> &[u8]; +} + +macro_rules! 
gen_as_bytes { + ($source_ty:ident) => { + impl AsBytes for $source_ty { + fn as_bytes(&self) -> &[u8] { + unsafe { + ::std::slice::from_raw_parts( + self as *const $source_ty as *const u8, + ::std::mem::size_of::<$source_ty>(), + ) + } + } + } + }; +} + +gen_as_bytes!(bool); +gen_as_bytes!(u8); +gen_as_bytes!(i32); +gen_as_bytes!(u32); +gen_as_bytes!(i64); +gen_as_bytes!(f32); +gen_as_bytes!(f64); + +impl AsBytes for Int96 { + fn as_bytes(&self) -> &[u8] { + unsafe { ::std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) } + } +} + +impl AsBytes for ByteArray { + fn as_bytes(&self) -> &[u8] { + self.data() + } +} + +impl AsBytes for Decimal { + fn as_bytes(&self) -> &[u8] { + self.data() + } +} + +impl AsBytes for Vec { + fn as_bytes(&self) -> &[u8] { + self.as_slice() + } +} + +impl<'a> AsBytes for &'a str { + fn as_bytes(&self) -> &[u8] { + (self as &str).as_bytes() + } +} + +impl AsBytes for str { + fn as_bytes(&self) -> &[u8] { + (self as &str).as_bytes() + } +} + +/// Contains the Parquet physical type information as well as the Rust primitive type +/// presentation. +pub trait DataType: 'static { + type T: ::std::cmp::PartialEq + + ::std::fmt::Debug + + ::std::default::Default + + ::std::clone::Clone + + AsBytes; + + /// Returns Parquet physical type. + fn get_physical_type() -> Type; + + /// Returns size in bytes for Rust representation of the physical type. + fn get_type_size() -> usize; +} + +macro_rules! make_type { + ($name:ident, $physical_ty:path, $native_ty:ty, $size:expr) => { + pub struct $name {} + + impl DataType for $name { + type T = $native_ty; + + fn get_physical_type() -> Type { + $physical_ty + } + + fn get_type_size() -> usize { + $size + } + } + }; +} + +/// Generate struct definitions for all physical types + +make_type!(BoolType, Type::BOOLEAN, bool, 1); +make_type!(Int32Type, Type::INT32, i32, 4); +make_type!(Int64Type, Type::INT64, i64, 8); +make_type!(Int96Type, Type::INT96, Int96, mem::size_of::()); +make_type!(FloatType, Type::FLOAT, f32, 4); +make_type!(DoubleType, Type::DOUBLE, f64, 8); +make_type!( + ByteArrayType, + Type::BYTE_ARRAY, + ByteArray, + mem::size_of::() +); +make_type!( + FixedLenByteArrayType, + Type::FIXED_LEN_BYTE_ARRAY, + ByteArray, + mem::size_of::() +); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_as_bytes() { + assert_eq!(false.as_bytes(), &[0]); + assert_eq!(true.as_bytes(), &[1]); + assert_eq!((7 as i32).as_bytes(), &[7, 0, 0, 0]); + assert_eq!((555 as i32).as_bytes(), &[43, 2, 0, 0]); + assert_eq!((555 as u32).as_bytes(), &[43, 2, 0, 0]); + assert_eq!(i32::max_value().as_bytes(), &[255, 255, 255, 127]); + assert_eq!(i32::min_value().as_bytes(), &[0, 0, 0, 128]); + assert_eq!((7 as i64).as_bytes(), &[7, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!((555 as i64).as_bytes(), &[43, 2, 0, 0, 0, 0, 0, 0]); + assert_eq!( + (i64::max_value()).as_bytes(), + &[255, 255, 255, 255, 255, 255, 255, 127] + ); + assert_eq!((i64::min_value()).as_bytes(), &[0, 0, 0, 0, 0, 0, 0, 128]); + assert_eq!((3.14 as f32).as_bytes(), &[195, 245, 72, 64]); + assert_eq!( + (3.14 as f64).as_bytes(), + &[31, 133, 235, 81, 184, 30, 9, 64] + ); + assert_eq!("hello".as_bytes(), &[b'h', b'e', b'l', b'l', b'o']); + assert_eq!( + Vec::from("hello".as_bytes()).as_bytes(), + &[b'h', b'e', b'l', b'l', b'o'] + ); + + // Test Int96 + let i96 = Int96::from(vec![1, 2, 3]); + assert_eq!(i96.as_bytes(), &[1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0]); + + // Test ByteArray + let ba = ByteArray::from(vec![1, 2, 3]); + assert_eq!(ba.as_bytes(), &[1, 2, 3]); 
+ + // Test Decimal + let decimal = Decimal::from_i32(123, 5, 2); + assert_eq!(decimal.as_bytes(), &[0, 0, 0, 123]); + let decimal = Decimal::from_i64(123, 5, 2); + assert_eq!(decimal.as_bytes(), &[0, 0, 0, 0, 0, 0, 0, 123]); + let decimal = Decimal::from_bytes(ByteArray::from(vec![1, 2, 3]), 5, 2); + assert_eq!(decimal.as_bytes(), &[1, 2, 3]); + } + + #[test] + fn test_int96_from() { + assert_eq!( + Int96::from(vec![1, 12345, 1234567890]).data(), + &[1, 12345, 1234567890] + ); + } + + #[test] + fn test_byte_array_from() { + assert_eq!( + ByteArray::from(vec![b'A', b'B', b'C']).data(), + &[b'A', b'B', b'C'] + ); + assert_eq!(ByteArray::from("ABC").data(), &[b'A', b'B', b'C']); + assert_eq!( + ByteArray::from(ByteBufferPtr::new(vec![1u8, 2u8, 3u8, 4u8, 5u8])).data(), + &[1u8, 2u8, 3u8, 4u8, 5u8] + ); + let mut buf = ByteBuffer::new(); + buf.set_data(vec![6u8, 7u8, 8u8, 9u8, 10u8]); + assert_eq!(ByteArray::from(buf).data(), &[6u8, 7u8, 8u8, 9u8, 10u8]); + } + + #[test] + fn test_decimal_partial_eq() { + assert_eq!(Decimal::default(), Decimal::from_i32(0, 0, 0)); + assert_eq!(Decimal::from_i32(222, 5, 2), Decimal::from_i32(222, 5, 2)); + assert_eq!( + Decimal::from_bytes(ByteArray::from(vec![0, 0, 0, 3]), 5, 2), + Decimal::from_i32(3, 5, 2) + ); + + assert!(Decimal::from_i32(222, 5, 2) != Decimal::from_i32(111, 5, 2)); + assert!(Decimal::from_i32(222, 5, 2) != Decimal::from_i32(222, 6, 2)); + assert!(Decimal::from_i32(222, 5, 2) != Decimal::from_i32(222, 5, 3)); + + assert!(Decimal::from_i64(222, 5, 2) != Decimal::from_i32(222, 5, 2)); + } +} diff --git a/rust/src/parquet/encodings/decoding.rs b/rust/src/parquet/encodings/decoding.rs new file mode 100644 index 0000000000000..c6a6fd49ee336 --- /dev/null +++ b/rust/src/parquet/encodings/decoding.rs @@ -0,0 +1,1403 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains all supported decoders for Parquet. + +use std::{cmp, marker::PhantomData, mem, slice::from_raw_parts_mut}; + +use super::rle::RleDecoder; + +use byteorder::{ByteOrder, LittleEndian}; + +use crate::parquet::basic::*; +use crate::parquet::data_type::*; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::ColumnDescPtr; +use crate::parquet::util::{ + bit_util::BitReader, + memory::{ByteBuffer, ByteBufferPtr}, +}; + +// ---------------------------------------------------------------------- +// Decoders + +/// A Parquet decoder for the data type `T`. +pub trait Decoder { + /// Sets the data to decode to be `data`, which should contain `num_values` of values + /// to decode. + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()>; + + /// Consumes values from this decoder and write the results to `buffer`. This will try + /// to fill up `buffer`. 
+ /// + /// Returns the actual number of values decoded, which should be equal to `buffer.len()` + /// unless the remaining number of values is less than `buffer.len()`. + fn get(&mut self, buffer: &mut [T::T]) -> Result; + + /// Returns the number of values left in this decoder stream. + fn values_left(&self) -> usize; + + /// Returns the encoding for this decoder. + fn encoding(&self) -> Encoding; +} + +/// Gets a decoder for the column descriptor `descr` and encoding type `encoding`. +/// +/// NOTE: the primitive type in `descr` MUST match the data type `T`, otherwise +/// disastrous consequence could occur. +pub fn get_decoder( + descr: ColumnDescPtr, + encoding: Encoding, +) -> Result>> { + let decoder: Box> = match encoding { + Encoding::PLAIN => Box::new(PlainDecoder::new(descr.type_length())), + Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => { + return Err(general_err!( + "Cannot initialize this encoding through this function" + )); + } + Encoding::RLE => Box::new(RleValueDecoder::new()), + Encoding::DELTA_BINARY_PACKED => Box::new(DeltaBitPackDecoder::new()), + Encoding::DELTA_LENGTH_BYTE_ARRAY => Box::new(DeltaLengthByteArrayDecoder::new()), + Encoding::DELTA_BYTE_ARRAY => Box::new(DeltaByteArrayDecoder::new()), + e => return Err(nyi_err!("Encoding {} is not supported", e)), + }; + Ok(decoder) +} + +// ---------------------------------------------------------------------- +// PLAIN Decoding + +/// Plain decoding that supports all types. +/// Values are encoded back to back. For native types, data is encoded as little endian. +/// Floating point types are encoded in IEEE. +/// See [`PlainDecoder`](`::encoding::PlainEncoder`) for more information. +pub struct PlainDecoder { + // The remaining number of values in the byte array + num_values: usize, + + // The current starting index in the byte array. + start: usize, + + // The length for the type `T`. Only used when `T` is `FixedLenByteArrayType` + type_length: i32, + + // The byte array to decode from. Not set if `T` is bool. + data: Option, + + // Read `data` bit by bit. Only set if `T` is bool. + bit_reader: Option, + + // To allow `T` in the generic parameter for this struct. This doesn't take any space. + _phantom: PhantomData, +} + +impl PlainDecoder { + /// Creates new plain decoder. 
+ pub fn new(type_length: i32) -> Self { + PlainDecoder { + data: None, + bit_reader: None, + type_length, + num_values: 0, + start: 0, + _phantom: PhantomData, + } + } +} + +impl Decoder for PlainDecoder { + #[inline] + default fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + self.num_values = num_values; + self.start = 0; + self.data = Some(data); + Ok(()) + } + + #[inline] + fn values_left(&self) -> usize { + self.num_values + } + + #[inline] + fn encoding(&self) -> Encoding { + Encoding::PLAIN + } + + #[inline] + default fn get(&mut self, buffer: &mut [T::T]) -> Result { + assert!(self.data.is_some()); + + let data = self.data.as_mut().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + let bytes_left = data.len() - self.start; + let bytes_to_decode = mem::size_of::() * num_values; + if bytes_left < bytes_to_decode { + return Err(eof_err!("Not enough bytes to decode")); + } + let raw_buffer: &mut [u8] = + unsafe { from_raw_parts_mut(buffer.as_ptr() as *mut u8, bytes_to_decode) }; + raw_buffer.copy_from_slice(data.range(self.start, bytes_to_decode).as_ref()); + self.start += bytes_to_decode; + self.num_values -= num_values; + + Ok(num_values) + } +} + +impl Decoder for PlainDecoder { + fn get(&mut self, buffer: &mut [Int96]) -> Result { + assert!(self.data.is_some()); + + let data = self.data.as_ref().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + let bytes_left = data.len() - self.start; + let bytes_to_decode = 12 * num_values; + if bytes_left < bytes_to_decode { + return Err(eof_err!("Not enough bytes to decode")); + } + + let data_range = data.range(self.start, bytes_to_decode); + let bytes: &[u8] = data_range.data(); + self.start += bytes_to_decode; + + let mut pos = 0; // position in byte array + for i in 0..num_values { + let elem0 = LittleEndian::read_u32(&bytes[pos..pos + 4]); + let elem1 = LittleEndian::read_u32(&bytes[pos + 4..pos + 8]); + let elem2 = LittleEndian::read_u32(&bytes[pos + 8..pos + 12]); + buffer[i].set_data(elem0, elem1, elem2); + pos += 12; + } + self.num_values -= num_values; + + Ok(num_values) + } +} + +impl Decoder for PlainDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + self.num_values = num_values; + self.bit_reader = Some(BitReader::new(data)); + Ok(()) + } + + fn get(&mut self, buffer: &mut [bool]) -> Result { + assert!(self.bit_reader.is_some()); + + let bit_reader = self.bit_reader.as_mut().unwrap(); + let values_read = bit_reader.get_batch::(buffer, 1); + self.num_values -= values_read; + + Ok(values_read) + } +} + +impl Decoder for PlainDecoder { + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + assert!(self.data.is_some()); + + let data = self.data.as_mut().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + for i in 0..num_values { + let len: usize = read_num_bytes!(u32, 4, data.start_from(self.start).as_ref()) as usize; + self.start += mem::size_of::(); + if data.len() < self.start + len { + return Err(eof_err!("Not enough bytes to decode")); + } + buffer[i].set_data(data.range(self.start, len)); + self.start += len; + } + self.num_values -= num_values; + + Ok(num_values) + } +} + +impl Decoder for PlainDecoder { + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + assert!(self.data.is_some()); + assert!(self.type_length > 0); + + let data = self.data.as_mut().unwrap(); + let type_length = self.type_length as usize; + let num_values = cmp::min(buffer.len(), self.num_values); + for i in 
0..num_values { + if data.len() < self.start + type_length { + return Err(eof_err!("Not enough bytes to decode")); + } + buffer[i].set_data(data.range(self.start, type_length)); + self.start += type_length; + } + self.num_values -= num_values; + + Ok(num_values) + } +} + +// ---------------------------------------------------------------------- +// RLE_DICTIONARY/PLAIN_DICTIONARY Decoding + +/// Dictionary decoder. +/// The dictionary encoding builds a dictionary of values encountered in a given column. +/// The dictionary is be stored in a dictionary page per column chunk. +/// See [`DictEncoder`](`::encoding::DictEncoder`) for more information. +pub struct DictDecoder { + // The dictionary, which maps ids to the values + dictionary: Vec, + + // Whether `dictionary` has been initialized + has_dictionary: bool, + + // The decoder for the value ids + rle_decoder: Option, + + // Number of values left in the data stream + num_values: usize, +} + +impl DictDecoder { + /// Creates new dictionary decoder. + pub fn new() -> Self { + Self { + dictionary: vec![], + has_dictionary: false, + rle_decoder: None, + num_values: 0, + } + } + + /// Decodes and sets values for dictionary using `decoder` decoder. + pub fn set_dict(&mut self, mut decoder: Box>) -> Result<()> { + let num_values = decoder.values_left(); + self.dictionary.resize(num_values, T::T::default()); + let _ = decoder.get(&mut self.dictionary)?; + self.has_dictionary = true; + Ok(()) + } +} + +impl Decoder for DictDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + // First byte in `data` is bit width + let bit_width = data.as_ref()[0]; + let mut rle_decoder = RleDecoder::new(bit_width); + rle_decoder.set_data(data.start_from(1)); + self.num_values = num_values; + self.rle_decoder = Some(rle_decoder); + Ok(()) + } + + fn get(&mut self, buffer: &mut [T::T]) -> Result { + assert!(self.rle_decoder.is_some()); + assert!(self.has_dictionary, "Must call set_dict() first!"); + + let rle = self.rle_decoder.as_mut().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + rle.get_batch_with_dict(&self.dictionary[..], buffer, num_values) + } + + /// Number of values left in this decoder stream + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::RLE_DICTIONARY + } +} + +// ---------------------------------------------------------------------- +// RLE Decoding + +/// RLE/Bit-Packing hybrid decoding for values. +/// Currently is used only for data pages v2 and supports boolean types. +/// See [`RleValueEncoder`](`::encoding::RleValueEncoder`) for more information. +pub struct RleValueDecoder { + values_left: usize, + decoder: Option, + _phantom: PhantomData, +} + +impl RleValueDecoder { + pub fn new() -> Self { + Self { + values_left: 0, + decoder: None, + _phantom: PhantomData, + } + } + + #[inline] + fn set_data_internal(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + // We still need to remove prefix of i32 from the stream. 
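        // Per the Parquet spec, the RLE/bit-packed hybrid buffer is laid out as a 4-byte
        // little-endian length followed by the encoded runs; the length is read here and only
        // the `data_size` bytes after it are handed to the RLE decoder.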
+ let i32_size = mem::size_of::(); + let data_size = read_num_bytes!(i32, i32_size, data.as_ref()) as usize; + let rle_decoder = self + .decoder + .as_mut() + .expect("RLE decoder is not initialized"); + rle_decoder.set_data(data.range(i32_size, data_size)); + self.values_left = num_values; + Ok(()) + } +} + +impl Decoder for RleValueDecoder { + #[inline] + default fn set_data(&mut self, _data: ByteBufferPtr, _num_values: usize) -> Result<()> { + panic!("RleValueDecoder only supports BoolType"); + } + + #[inline] + fn values_left(&self) -> usize { + self.values_left + } + + #[inline] + fn encoding(&self) -> Encoding { + Encoding::RLE + } + + #[inline] + fn get(&mut self, buffer: &mut [T::T]) -> Result { + let rle_decoder = self + .decoder + .as_mut() + .expect("RLE decoder is not initialized"); + let values_read = rle_decoder.get_batch(buffer)?; + self.values_left -= values_read; + Ok(values_read) + } +} + +impl Decoder for RleValueDecoder { + #[inline] + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + // Only support RLE value reader for boolean values with bit width of 1. + self.decoder = Some(RleDecoder::new(1)); + self.set_data_internal(data, num_values) + } +} + +// ---------------------------------------------------------------------- +// DELTA_BINARY_PACKED Decoding + +/// Delta binary packed decoder. +/// Supports INT32 and INT64 types. +/// See [`DeltaBitPackEncoder`](`::encoding::DeltaBitPackEncoder`) for more information. +pub struct DeltaBitPackDecoder { + bit_reader: BitReader, + initialized: bool, + + // Header info + num_values: usize, + num_mini_blocks: i64, + values_per_mini_block: usize, + values_current_mini_block: usize, + first_value: i64, + first_value_read: bool, + + // Per block info + min_delta: i64, + mini_block_idx: usize, + delta_bit_width: u8, + delta_bit_widths: ByteBuffer, + deltas_in_mini_block: Vec, // eagerly loaded deltas for a mini block + use_batch: bool, + + current_value: i64, + + _phantom: PhantomData, +} + +impl DeltaBitPackDecoder { + /// Creates new delta bit packed decoder. + pub fn new() -> Self { + Self { + bit_reader: BitReader::from(vec![]), + initialized: false, + num_values: 0, + num_mini_blocks: 0, + values_per_mini_block: 0, + values_current_mini_block: 0, + first_value: 0, + first_value_read: false, + min_delta: 0, + mini_block_idx: 0, + delta_bit_width: 0, + delta_bit_widths: ByteBuffer::new(), + deltas_in_mini_block: vec![], + use_batch: mem::size_of::() == 4, + current_value: 0, + _phantom: PhantomData, + } + } + + /// Returns underlying bit reader offset. + pub fn get_offset(&self) -> usize { + assert!(self.initialized, "Bit reader is not initialized"); + self.bit_reader.get_byte_offset() + } + + /// Initializes new mini block. + #[inline] + fn init_block(&mut self) -> Result<()> { + self.min_delta = self + .bit_reader + .get_zigzag_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'min_delta'"))?; + + let mut widths = vec![]; + for _ in 0..self.num_mini_blocks { + let w = self + .bit_reader + .get_aligned::(1) + .ok_or(eof_err!("Not enough data to decode 'width'"))?; + widths.push(w); + } + + self.delta_bit_widths.set_data(widths); + self.mini_block_idx = 0; + self.delta_bit_width = self.delta_bit_widths.data()[0]; + self.values_current_mini_block = self.values_per_mini_block; + Ok(()) + } + + /// Loads delta into mini block. 
+ #[inline] + fn load_deltas_in_mini_block(&mut self) -> Result<()> { + self.deltas_in_mini_block.clear(); + if self.use_batch { + self.deltas_in_mini_block + .resize(self.values_current_mini_block, T::T::default()); + let loaded = self.bit_reader.get_batch::( + &mut self.deltas_in_mini_block[..], + self.delta_bit_width as usize, + ); + assert!(loaded == self.values_current_mini_block); + } else { + for _ in 0..self.values_current_mini_block { + // TODO: load one batch at a time similar to int32 + let delta = self + .bit_reader + .get_value::(self.delta_bit_width as usize) + .ok_or(eof_err!("Not enough data to decode 'delta'"))?; + self.deltas_in_mini_block.push(delta); + } + } + + Ok(()) + } +} + +impl Decoder for DeltaBitPackDecoder { + // # of total values is derived from encoding + #[inline] + default fn set_data(&mut self, data: ByteBufferPtr, _: usize) -> Result<()> { + self.bit_reader = BitReader::new(data); + self.initialized = true; + + let block_size = self + .bit_reader + .get_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'block_size'"))?; + self.num_mini_blocks = self + .bit_reader + .get_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'num_mini_blocks'"))?; + self.num_values = + self.bit_reader + .get_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'num_values'"))? as usize; + self.first_value = self + .bit_reader + .get_zigzag_vlq_int() + .ok_or(eof_err!("Not enough data to decode 'first_value'"))?; + + // Reset decoding state + self.first_value_read = false; + self.mini_block_idx = 0; + self.delta_bit_widths.clear(); + self.values_current_mini_block = 0; + + self.values_per_mini_block = (block_size / self.num_mini_blocks) as usize; + assert!(self.values_per_mini_block % 8 == 0); + + Ok(()) + } + + default fn get(&mut self, buffer: &mut [T::T]) -> Result { + assert!(self.initialized, "Bit reader is not initialized"); + + let num_values = cmp::min(buffer.len(), self.num_values); + for i in 0..num_values { + if !self.first_value_read { + self.set_decoded_value(buffer, i, self.first_value); + self.current_value = self.first_value; + self.first_value_read = true; + continue; + } + + if self.values_current_mini_block == 0 { + self.mini_block_idx += 1; + if self.mini_block_idx < self.delta_bit_widths.size() { + self.delta_bit_width = self.delta_bit_widths.data()[self.mini_block_idx]; + self.values_current_mini_block = self.values_per_mini_block; + } else { + self.init_block()?; + } + self.load_deltas_in_mini_block()?; + } + + // we decrement values in current mini block, so we need to invert index for delta + let delta = + self.get_delta(self.deltas_in_mini_block.len() - self.values_current_mini_block); + // It is OK for deltas to contain "overflowed" values after encoding, + // e.g. i64::MAX - i64::MIN, so we use `wrapping_add` to "overflow" again and + // restore original value. + self.current_value = self.current_value.wrapping_add(self.min_delta); + self.current_value = self.current_value.wrapping_add(delta as i64); + self.set_decoded_value(buffer, i, self.current_value); + self.values_current_mini_block -= 1; + } + + self.num_values -= num_values; + Ok(num_values) + } + + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_BINARY_PACKED + } +} + +/// Helper trait to define specific conversions when decoding values +trait DeltaBitPackDecoderConversion { + /// Sets decoded value based on type `T`. 
+ #[inline] + fn get_delta(&self, index: usize) -> i64; + + #[inline] + fn set_decoded_value(&self, buffer: &mut [T::T], index: usize, value: i64); +} + +impl DeltaBitPackDecoderConversion for DeltaBitPackDecoder { + #[inline] + default fn get_delta(&self, _: usize) -> i64 { + panic!("DeltaBitPackDecoder only supports Int32Type and Int64Type") + } + + #[inline] + default fn set_decoded_value(&self, _: &mut [T::T], _: usize, _: i64) { + panic!("DeltaBitPackDecoder only supports Int32Type and Int64Type") + } +} + +impl DeltaBitPackDecoderConversion for DeltaBitPackDecoder { + #[inline] + fn get_delta(&self, index: usize) -> i64 { + self.deltas_in_mini_block[index] as i64 + } + + #[inline] + fn set_decoded_value(&self, buffer: &mut [i32], index: usize, value: i64) { + buffer[index] = value as i32; + } +} + +impl DeltaBitPackDecoderConversion for DeltaBitPackDecoder { + #[inline] + fn get_delta(&self, index: usize) -> i64 { + self.deltas_in_mini_block[index] + } + + #[inline] + fn set_decoded_value(&self, buffer: &mut [i64], index: usize, value: i64) { + buffer[index] = value; + } +} + +// ---------------------------------------------------------------------- +// DELTA_LENGTH_BYTE_ARRAY Decoding + +/// Delta length byte array decoder. +/// Only applied to byte arrays to separate the length values and the data, the lengths +/// are encoded using DELTA_BINARY_PACKED encoding. +/// See [`DeltaLengthByteArrayEncoder`](`::encoding::DeltaLengthByteArrayEncoder`) +/// for more information. +pub struct DeltaLengthByteArrayDecoder { + // Lengths for each byte array in `data` + // TODO: add memory tracker to this + lengths: Vec, + + // Current index into `lengths` + current_idx: usize, + + // Concatenated byte array data + data: Option, + + // Offset into `data`, always point to the beginning of next byte array. + offset: usize, + + // Number of values left in this decoder stream + num_values: usize, + + // Placeholder to allow `T` as generic parameter + _phantom: PhantomData, +} + +impl DeltaLengthByteArrayDecoder { + /// Creates new delta length byte array decoder. 
+ pub fn new() -> Self { + Self { + lengths: vec![], + current_idx: 0, + data: None, + offset: 0, + num_values: 0, + _phantom: PhantomData, + } + } +} + +impl Decoder for DeltaLengthByteArrayDecoder { + default fn set_data(&mut self, _: ByteBufferPtr, _: usize) -> Result<()> { + Err(general_err!( + "DeltaLengthByteArrayDecoder only support ByteArrayType" + )) + } + + default fn get(&mut self, _: &mut [T::T]) -> Result { + Err(general_err!( + "DeltaLengthByteArrayDecoder only support ByteArrayType" + )) + } + + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } +} + +impl Decoder for DeltaLengthByteArrayDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + let mut len_decoder = DeltaBitPackDecoder::::new(); + len_decoder.set_data(data.all(), num_values)?; + let num_lengths = len_decoder.values_left(); + self.lengths.resize(num_lengths, 0); + len_decoder.get(&mut self.lengths[..])?; + + self.data = Some(data.start_from(len_decoder.get_offset())); + self.offset = 0; + self.current_idx = 0; + self.num_values = num_lengths; + Ok(()) + } + + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + assert!(self.data.is_some()); + + let data = self.data.as_ref().unwrap(); + let num_values = cmp::min(buffer.len(), self.num_values); + for i in 0..num_values { + let len = self.lengths[self.current_idx] as usize; + buffer[i].set_data(data.range(self.offset, len)); + self.offset += len; + self.current_idx += 1; + } + + self.num_values -= num_values; + Ok(num_values) + } +} + +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY Decoding + +/// Delta byte array decoder. +/// Prefix lengths are encoded using `DELTA_BINARY_PACKED` encoding, Suffixes are stored +/// using `DELTA_LENGTH_BYTE_ARRAY` encoding. +/// See [`DeltaByteArrayEncoder`](`::encoding::DeltaByteArrayEncoder`) for more +/// information. +pub struct DeltaByteArrayDecoder { + // Prefix lengths for each byte array + // TODO: add memory tracker to this + prefix_lengths: Vec, + + // The current index into `prefix_lengths`, + current_idx: usize, + + // Decoder for all suffixes, the # of which should be the same as + // `prefix_lengths.len()` + suffix_decoder: Option>, + + // The last byte array, used to derive the current prefix + previous_value: Vec, + + // Number of values left + num_values: usize, + + // Placeholder to allow `T` as generic parameter + _phantom: PhantomData, +} + +impl DeltaByteArrayDecoder { + /// Creates new delta byte array decoder. 
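The ByteArrayType implementation below rebuilds each value by copying `prefix_len` bytes from the previously decoded value and appending the suffix read from the embedded DELTA_LENGTH_BYTE_ARRAY stream. A standalone sketch of that reconstruction step (illustrative only, not the patch's API):

```rust
/// Reconstructs values from (prefix_len, suffix) pairs, carrying the previous
/// value forward exactly like `DeltaByteArrayDecoder::get` does.
fn rebuild_prefixed(pairs: &[(usize, &[u8])]) -> Vec<Vec<u8>> {
    let mut previous: Vec<u8> = Vec::new();
    let mut out = Vec::with_capacity(pairs.len());
    for &(prefix_len, suffix) in pairs {
        let mut value = Vec::with_capacity(prefix_len + suffix.len());
        value.extend_from_slice(&previous[..prefix_len]);
        value.extend_from_slice(suffix);
        previous = value.clone();
        out.push(value);
    }
    out
}

fn main() {
    let decoded = rebuild_prefixed(&[
        (0, &b"parquet-rs"[..]),
        (8, &b"format"[..]),
        (8, &b"mr"[..]),
    ]);
    assert_eq!(decoded[1], b"parquet-format".to_vec());
    assert_eq!(decoded[2], b"parquet-mr".to_vec());
}
```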
+ pub fn new() -> Self { + Self { + prefix_lengths: vec![], + current_idx: 0, + suffix_decoder: None, + previous_value: vec![], + num_values: 0, + _phantom: PhantomData, + } + } +} + +impl<'m, T: DataType> Decoder for DeltaByteArrayDecoder { + default fn set_data(&mut self, _: ByteBufferPtr, _: usize) -> Result<()> { + Err(general_err!( + "DeltaByteArrayDecoder only supports ByteArrayType and FixedLenByteArrayType" + )) + } + + default fn get(&mut self, _: &mut [T::T]) -> Result { + Err(general_err!( + "DeltaByteArrayDecoder only supports ByteArrayType and FixedLenByteArrayType" + )) + } + + fn values_left(&self) -> usize { + self.num_values + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_BYTE_ARRAY + } +} + +impl Decoder for DeltaByteArrayDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + let mut prefix_len_decoder = DeltaBitPackDecoder::::new(); + prefix_len_decoder.set_data(data.all(), num_values)?; + let num_prefixes = prefix_len_decoder.values_left(); + self.prefix_lengths.resize(num_prefixes, 0); + prefix_len_decoder.get(&mut self.prefix_lengths[..])?; + + let mut suffix_decoder = DeltaLengthByteArrayDecoder::new(); + suffix_decoder.set_data(data.start_from(prefix_len_decoder.get_offset()), num_values)?; + self.suffix_decoder = Some(suffix_decoder); + self.num_values = num_prefixes; + self.current_idx = 0; + self.previous_value.clear(); + Ok(()) + } + + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + assert!(self.suffix_decoder.is_some()); + + let num_values = cmp::min(buffer.len(), self.num_values); + let mut v: [ByteArray; 1] = [ByteArray::new(); 1]; + for i in 0..num_values { + // Process suffix + // TODO: this is awkward - maybe we should add a non-vectorized API? + let suffix_decoder = self.suffix_decoder.as_mut().unwrap(); + suffix_decoder.get(&mut v[..])?; + let suffix = v[0].data(); + + // Extract current prefix length, can be 0 + let prefix_len = self.prefix_lengths[self.current_idx] as usize; + + // Concatenate prefix with suffix + let mut result = Vec::new(); + result.extend_from_slice(&self.previous_value[0..prefix_len]); + result.extend_from_slice(suffix); + + let data = ByteBufferPtr::new(result.clone()); + buffer[i].set_data(data); + self.previous_value = result; + self.current_idx += 1; + } + + self.num_values -= num_values; + Ok(num_values) + } +} + +impl Decoder for DeltaByteArrayDecoder { + fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + let s: &mut DeltaByteArrayDecoder = unsafe { mem::transmute(self) }; + s.set_data(data, num_values) + } + + fn get(&mut self, buffer: &mut [ByteArray]) -> Result { + let s: &mut DeltaByteArrayDecoder = unsafe { mem::transmute(self) }; + s.get(buffer) + } +} + +#[cfg(test)] +mod tests { + use super::{super::encoding::*, *}; + + use std::{mem, rc::Rc}; + + use crate::parquet::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, + }; + use crate::parquet::util::{bit_util::set_array_bit, memory::MemTracker, test_common::RandGen}; + + #[test] + fn test_get_decoders() { + // supported encodings + create_and_check_decoder::(Encoding::PLAIN, None); + create_and_check_decoder::(Encoding::DELTA_BINARY_PACKED, None); + create_and_check_decoder::(Encoding::DELTA_LENGTH_BYTE_ARRAY, None); + create_and_check_decoder::(Encoding::DELTA_BYTE_ARRAY, None); + create_and_check_decoder::(Encoding::RLE, None); + + // error when initializing + create_and_check_decoder::( + Encoding::RLE_DICTIONARY, + Some(general_err!( + "Cannot 
initialize this encoding through this function" + )), + ); + create_and_check_decoder::( + Encoding::PLAIN_DICTIONARY, + Some(general_err!( + "Cannot initialize this encoding through this function" + )), + ); + + // unsupported + create_and_check_decoder::( + Encoding::BIT_PACKED, + Some(nyi_err!("Encoding BIT_PACKED is not supported")), + ); + } + + #[test] + fn test_plain_decode_int32() { + let data = vec![42, 18, 52]; + let data_bytes = Int32Type::to_byte_array(&data[..]); + let mut buffer = vec![0; 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_int64() { + let data = vec![42, 18, 52]; + let data_bytes = Int64Type::to_byte_array(&data[..]); + let mut buffer = vec![0; 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_float() { + let data = vec![3.14, 2.414, 12.51]; + let data_bytes = FloatType::to_byte_array(&data[..]); + let mut buffer = vec![0.0; 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_double() { + let data = vec![3.14f64, 2.414f64, 12.51f64]; + let data_bytes = DoubleType::to_byte_array(&data[..]); + let mut buffer = vec![0.0f64; 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_int96() { + let mut data = vec![Int96::new(); 4]; + data[0].set_data(11, 22, 33); + data[1].set_data(44, 55, 66); + data[2].set_data(10, 20, 30); + data[3].set_data(40, 50, 60); + let data_bytes = Int96Type::to_byte_array(&data[..]); + let mut buffer = vec![Int96::new(); 4]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 4, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_bool() { + let data = vec![ + false, true, false, false, true, false, true, true, false, true, + ]; + let data_bytes = BoolType::to_byte_array(&data[..]); + let mut buffer = vec![false; 10]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 10, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_byte_array() { + let mut data = vec![ByteArray::new(); 2]; + data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes())); + data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes())); + let data_bytes = ByteArrayType::to_byte_array(&data[..]); + let mut buffer = vec![ByteArray::new(); 2]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 2, + -1, + &mut buffer[..], + &data[..], + ); + } + + #[test] + fn test_plain_decode_fixed_len_byte_array() { + let mut data = vec![ByteArray::default(); 3]; + data[0].set_data(ByteBufferPtr::new(String::from("bird").into_bytes())); + data[1].set_data(ByteBufferPtr::new(String::from("come").into_bytes())); + data[2].set_data(ByteBufferPtr::new(String::from("flow").into_bytes())); + let data_bytes = FixedLenByteArrayType::to_byte_array(&data[..]); + let mut buffer = vec![ByteArray::default(); 3]; + test_plain_decode::( + ByteBufferPtr::new(data_bytes), + 3, + 4, + &mut buffer[..], + &data[..], + ); + } + + #[test] + #[should_panic(expected = "RleValueEncoder only supports BoolType")] + fn test_rle_value_encode_int32_not_supported() { + let mut encoder = RleValueEncoder::::new(); + encoder.put(&vec![1, 2, 3, 4]).unwrap(); + } + + #[test] + #[should_panic(expected = "RleValueDecoder only supports 
BoolType")] + fn test_rle_value_decode_int32_not_supported() { + let mut decoder = RleValueDecoder::::new(); + decoder + .set_data(ByteBufferPtr::new(vec![5, 0, 0, 0]), 1) + .unwrap(); + } + + #[test] + fn test_rle_value_decode_bool_decode() { + // Test multiple 'put' calls on the same encoder + let data = vec![ + BoolType::gen_vec(-1, 256), + BoolType::gen_vec(-1, 257), + BoolType::gen_vec(-1, 126), + ]; + test_rle_value_decode::(data); + } + + #[test] + #[should_panic(expected = "Bit reader is not initialized")] + fn test_delta_bit_packed_not_initialized_offset() { + // Fail if set_data() is not called before get_offset() + let decoder = DeltaBitPackDecoder::::new(); + decoder.get_offset(); + } + + #[test] + #[should_panic(expected = "Bit reader is not initialized")] + fn test_delta_bit_packed_not_initialized_get() { + // Fail if set_data() is not called before get() + let mut decoder = DeltaBitPackDecoder::::new(); + let mut buffer = vec![]; + decoder.get(&mut buffer).unwrap(); + } + + #[test] + fn test_delta_bit_packed_int32_empty() { + let data = vec![vec![0; 0]]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int32_repeat() { + let block_data = vec![ + 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, + 6, 7, 8, + ]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_uneven() { + let block_data = vec![1, -2, 3, -4, 5, 6, 7, 8, 9, 10, 11]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_same_values() { + let block_data = vec![ + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + ]; + test_delta_bit_packed_decode::(vec![block_data]); + + let block_data = vec![ + -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, -127, + -127, -127, + ]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_min_max() { + let block_data = vec![ + i32::min_value(), + i32::max_value(), + i32::min_value(), + i32::max_value(), + i32::min_value(), + i32::max_value(), + i32::min_value(), + i32::max_value(), + ]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int32_multiple_blocks() { + // Test multiple 'put' calls on the same encoder + let data = vec![ + Int32Type::gen_vec(-1, 64), + Int32Type::gen_vec(-1, 128), + Int32Type::gen_vec(-1, 64), + ]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int32_data_across_blocks() { + // Test multiple 'put' calls on the same encoder + let data = vec![Int32Type::gen_vec(-1, 256), Int32Type::gen_vec(-1, 257)]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int32_with_empty_blocks() { + let data = vec![ + Int32Type::gen_vec(-1, 128), + vec![0; 0], + Int32Type::gen_vec(-1, 64), + ]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int64_empty() { + let data = vec![vec![0; 0]]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_int64_min_max() { + let block_data = vec![ + i64::min_value(), + i64::max_value(), + i64::min_value(), + i64::max_value(), + i64::min_value(), + i64::max_value(), + i64::min_value(), + i64::max_value(), + ]; + test_delta_bit_packed_decode::(vec![block_data]); + } + + #[test] + fn test_delta_bit_packed_int64_multiple_blocks() { + // Test multiple 'put' calls on the same encoder + let data = vec![ 
+ Int64Type::gen_vec(-1, 64), + Int64Type::gen_vec(-1, 128), + Int64Type::gen_vec(-1, 64), + ]; + test_delta_bit_packed_decode::(data); + } + + #[test] + fn test_delta_bit_packed_decoder_sample() { + let data_bytes = vec![ + 128, 1, 4, 3, 58, 28, 6, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; + let buffer = ByteBufferPtr::new(data_bytes); + let mut decoder: DeltaBitPackDecoder = DeltaBitPackDecoder::new(); + decoder.set_data(buffer, 3).unwrap(); + // check exact offsets, because when reading partial values we end up with + // some data not being read from bit reader + assert_eq!(decoder.get_offset(), 5); + let mut result = vec![0, 0, 0]; + decoder.get(&mut result).unwrap(); + assert_eq!(decoder.get_offset(), 34); + assert_eq!(result, vec![29, 43, 89]); + } + + #[test] + fn test_delta_byte_array_same_arrays() { + let data = vec![ + vec![ByteArray::from(vec![1, 2, 3, 4, 5, 6])], + vec![ + ByteArray::from(vec![1, 2, 3, 4, 5, 6]), + ByteArray::from(vec![1, 2, 3, 4, 5, 6]), + ], + vec![ + ByteArray::from(vec![1, 2, 3, 4, 5, 6]), + ByteArray::from(vec![1, 2, 3, 4, 5, 6]), + ], + ]; + test_delta_byte_array_decode(data); + } + + #[test] + fn test_delta_byte_array_unique_arrays() { + let data = vec![ + vec![ByteArray::from(vec![1])], + vec![ByteArray::from(vec![2, 3]), ByteArray::from(vec![4, 5, 6])], + vec![ + ByteArray::from(vec![7, 8]), + ByteArray::from(vec![9, 0, 1, 2]), + ], + ]; + test_delta_byte_array_decode(data); + } + + #[test] + fn test_delta_byte_array_single_array() { + let data = vec![vec![ByteArray::from(vec![1, 2, 3, 4, 5, 6])]]; + test_delta_byte_array_decode(data); + } + + fn test_plain_decode( + data: ByteBufferPtr, + num_values: usize, + type_length: i32, + buffer: &mut [T::T], + expected: &[T::T], + ) { + let mut decoder: PlainDecoder = PlainDecoder::new(type_length); + let result = decoder.set_data(data, num_values); + assert!(result.is_ok()); + let result = decoder.get(&mut buffer[..]); + assert!(result.is_ok()); + assert_eq!(decoder.values_left(), 0); + assert_eq!(buffer, expected); + } + + fn test_rle_value_decode(data: Vec>) { + test_encode_decode::(data, Encoding::RLE); + } + + fn test_delta_bit_packed_decode(data: Vec>) { + test_encode_decode::(data, Encoding::DELTA_BINARY_PACKED); + } + + fn test_delta_byte_array_decode(data: Vec>) { + test_encode_decode::(data, Encoding::DELTA_BYTE_ARRAY); + } + + // Input data represents vector of data slices to write (test multiple `put()` calls) + // For example, + // vec![vec![1, 2, 3]] invokes `put()` once and writes {1, 2, 3} + // vec![vec![1, 2], vec![3]] invokes `put()` twice and writes {1, 2, 3} + fn test_encode_decode(data: Vec>, encoding: Encoding) { + // Type length should not really matter for encode/decode test, + // otherwise change it based on type + let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type()); + + // Encode data + let mut encoder = get_encoder::(col_descr.clone(), encoding, Rc::new(MemTracker::new())) + .expect("get encoder"); + + for v in &data[..] 
{ + encoder.put(&v[..]).expect("ok to encode"); + } + let bytes = encoder.flush_buffer().expect("ok to flush buffer"); + + // Flatten expected data as contiguous array of values + let expected: Vec = data.iter().flat_map(|s| s.clone()).collect(); + + // Decode data and compare with original + let mut decoder = get_decoder::(col_descr.clone(), encoding).expect("get decoder"); + + let mut result = vec![T::T::default(); expected.len()]; + decoder + .set_data(bytes, expected.len()) + .expect("ok to set data"); + let mut result_num_values = 0; + while decoder.values_left() > 0 { + result_num_values += decoder + .get(&mut result[result_num_values..]) + .expect("ok to decode"); + } + assert_eq!(result_num_values, expected.len()); + assert_eq!(result, expected); + } + + fn create_and_check_decoder(encoding: Encoding, err: Option) { + let descr = create_test_col_desc_ptr(-1, T::get_physical_type()); + let decoder = get_decoder::(descr, encoding); + match err { + Some(parquet_error) => { + assert!(decoder.is_err()); + assert_eq!(decoder.err().unwrap(), parquet_error); + } + None => { + assert!(decoder.is_ok()); + assert_eq!(decoder.unwrap().encoding(), encoding); + } + } + } + + // Creates test column descriptor. + fn create_test_col_desc_ptr(type_len: i32, t: Type) -> ColumnDescPtr { + let ty = SchemaType::primitive_type_builder("t", t) + .with_length(type_len) + .build() + .unwrap(); + Rc::new(ColumnDescriptor::new( + Rc::new(ty), + None, + 0, + 0, + ColumnPath::new(vec![]), + )) + } + + fn usize_to_bytes(v: usize) -> [u8; 4] { + unsafe { mem::transmute::(v as u32) } + } + + /// A util trait to convert slices of different types to byte arrays + trait ToByteArray { + fn to_byte_array(data: &[T::T]) -> Vec; + } + + impl ToByteArray for T + where + T: DataType, + { + default fn to_byte_array(data: &[T::T]) -> Vec { + let mut v = vec![]; + let type_len = ::std::mem::size_of::(); + v.extend_from_slice(unsafe { + ::std::slice::from_raw_parts(data.as_ptr() as *const u8, data.len() * type_len) + }); + v + } + } + + impl ToByteArray for BoolType { + fn to_byte_array(data: &[bool]) -> Vec { + let mut v = vec![]; + for i in 0..data.len() { + if i % 8 == 0 { + v.push(0); + } + if data[i] { + set_array_bit(&mut v[..], i); + } + } + v + } + } + + impl ToByteArray for Int96Type { + fn to_byte_array(data: &[Int96]) -> Vec { + let mut v = vec![]; + for d in data { + unsafe { + let copy = ::std::slice::from_raw_parts(d.data().as_ptr() as *const u8, 12); + v.extend_from_slice(copy); + }; + } + v + } + } + + impl ToByteArray for ByteArrayType { + fn to_byte_array(data: &[ByteArray]) -> Vec { + let mut v = vec![]; + for d in data { + let buf = d.data(); + let len = &usize_to_bytes(buf.len()); + v.extend_from_slice(len); + v.extend(buf); + } + v + } + } + + impl ToByteArray for FixedLenByteArrayType { + fn to_byte_array(data: &[ByteArray]) -> Vec { + let mut v = vec![]; + for d in data { + let buf = d.data(); + v.extend(buf); + } + v + } + } +} diff --git a/rust/src/parquet/encodings/encoding.rs b/rust/src/parquet/encodings/encoding.rs new file mode 100644 index 0000000000000..cecb03cb540a9 --- /dev/null +++ b/rust/src/parquet/encodings/encoding.rs @@ -0,0 +1,1360 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains all supported encoders for Parquet. + +use std::{cmp, io::Write, marker::PhantomData, mem, slice}; + +use crate::parquet::basic::*; +use crate::parquet::data_type::*; +use crate::parquet::encodings::rle::RleEncoder; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::ColumnDescPtr; +use crate::parquet::util::{ + bit_util::{log2, num_required_bits, BitWriter}, + hash_util, + memory::{Buffer, ByteBuffer, ByteBufferPtr, MemTrackerPtr}, +}; + +// ---------------------------------------------------------------------- +// Encoders + +/// An Parquet encoder for the data type `T`. +/// +/// Currently this allocates internal buffers for the encoded values. After done putting +/// values, caller should call `flush_buffer()` to get an immutable buffer pointer. +pub trait Encoder { + /// Encodes data from `values`. + fn put(&mut self, values: &[T::T]) -> Result<()>; + + /// Returns the encoding type of this encoder. + fn encoding(&self) -> Encoding; + + /// Returns an estimate of the encoded data, in bytes. + /// Method call must be O(1). + fn estimated_data_encoded_size(&self) -> usize; + + /// Flushes the underlying byte buffer that's being processed by this encoder, and + /// return the immutable copy of it. This will also reset the internal state. + fn flush_buffer(&mut self) -> Result; +} + +/// Gets a encoder for the particular data type `T` and encoding `encoding`. Memory usage +/// for the encoder instance is tracked by `mem_tracker`. +pub fn get_encoder( + desc: ColumnDescPtr, + encoding: Encoding, + mem_tracker: MemTrackerPtr, +) -> Result>> { + let encoder: Box> = match encoding { + Encoding::PLAIN => Box::new(PlainEncoder::new(desc, mem_tracker, vec![])), + Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => { + return Err(general_err!( + "Cannot initialize this encoding through this function" + )); + } + Encoding::RLE => Box::new(RleValueEncoder::new()), + Encoding::DELTA_BINARY_PACKED => Box::new(DeltaBitPackEncoder::new()), + Encoding::DELTA_LENGTH_BYTE_ARRAY => Box::new(DeltaLengthByteArrayEncoder::new()), + Encoding::DELTA_BYTE_ARRAY => Box::new(DeltaByteArrayEncoder::new()), + e => return Err(nyi_err!("Encoding {} is not supported", e)), + }; + Ok(encoder) +} + +// ---------------------------------------------------------------------- +// Plain encoding + +/// Plain encoding that supports all types. +/// Values are encoded back to back. +/// The plain encoding is used whenever a more efficient encoding can not be used. +/// It stores the data in the following format: +/// - BOOLEAN - 1 bit per value, 0 is false; 1 is true. +/// - INT32 - 4 bytes per value, stored as little-endian. +/// - INT64 - 8 bytes per value, stored as little-endian. +/// - FLOAT - 4 bytes per value, stored as IEEE little-endian. +/// - DOUBLE - 8 bytes per value, stored as IEEE little-endian. +/// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. 
+/// - FIXED_LEN_BYTE_ARRAY - just the bytes are stored. +pub struct PlainEncoder { + buffer: ByteBuffer, + bit_writer: BitWriter, + desc: ColumnDescPtr, + _phantom: PhantomData, +} + +impl PlainEncoder { + /// Creates new plain encoder. + pub fn new(desc: ColumnDescPtr, mem_tracker: MemTrackerPtr, vec: Vec) -> Self { + let mut byte_buffer = ByteBuffer::new().with_mem_tracker(mem_tracker); + byte_buffer.set_data(vec); + Self { + buffer: byte_buffer, + bit_writer: BitWriter::new(256), + desc, + _phantom: PhantomData, + } + } +} + +impl Encoder for PlainEncoder { + default fn put(&mut self, values: &[T::T]) -> Result<()> { + let bytes = unsafe { + slice::from_raw_parts( + values as *const [T::T] as *const u8, + mem::size_of::() * values.len(), + ) + }; + self.buffer.write(bytes)?; + Ok(()) + } + + fn encoding(&self) -> Encoding { + Encoding::PLAIN + } + + fn estimated_data_encoded_size(&self) -> usize { + self.buffer.size() + self.bit_writer.bytes_written() + } + + #[inline] + default fn flush_buffer(&mut self) -> Result { + self.buffer.write(self.bit_writer.flush_buffer())?; + self.buffer.flush()?; + self.bit_writer.clear(); + + Ok(self.buffer.consume()) + } +} + +impl Encoder for PlainEncoder { + fn put(&mut self, values: &[bool]) -> Result<()> { + for v in values { + self.bit_writer.put_value(*v as u64, 1); + } + Ok(()) + } +} + +impl Encoder for PlainEncoder { + fn put(&mut self, values: &[Int96]) -> Result<()> { + for v in values { + self.buffer.write(v.as_bytes())?; + } + self.buffer.flush()?; + Ok(()) + } +} + +impl Encoder for PlainEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + for v in values { + self.buffer.write(&(v.len().to_le() as u32).as_bytes())?; + self.buffer.write(v.data())?; + } + self.buffer.flush()?; + Ok(()) + } +} + +impl Encoder for PlainEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + for v in values { + self.buffer.write(v.data())?; + } + self.buffer.flush()?; + Ok(()) + } +} + +// ---------------------------------------------------------------------- +// Dictionary encoding + +const INITIAL_HASH_TABLE_SIZE: usize = 1024; +const MAX_HASH_LOAD: f32 = 0.7; +const HASH_SLOT_EMPTY: i32 = -1; + +/// Dictionary encoder. +/// The dictionary encoding builds a dictionary of values encountered in a given column. +/// The dictionary page is written first, before the data pages of the column chunk. +/// +/// Dictionary page format: the entries in the dictionary - in dictionary order - +/// using the plain encoding. +/// +/// Data page format: the bit width used to encode the entry ids stored as 1 byte +/// (max bit width = 32), followed by the values encoded using RLE/Bit packed described +/// above (with the given bit width). +pub struct DictEncoder { + // Descriptor for the column to be encoded. + desc: ColumnDescPtr, + + // Size of the table. **Must be** a power of 2. + hash_table_size: usize, + + // Store `hash_table_size` - 1, so that `j & mod_bitmask` is equivalent to + // `j % hash_table_size`, but uses far fewer CPU cycles. + mod_bitmask: u32, + + // Stores indices which map (many-to-one) to the values in the `uniques` array. + // Here we are using fix-sized array with linear probing. + // A slot with `HASH_SLOT_EMPTY` indicates the slot is not currently occupied. + hash_slots: Buffer, + + // Indices that have not yet be written out by `write_indices()`. + buffered_indices: Buffer, + + // The unique observed values. + uniques: Buffer, + + // Size in bytes needed to encode this dictionary. 
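The per-type PLAIN layouts implemented above boil down to values written back to back in little-endian byte order, with BYTE_ARRAY adding a 4-byte little-endian length prefix. A quick standard-library sketch of the byte layout for an INT32 column and a BYTE_ARRAY column (assumed helper names, not the patch's API):

```rust
fn plain_encode_i32(values: &[i32]) -> Vec<u8> {
    let mut out = Vec::with_capacity(values.len() * 4);
    for v in values {
        // INT32: 4 bytes per value, little-endian.
        out.extend_from_slice(&v.to_le_bytes());
    }
    out
}

fn plain_encode_byte_array(values: &[&[u8]]) -> Vec<u8> {
    let mut out = Vec::new();
    for v in values {
        // BYTE_ARRAY: 4-byte little-endian length, then the raw bytes.
        out.extend_from_slice(&(v.len() as u32).to_le_bytes());
        out.extend_from_slice(v);
    }
    out
}

fn main() {
    assert_eq!(plain_encode_i32(&[1, 256]), vec![1, 0, 0, 0, 0, 1, 0, 0]);
    assert_eq!(
        plain_encode_byte_array(&[&b"hi"[..]]),
        vec![2, 0, 0, 0, b'h', b'i']
    );
}
```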
+ uniques_size_in_bytes: usize, + + // Tracking memory usage for the various data structures in this struct. + mem_tracker: MemTrackerPtr, +} + +impl DictEncoder { + /// Creates new dictionary encoder. + pub fn new(desc: ColumnDescPtr, mem_tracker: MemTrackerPtr) -> Self { + let mut slots = Buffer::new().with_mem_tracker(mem_tracker.clone()); + slots.resize(INITIAL_HASH_TABLE_SIZE, -1); + Self { + desc, + hash_table_size: INITIAL_HASH_TABLE_SIZE, + mod_bitmask: (INITIAL_HASH_TABLE_SIZE - 1) as u32, + hash_slots: slots, + buffered_indices: Buffer::new().with_mem_tracker(mem_tracker.clone()), + uniques: Buffer::new().with_mem_tracker(mem_tracker.clone()), + uniques_size_in_bytes: 0, + mem_tracker, + } + } + + /// Returns true if dictionary entries are sorted, false otherwise. + #[inline] + pub fn is_sorted(&self) -> bool { + // Sorting is not supported currently. + false + } + + /// Returns number of unique values (keys) in the dictionary. + pub fn num_entries(&self) -> usize { + self.uniques.size() + } + + /// Returns size of unique values (keys) in the dictionary, in bytes. + pub fn dict_encoded_size(&self) -> usize { + self.uniques_size_in_bytes + } + + /// Writes out the dictionary values with PLAIN encoding in a byte buffer, and return + /// the result. + #[inline] + pub fn write_dict(&self) -> Result { + let mut plain_encoder = + PlainEncoder::::new(self.desc.clone(), self.mem_tracker.clone(), vec![]); + plain_encoder.put(self.uniques.data())?; + plain_encoder.flush_buffer() + } + + /// Writes out the dictionary values with RLE encoding in a byte buffer, and return the + /// result. + #[inline] + pub fn write_indices(&mut self) -> Result { + // TODO: the caller should allocate the buffer + let buffer_len = self.estimated_data_encoded_size(); + let mut buffer: Vec = vec![0; buffer_len as usize]; + buffer[0] = self.bit_width() as u8; + self.mem_tracker.alloc(buffer.capacity() as i64); + + // Write bit width in the first byte + buffer.write((self.bit_width() as u8).as_bytes())?; + let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer, 1); + for index in self.buffered_indices.data() { + if !encoder.put(*index as u64)? 
{ + return Err(general_err!("Encoder doesn't have enough space")); + } + } + self.buffered_indices.clear(); + Ok(ByteBufferPtr::new(encoder.consume()?)) + } + + #[inline] + fn put_one(&mut self, value: &T::T) -> Result<()> { + let mut j = (hash_util::hash(value, 0) & self.mod_bitmask) as usize; + let mut index = self.hash_slots[j]; + + while index != HASH_SLOT_EMPTY && self.uniques[index as usize] != *value { + j += 1; + if j == self.hash_table_size { + j = 0; + } + index = self.hash_slots[j]; + } + + if index == HASH_SLOT_EMPTY { + index = self.uniques.size() as i32; + self.hash_slots[j] = index; + self.add_dict_key(value.clone()); + + if self.uniques.size() > (self.hash_table_size as f32 * MAX_HASH_LOAD) as usize { + self.double_table_size(); + } + } + + self.buffered_indices.push(index); + Ok(()) + } + + #[inline] + fn add_dict_key(&mut self, value: T::T) { + self.uniques_size_in_bytes += self.get_encoded_size(&value); + self.uniques.push(value); + } + + #[inline] + fn bit_width(&self) -> u8 { + let num_entries = self.uniques.size(); + if num_entries == 0 { + 0 + } else if num_entries == 1 { + 1 + } else { + log2(num_entries as u64) as u8 + } + } + + #[inline] + fn double_table_size(&mut self) { + let new_size = self.hash_table_size * 2; + let mut new_hash_slots = Buffer::new().with_mem_tracker(self.mem_tracker.clone()); + new_hash_slots.resize(new_size, HASH_SLOT_EMPTY); + for i in 0..self.hash_table_size { + let index = self.hash_slots[i]; + if index == HASH_SLOT_EMPTY { + continue; + } + let value = &self.uniques[index as usize]; + let mut j = (hash_util::hash(value, 0) & ((new_size - 1) as u32)) as usize; + let mut slot = new_hash_slots[j]; + while slot != HASH_SLOT_EMPTY && self.uniques[slot as usize] != *value { + j += 1; + if j == new_size { + j = 0; + } + slot = new_hash_slots[j]; + } + + new_hash_slots[j] = index; + } + + self.hash_table_size = new_size; + self.mod_bitmask = (new_size - 1) as u32; + mem::replace(&mut self.hash_slots, new_hash_slots); + } +} + +impl Encoder for DictEncoder { + #[inline] + fn put(&mut self, values: &[T::T]) -> Result<()> { + for i in values { + self.put_one(&i)? + } + Ok(()) + } + + #[inline] + fn encoding(&self) -> Encoding { + Encoding::PLAIN_DICTIONARY + } + + #[inline] + fn estimated_data_encoded_size(&self) -> usize { + let bit_width = self.bit_width(); + 1 + RleEncoder::min_buffer_size(bit_width) + + RleEncoder::max_buffer_size(bit_width, self.buffered_indices.size()) + } + + #[inline] + fn flush_buffer(&mut self) -> Result { + self.write_indices() + } +} + +/// Provides encoded size for a data type. +/// This is a workaround to calculate dictionary size in bytes. +trait DictEncodedSize { + #[inline] + fn get_encoded_size(&self, value: &T::T) -> usize; +} + +impl DictEncodedSize for DictEncoder { + #[inline] + default fn get_encoded_size(&self, _: &T::T) -> usize { + mem::size_of::() + } +} + +impl DictEncodedSize for DictEncoder { + #[inline] + fn get_encoded_size(&self, value: &ByteArray) -> usize { + mem::size_of::() + value.len() + } +} + +impl DictEncodedSize for DictEncoder { + #[inline] + fn get_encoded_size(&self, _value: &ByteArray) -> usize { + self.desc.type_length() as usize + } +} + +// ---------------------------------------------------------------------- +// RLE encoding + +const DEFAULT_RLE_BUFFER_LEN: usize = 1024; + +/// RLE/Bit-Packing hybrid encoding for values. +/// Currently is used only for data pages v2 and supports boolean types. 
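Taken together, the dictionary encoder above produces two outputs: `write_dict` emits the unique values as a PLAIN-encoded dictionary page, and `write_indices` emits a data page holding one bit-width byte followed by the RLE/bit-packed indices. A rough hash-map-based sketch of the logical split into uniques, indices, and bit width (it does not reproduce the open-addressing table or the RLE payload):

```rust
use std::collections::HashMap;

/// Produces the two logical outputs of dictionary encoding: the ordered unique
/// values (stored in the dictionary page with PLAIN encoding) and the per-value
/// indices (stored in the data page after the bit-width byte, RLE/bit-packed by
/// the real encoder).
fn dict_encode(values: &[&str]) -> (Vec<String>, Vec<u64>, u8) {
    let mut uniques: Vec<String> = Vec::new();
    let mut seen: HashMap<String, u64> = HashMap::new();
    let mut indices = Vec::with_capacity(values.len());
    for v in values {
        let idx = *seen.entry((*v).to_string()).or_insert_with(|| {
            uniques.push((*v).to_string());
            (uniques.len() - 1) as u64
        });
        indices.push(idx);
    }
    // Bits needed for the largest index, matching the intent of `DictEncoder::bit_width`.
    let max_index = indices.iter().copied().max().unwrap_or(0);
    let bit_width = (64 - max_index.leading_zeros()).max(1) as u8;
    (uniques, indices, bit_width)
}

fn main() {
    let (uniques, indices, bit_width) = dict_encode(&["a", "b", "a", "a"]);
    assert_eq!(uniques, vec!["a".to_string(), "b".to_string()]);
    assert_eq!(indices, vec![0, 1, 0, 0]);
    assert_eq!(bit_width, 1); // the data page begins with this byte, then the packed indices
}
```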
+pub struct RleValueEncoder { + // Buffer with raw values that we collect, + // when flushing buffer they are encoded using RLE encoder + encoder: Option, + _phantom: PhantomData, +} + +impl RleValueEncoder { + /// Creates new rle value encoder. + pub fn new() -> Self { + Self { + encoder: None, + _phantom: PhantomData, + } + } +} + +impl Encoder for RleValueEncoder { + #[inline] + default fn put(&mut self, _values: &[T::T]) -> Result<()> { + panic!("RleValueEncoder only supports BoolType"); + } + + fn encoding(&self) -> Encoding { + Encoding::RLE + } + + #[inline] + default fn estimated_data_encoded_size(&self) -> usize { + match self.encoder { + Some(ref enc) => enc.len(), + None => 0, + } + } + + #[inline] + default fn flush_buffer(&mut self) -> Result { + panic!("RleValueEncoder only supports BoolType"); + } +} + +impl Encoder for RleValueEncoder { + #[inline] + default fn put(&mut self, values: &[bool]) -> Result<()> { + if self.encoder.is_none() { + self.encoder = Some(RleEncoder::new(1, DEFAULT_RLE_BUFFER_LEN)); + } + let rle_encoder = self.encoder.as_mut().unwrap(); + for value in values { + if !rle_encoder.put(*value as u64)? { + return Err(general_err!("RLE buffer is full")); + } + } + Ok(()) + } + + #[inline] + fn flush_buffer(&mut self) -> Result { + assert!( + self.encoder.is_some(), + "RLE value encoder is not initialized" + ); + let rle_encoder = self.encoder.as_mut().unwrap(); + + // Flush all encoder buffers and raw values + let encoded_data = { + let buf = rle_encoder.flush_buffer()?; + + // Note that buf does not have any offset, all data is encoded bytes + let len = (buf.len() as i32).to_le(); + let len_bytes = len.as_bytes(); + let mut encoded_data = Vec::new(); + encoded_data.extend_from_slice(len_bytes); + encoded_data.extend_from_slice(buf); + encoded_data + }; + // Reset rle encoder for the next batch + rle_encoder.clear(); + + Ok(ByteBufferPtr::new(encoded_data)) + } +} + +// ---------------------------------------------------------------------- +// DELTA_BINARY_PACKED encoding + +const MAX_PAGE_HEADER_WRITER_SIZE: usize = 32; +const MAX_BIT_WRITER_SIZE: usize = 10 * 1024 * 1024; +const DEFAULT_BLOCK_SIZE: usize = 128; +const DEFAULT_NUM_MINI_BLOCKS: usize = 4; + +/// Delta bit packed encoder. +/// Consists of a header followed by blocks of delta encoded values binary packed. +/// +/// Delta-binary-packing: +/// ```shell +/// [page-header] [block 1], [block 2], ... [block N] +/// ``` +/// +/// Each page header consists of: +/// ```shell +/// [block size] [number of miniblocks in a block] [total value count] [first value] +/// ``` +/// +/// Each block consists of: +/// ```shell +/// [min delta] [list of bitwidths of miniblocks] [miniblocks] +/// ``` +/// +/// Current implementation writes values in `put` method, multiple calls to `put` to +/// existing block or start new block if block size is exceeded. Calling `flush_buffer` +/// writes out all data and resets internal state, including page header. +/// +/// Supports only INT32 and INT64. +pub struct DeltaBitPackEncoder { + page_header_writer: BitWriter, + bit_writer: BitWriter, + total_values: usize, + first_value: i64, + current_value: i64, + block_size: usize, + mini_block_size: usize, + num_mini_blocks: usize, + values_in_block: usize, + deltas: Vec, + _phantom: PhantomData, +} + +impl DeltaBitPackEncoder { + /// Creates new delta bit packed encoder. 
+ pub fn new() -> Self { + let block_size = DEFAULT_BLOCK_SIZE; + let num_mini_blocks = DEFAULT_NUM_MINI_BLOCKS; + let mini_block_size = block_size / num_mini_blocks; + assert!(mini_block_size % 8 == 0); + Self::assert_supported_type(); + + DeltaBitPackEncoder { + page_header_writer: BitWriter::new(MAX_PAGE_HEADER_WRITER_SIZE), + bit_writer: BitWriter::new(MAX_BIT_WRITER_SIZE), + total_values: 0, + first_value: 0, + current_value: 0, // current value to keep adding deltas + block_size, // can write fewer values than block size for last block + mini_block_size, + num_mini_blocks, + values_in_block: 0, // will be at most block_size + deltas: vec![0; block_size], + _phantom: PhantomData, + } + } + + /// Writes page header for blocks, this method is invoked when we are done encoding + /// values. It is also okay to encode when no values have been provided + fn write_page_header(&mut self) { + // We ignore the result of each 'put' operation, because MAX_PAGE_HEADER_WRITER_SIZE + // is chosen to fit all header values and guarantees that writes will not fail. + + // Write the size of each block + self.page_header_writer.put_vlq_int(self.block_size as u64); + // Write the number of mini blocks + self.page_header_writer + .put_vlq_int(self.num_mini_blocks as u64); + // Write the number of all values (including non-encoded first value) + self.page_header_writer + .put_vlq_int(self.total_values as u64); + // Write first value + self.page_header_writer.put_zigzag_vlq_int(self.first_value); + } + + // Write current delta buffer (<= 'block size' values) into bit writer + fn flush_block_values(&mut self) -> Result<()> { + if self.values_in_block == 0 { + return Ok(()); + } + + let mut min_delta = i64::max_value(); + for i in 0..self.values_in_block { + min_delta = cmp::min(min_delta, self.deltas[i]); + } + + // Write min delta + self.bit_writer.put_zigzag_vlq_int(min_delta); + + // Slice to store bit width for each mini block + // apply unsafe allocation to avoid double mutable borrow + let mini_block_widths: &mut [u8] = unsafe { + let tmp_slice = self.bit_writer.get_next_byte_ptr(self.num_mini_blocks)?; + slice::from_raw_parts_mut(tmp_slice.as_ptr() as *mut u8, self.num_mini_blocks) + }; + + for i in 0..self.num_mini_blocks { + // Find how many values we need to encode - either block size or whatever values + // left + let n = cmp::min(self.mini_block_size, self.values_in_block); + if n == 0 { + break; + } + + // Compute the max delta in current mini block + let mut max_delta = i64::min_value(); + for j in 0..n { + max_delta = cmp::max(max_delta, self.deltas[i * self.mini_block_size + j]); + } + + // Compute bit width to store (max_delta - min_delta) + let bit_width = num_required_bits(self.subtract_u64(max_delta, min_delta)); + mini_block_widths[i] = bit_width as u8; + + // Encode values in current mini block using min_delta and bit_width + for j in 0..n { + let packed_value = + self.subtract_u64(self.deltas[i * self.mini_block_size + j], min_delta); + self.bit_writer.put_value(packed_value, bit_width); + } + + // Pad the last block (n < mini_block_size) + for _ in n..self.mini_block_size { + self.bit_writer.put_value(0, bit_width); + } + + self.values_in_block -= n; + } + + assert!( + self.values_in_block == 0, + "Expected 0 values in block, found {}", + self.values_in_block + ); + Ok(()) + } +} + +// Implementation is shared between Int32Type and Int64Type, +// see `DeltaBitPackEncoderConversion` below for specifics. 
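To make `flush_block_values` above concrete: within a mini block every delta is stored relative to the block's minimum delta, so the per-mini-block bit width is simply the number of bits needed for `max_delta - min_delta`. A tiny standalone sketch of that computation (hypothetical function, illustrative values):

```rust
/// Computes the adjusted deltas and bit width for one mini block, the same
/// quantities `flush_block_values` writes out (min delta as a zigzag VLQ int,
/// one bit-width byte per mini block, then the bit-packed adjusted deltas).
fn mini_block_plan(deltas: &[i64]) -> (i64, Vec<u64>, u8) {
    let min_delta = *deltas.iter().min().expect("non-empty mini block");
    let adjusted: Vec<u64> = deltas
        .iter()
        .map(|d| d.wrapping_sub(min_delta) as u64)
        .collect();
    let max_adjusted = adjusted.iter().copied().max().unwrap_or(0);
    let bit_width = (64 - max_adjusted.leading_zeros()) as u8;
    (min_delta, adjusted, bit_width)
}

fn main() {
    // Values 7, 5, 3, 6 have deltas -2, -2, 3.
    let (min_delta, adjusted, bit_width) = mini_block_plan(&[-2, -2, 3]);
    assert_eq!(min_delta, -2);
    assert_eq!(adjusted, vec![0, 0, 5]);
    assert_eq!(bit_width, 3); // the largest adjusted delta, 5, needs 3 bits
}
```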
+impl Encoder for DeltaBitPackEncoder { + fn put(&mut self, values: &[T::T]) -> Result<()> { + if values.is_empty() { + return Ok(()); + } + + let mut idx; + // Define values to encode, initialize state + if self.total_values == 0 { + self.first_value = self.as_i64(values, 0); + self.current_value = self.first_value; + idx = 1; + } else { + idx = 0; + } + // Add all values (including first value) + self.total_values += values.len(); + + // Write block + while idx < values.len() { + let value = self.as_i64(values, idx); + self.deltas[self.values_in_block] = self.subtract(value, self.current_value); + self.current_value = value; + idx += 1; + self.values_in_block += 1; + if self.values_in_block == self.block_size { + self.flush_block_values()?; + } + } + Ok(()) + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_BINARY_PACKED + } + + fn estimated_data_encoded_size(&self) -> usize { + self.bit_writer.bytes_written() + } + + fn flush_buffer(&mut self) -> Result { + // Write remaining values + self.flush_block_values()?; + // Write page header with total values + self.write_page_header(); + + let mut buffer = ByteBuffer::new(); + buffer.write(self.page_header_writer.flush_buffer())?; + buffer.write(self.bit_writer.flush_buffer())?; + buffer.flush()?; + + // Reset state + self.page_header_writer.clear(); + self.bit_writer.clear(); + self.total_values = 0; + self.first_value = 0; + self.current_value = 0; + self.values_in_block = 0; + + Ok(buffer.consume()) + } +} + +/// Helper trait to define specific conversions and subtractions when computing deltas +trait DeltaBitPackEncoderConversion { + // Method should panic if type is not supported, otherwise no-op + #[inline] + fn assert_supported_type(); + + #[inline] + fn as_i64(&self, values: &[T::T], index: usize) -> i64; + + #[inline] + fn subtract(&self, left: i64, right: i64) -> i64; + + #[inline] + fn subtract_u64(&self, left: i64, right: i64) -> u64; +} + +impl DeltaBitPackEncoderConversion for DeltaBitPackEncoder { + #[inline] + default fn assert_supported_type() { + panic!("DeltaBitPackDecoder only supports Int32Type and Int64Type"); + } + + #[inline] + default fn as_i64(&self, _values: &[T::T], _index: usize) -> i64 { + 0 + } + + #[inline] + default fn subtract(&self, _left: i64, _right: i64) -> i64 { + 0 + } + + #[inline] + default fn subtract_u64(&self, _left: i64, _right: i64) -> u64 { + 0 + } +} + +impl DeltaBitPackEncoderConversion for DeltaBitPackEncoder { + #[inline] + fn assert_supported_type() { + // no-op: supported type + } + + #[inline] + fn as_i64(&self, values: &[i32], index: usize) -> i64 { + values[index] as i64 + } + + #[inline] + fn subtract(&self, left: i64, right: i64) -> i64 { + // It is okay for values to overflow, wrapping_sub wrapping around at the boundary + (left as i32).wrapping_sub(right as i32) as i64 + } + + #[inline] + fn subtract_u64(&self, left: i64, right: i64) -> u64 { + // Conversion of i32 -> u32 -> u64 is to avoid non-zero left most bytes in int + // representation + (left as i32).wrapping_sub(right as i32) as u32 as u64 + } +} + +impl DeltaBitPackEncoderConversion for DeltaBitPackEncoder { + #[inline] + fn assert_supported_type() { + // no-op: supported type + } + + #[inline] + fn as_i64(&self, values: &[i64], index: usize) -> i64 { + values[index] + } + + #[inline] + fn subtract(&self, left: i64, right: i64) -> i64 { + // It is okay for values to overflow, wrapping_sub wrapping around at the boundary + left.wrapping_sub(right) + } + + #[inline] + fn subtract_u64(&self, left: i64, right: i64) -> 
u64 { + left.wrapping_sub(right) as u64 + } +} + +// ---------------------------------------------------------------------- +// DELTA_LENGTH_BYTE_ARRAY encoding + +/// Encoding for byte arrays to separate the length values and the data. +/// The lengths are encoded using DELTA_BINARY_PACKED encoding, data is +/// stored as raw bytes. +pub struct DeltaLengthByteArrayEncoder { + // length encoder + len_encoder: DeltaBitPackEncoder, + // byte array data + data: Vec, + // data size in bytes of encoded values + encoded_size: usize, + _phantom: PhantomData, +} + +impl DeltaLengthByteArrayEncoder { + /// Creates new delta length byte array encoder. + pub fn new() -> Self { + Self { + len_encoder: DeltaBitPackEncoder::new(), + data: vec![], + encoded_size: 0, + _phantom: PhantomData, + } + } +} + +impl Encoder for DeltaLengthByteArrayEncoder { + default fn put(&mut self, _values: &[T::T]) -> Result<()> { + panic!("DeltaLengthByteArrayEncoder only supports ByteArrayType"); + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } + + fn estimated_data_encoded_size(&self) -> usize { + self.len_encoder.estimated_data_encoded_size() + self.encoded_size + } + + default fn flush_buffer(&mut self) -> Result { + panic!("DeltaLengthByteArrayEncoder only supports ByteArrayType"); + } +} + +impl Encoder for DeltaLengthByteArrayEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + let lengths: Vec = values + .iter() + .map(|byte_array| byte_array.len() as i32) + .collect(); + self.len_encoder.put(&lengths)?; + for byte_array in values { + self.encoded_size += byte_array.len(); + self.data.push(byte_array.clone()); + } + Ok(()) + } + + fn flush_buffer(&mut self) -> Result { + let mut total_bytes = vec![]; + let lengths = self.len_encoder.flush_buffer()?; + total_bytes.extend_from_slice(lengths.data()); + self.data.iter().for_each(|byte_array| { + total_bytes.extend_from_slice(byte_array.data()); + }); + self.data.clear(); + self.encoded_size = 0; + Ok(ByteBufferPtr::new(total_bytes)) + } +} + +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encoding + +/// Encoding for byte arrays, prefix lengths are encoded using DELTA_BINARY_PACKED +/// encoding, followed by suffixes with DELTA_LENGTH_BYTE_ARRAY encoding. +pub struct DeltaByteArrayEncoder { + prefix_len_encoder: DeltaBitPackEncoder, + suffix_writer: DeltaLengthByteArrayEncoder, + previous: Vec, + _phantom: PhantomData, +} + +impl DeltaByteArrayEncoder { + /// Creates new delta byte array encoder. 
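The ByteArrayType encoder below derives each prefix length by comparing the incoming value with the previously written one, byte by byte; the matched prefix length goes to the DELTA_BINARY_PACKED stream and the remaining bytes go to the DELTA_LENGTH_BYTE_ARRAY suffix stream. A standalone sketch of that shared-prefix computation (the inverse of the decode-side reconstruction shown earlier; illustrative only):

```rust
/// Returns the number of leading bytes `current` shares with `previous`, which
/// becomes the encoded prefix length; the rest of `current` becomes the suffix.
fn shared_prefix_len(previous: &[u8], current: &[u8]) -> usize {
    previous
        .iter()
        .zip(current.iter())
        .take_while(|(a, b)| a == b)
        .count()
}

fn main() {
    assert_eq!(shared_prefix_len(b"parquet-rs", b"parquet-format"), 8);
    assert_eq!(shared_prefix_len(b"", b"anything"), 0);
}
```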
+ pub fn new() -> Self { + Self { + prefix_len_encoder: DeltaBitPackEncoder::::new(), + suffix_writer: DeltaLengthByteArrayEncoder::::new(), + previous: vec![], + _phantom: PhantomData, + } + } +} + +impl Encoder for DeltaByteArrayEncoder { + default fn put(&mut self, _values: &[T::T]) -> Result<()> { + panic!("DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType"); + } + + fn encoding(&self) -> Encoding { + Encoding::DELTA_BYTE_ARRAY + } + + fn estimated_data_encoded_size(&self) -> usize { + self.prefix_len_encoder.estimated_data_encoded_size() + + self.suffix_writer.estimated_data_encoded_size() + } + + default fn flush_buffer(&mut self) -> Result { + panic!("DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType"); + } +} + +impl Encoder for DeltaByteArrayEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + let mut prefix_lengths: Vec = vec![]; + let mut suffixes: Vec = vec![]; + + for byte_array in values { + let current = byte_array.data(); + // Maximum prefix length that is shared between previous value and current value + let prefix_len = cmp::min(self.previous.len(), current.len()); + let mut match_len = 0; + while match_len < prefix_len && self.previous[match_len] == current[match_len] { + match_len += 1; + } + prefix_lengths.push(match_len as i32); + suffixes.push(byte_array.slice(match_len, byte_array.len() - match_len)); + // Update previous for the next prefix + self.previous.clear(); + self.previous.extend_from_slice(current); + } + self.prefix_len_encoder.put(&prefix_lengths)?; + self.suffix_writer.put(&suffixes)?; + Ok(()) + } + + fn flush_buffer(&mut self) -> Result { + // TODO: investigate if we can merge lengths and suffixes + // without copying data into new vector. + let mut total_bytes = vec![]; + // Insert lengths ... + let lengths = self.prefix_len_encoder.flush_buffer()?; + total_bytes.extend_from_slice(lengths.data()); + // ... 
followed by suffixes + let suffixes = self.suffix_writer.flush_buffer()?; + total_bytes.extend_from_slice(suffixes.data()); + + self.previous.clear(); + Ok(ByteBufferPtr::new(total_bytes)) + } +} + +impl Encoder for DeltaByteArrayEncoder { + fn put(&mut self, values: &[ByteArray]) -> Result<()> { + let s: &mut DeltaByteArrayEncoder = unsafe { mem::transmute(self) }; + s.put(values) + } + + fn flush_buffer(&mut self) -> Result { + let s: &mut DeltaByteArrayEncoder = unsafe { mem::transmute(self) }; + s.flush_buffer() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::rc::Rc; + + use crate::parquet::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}; + use crate::parquet::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, + }; + use crate::parquet::util::{memory::MemTracker, test_common::RandGen}; + + const TEST_SET_SIZE: usize = 1024; + + #[test] + fn test_get_encoders() { + // supported encodings + create_and_check_encoder::(Encoding::PLAIN, None); + create_and_check_encoder::(Encoding::DELTA_BINARY_PACKED, None); + create_and_check_encoder::(Encoding::DELTA_LENGTH_BYTE_ARRAY, None); + create_and_check_encoder::(Encoding::DELTA_BYTE_ARRAY, None); + create_and_check_encoder::(Encoding::RLE, None); + + // error when initializing + create_and_check_encoder::( + Encoding::RLE_DICTIONARY, + Some(general_err!( + "Cannot initialize this encoding through this function" + )), + ); + create_and_check_encoder::( + Encoding::PLAIN_DICTIONARY, + Some(general_err!( + "Cannot initialize this encoding through this function" + )), + ); + + // unsupported + create_and_check_encoder::( + Encoding::BIT_PACKED, + Some(nyi_err!("Encoding BIT_PACKED is not supported")), + ); + } + + #[test] + fn test_bool() { + BoolType::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + BoolType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + BoolType::test(Encoding::RLE, TEST_SET_SIZE, -1); + } + + #[test] + fn test_i32() { + Int32Type::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + Int32Type::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + Int32Type::test(Encoding::DELTA_BINARY_PACKED, TEST_SET_SIZE, -1); + } + + #[test] + fn test_i64() { + Int64Type::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + Int64Type::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + Int64Type::test(Encoding::DELTA_BINARY_PACKED, TEST_SET_SIZE, -1); + } + + #[test] + fn test_i96() { + Int96Type::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + Int96Type::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + } + + #[test] + fn test_float() { + FloatType::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + FloatType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + } + + #[test] + fn test_double() { + DoubleType::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + DoubleType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + } + + #[test] + fn test_byte_array() { + ByteArrayType::test(Encoding::PLAIN, TEST_SET_SIZE, -1); + ByteArrayType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, -1); + ByteArrayType::test(Encoding::DELTA_LENGTH_BYTE_ARRAY, TEST_SET_SIZE, -1); + ByteArrayType::test(Encoding::DELTA_BYTE_ARRAY, TEST_SET_SIZE, -1); + } + + #[test] + fn test_fixed_lenbyte_array() { + FixedLenByteArrayType::test(Encoding::PLAIN, TEST_SET_SIZE, 100); + FixedLenByteArrayType::test(Encoding::PLAIN_DICTIONARY, TEST_SET_SIZE, 100); + FixedLenByteArrayType::test(Encoding::DELTA_BYTE_ARRAY, TEST_SET_SIZE, 100); + } + + #[test] + fn test_dict_encoded_size() { + fn run_test(type_length: i32, values: &[T::T], 
expected_size: usize) { + let mut encoder = create_test_dict_encoder::(type_length); + assert_eq!(encoder.dict_encoded_size(), 0); + encoder.put(values).unwrap(); + assert_eq!(encoder.dict_encoded_size(), expected_size); + // We do not reset encoded size of the dictionary keys after flush_buffer + encoder.flush_buffer().unwrap(); + assert_eq!(encoder.dict_encoded_size(), expected_size); + } + + // Only 2 variations of values 1 byte each + run_test::(-1, &[true, false, true, false, true], 2); + run_test::(-1, &[1i32, 2i32, 3i32, 4i32, 5i32], 20); + run_test::(-1, &[1i64, 2i64, 3i64, 4i64, 5i64], 40); + run_test::(-1, &[1f32, 2f32, 3f32, 4f32, 5f32], 20); + run_test::(-1, &[1f64, 2f64, 3f64, 4f64, 5f64], 40); + // Int96: len + reference + run_test::( + -1, + &[Int96::from(vec![1, 2, 3]), Int96::from(vec![2, 3, 4])], + 32, + ); + run_test::(-1, &[ByteArray::from("abcd"), ByteArray::from("efj")], 15); + run_test::(2, &[ByteArray::from("ab"), ByteArray::from("bc")], 4); + } + + #[test] + fn test_estimated_data_encoded_size() { + fn run_test( + encoding: Encoding, + type_length: i32, + values: &[T::T], + initial_size: usize, + max_size: usize, + flush_size: usize, + ) { + let mut encoder = match encoding { + Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { + Box::new(create_test_dict_encoder::(type_length)) + } + _ => create_test_encoder::(type_length, encoding), + }; + assert_eq!(encoder.estimated_data_encoded_size(), initial_size); + + encoder.put(values).unwrap(); + assert_eq!(encoder.estimated_data_encoded_size(), max_size); + + encoder.flush_buffer().unwrap(); + assert_eq!(encoder.estimated_data_encoded_size(), flush_size); + } + + // PLAIN + run_test::(Encoding::PLAIN, -1, &vec![123; 1024], 0, 4096, 0); + + // DICTIONARY + // NOTE: The final size is almost the same because the dictionary entries are + // preserved after encoded values have been written. 
+ run_test::(Encoding::RLE_DICTIONARY, -1, &vec![123, 1024], 11, 68, 66); + + // DELTA_BINARY_PACKED + run_test::( + Encoding::DELTA_BINARY_PACKED, + -1, + &vec![123; 1024], + 0, + 35, + 0, + ); + + // RLE + let mut values = vec![]; + values.extend_from_slice(&vec![true; 16]); + values.extend_from_slice(&vec![false; 16]); + run_test::(Encoding::RLE, -1, &values, 0, 2, 0); + + // DELTA_LENGTH_BYTE_ARRAY + run_test::( + Encoding::DELTA_LENGTH_BYTE_ARRAY, + -1, + &[ByteArray::from("ab"), ByteArray::from("abc")], + 0, + 5, // only value bytes, length encoder is not flushed yet + 0, + ); + + // DELTA_BYTE_ARRAY + run_test::( + Encoding::DELTA_BYTE_ARRAY, + -1, + &[ByteArray::from("ab"), ByteArray::from("abc")], + 0, + 3, // only suffix bytes, length encoder is not flushed yet + 0, + ); + } + + // See: https://github.com/sunchao/parquet-rs/issues/47 + #[test] + fn test_issue_47() { + let mut encoder = create_test_encoder::(0, Encoding::DELTA_BYTE_ARRAY); + let mut decoder = create_test_decoder::(0, Encoding::DELTA_BYTE_ARRAY); + + let mut input = vec![]; + input.push(ByteArray::from("aa")); + input.push(ByteArray::from("aaa")); + input.push(ByteArray::from("aa")); + input.push(ByteArray::from("aaa")); + let mut output = vec![ByteArray::default(); input.len()]; + + let mut result = put_and_get(&mut encoder, &mut decoder, &input[..2], &mut output[..2]); + assert!( + result.is_ok(), + "first put_and_get() failed with: {}", + result.unwrap_err() + ); + result = put_and_get(&mut encoder, &mut decoder, &input[2..], &mut output[2..]); + assert!( + result.is_ok(), + "second put_and_get() failed with: {}", + result.unwrap_err() + ); + assert_eq!(output, input); + } + + trait EncodingTester { + fn test(enc: Encoding, total: usize, type_length: i32) { + let result = match enc { + Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { + Self::test_dict_internal(total, type_length) + } + enc @ _ => Self::test_internal(enc, total, type_length), + }; + + assert!( + result.is_ok(), + "Expected result to be OK but got err:\n {}", + result.unwrap_err() + ); + } + + fn test_internal(enc: Encoding, total: usize, type_length: i32) -> Result<()>; + + fn test_dict_internal(total: usize, type_length: i32) -> Result<()>; + } + + impl EncodingTester for T { + fn test_internal(enc: Encoding, total: usize, type_length: i32) -> Result<()> { + let mut encoder = create_test_encoder::(type_length, enc); + let mut decoder = create_test_decoder::(type_length, enc); + let mut values = >::gen_vec(type_length, total); + let mut result_data = vec![T::T::default(); total]; + + let mut actual_total = put_and_get( + &mut encoder, + &mut decoder, + &values[..], + &mut result_data[..], + )?; + assert_eq!(actual_total, total); + assert_eq!(result_data, values); + + // Encode more data after flush and test with decoder + + values = >::gen_vec(type_length, total); + actual_total = put_and_get( + &mut encoder, + &mut decoder, + &values[..], + &mut result_data[..], + )?; + assert_eq!(actual_total, total); + assert_eq!(result_data, values); + + Ok(()) + } + + fn test_dict_internal(total: usize, type_length: i32) -> Result<()> { + let mut encoder = create_test_dict_encoder::(type_length); + let mut values = >::gen_vec(type_length, total); + encoder.put(&values[..])?; + + let mut data = encoder.flush_buffer()?; + let mut decoder = create_test_dict_decoder::(); + let mut dict_decoder = PlainDecoder::::new(type_length); + dict_decoder.set_data(encoder.write_dict()?, encoder.num_entries())?; + decoder.set_dict(Box::new(dict_decoder))?; + let 
mut result_data = vec![T::T::default(); total]; + decoder.set_data(data, total)?; + let mut actual_total = decoder.get(&mut result_data)?; + + assert_eq!(actual_total, total); + assert_eq!(result_data, values); + + // Encode more data after flush and test with decoder + + values = >::gen_vec(type_length, total); + encoder.put(&values[..])?; + data = encoder.flush_buffer()?; + + let mut dict_decoder = PlainDecoder::::new(type_length); + dict_decoder.set_data(encoder.write_dict()?, encoder.num_entries())?; + decoder.set_dict(Box::new(dict_decoder))?; + decoder.set_data(data, total)?; + actual_total = decoder.get(&mut result_data)?; + + assert_eq!(actual_total, total); + assert_eq!(result_data, values); + + Ok(()) + } + } + + fn put_and_get( + encoder: &mut Box>, + decoder: &mut Box>, + input: &[T::T], + output: &mut [T::T], + ) -> Result { + encoder.put(input)?; + let data = encoder.flush_buffer()?; + decoder.set_data(data, input.len())?; + decoder.get(output) + } + + fn create_and_check_encoder(encoding: Encoding, err: Option) { + let descr = create_test_col_desc_ptr(-1, T::get_physical_type()); + let mem_tracker = Rc::new(MemTracker::new()); + let encoder = get_encoder::(descr, encoding, mem_tracker); + match err { + Some(parquet_error) => { + assert!(encoder.is_err()); + assert_eq!(encoder.err().unwrap(), parquet_error); + } + None => { + assert!(encoder.is_ok()); + assert_eq!(encoder.unwrap().encoding(), encoding); + } + } + } + + // Creates test column descriptor. + fn create_test_col_desc_ptr(type_len: i32, t: Type) -> ColumnDescPtr { + let ty = SchemaType::primitive_type_builder("t", t) + .with_length(type_len) + .build() + .unwrap(); + Rc::new(ColumnDescriptor::new( + Rc::new(ty), + None, + 0, + 0, + ColumnPath::new(vec![]), + )) + } + + fn create_test_encoder(type_len: i32, enc: Encoding) -> Box> { + let desc = create_test_col_desc_ptr(type_len, T::get_physical_type()); + let mem_tracker = Rc::new(MemTracker::new()); + get_encoder(desc, enc, mem_tracker).unwrap() + } + + fn create_test_decoder(type_len: i32, enc: Encoding) -> Box> { + let desc = create_test_col_desc_ptr(type_len, T::get_physical_type()); + get_decoder(desc, enc).unwrap() + } + + fn create_test_dict_encoder(type_len: i32) -> DictEncoder { + let desc = create_test_col_desc_ptr(type_len, T::get_physical_type()); + let mem_tracker = Rc::new(MemTracker::new()); + DictEncoder::::new(desc, mem_tracker) + } + + fn create_test_dict_decoder() -> DictDecoder { + DictDecoder::::new() + } +} diff --git a/rust/src/parquet/encodings/levels.rs b/rust/src/parquet/encodings/levels.rs new file mode 100644 index 0000000000000..ec65198ce55f0 --- /dev/null +++ b/rust/src/parquet/encodings/levels.rs @@ -0,0 +1,529 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{cmp, mem}; + +use super::rle::{RleDecoder, RleEncoder}; + +use crate::parquet::basic::Encoding; +use crate::parquet::data_type::AsBytes; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::util::{ + bit_util::{ceil, log2, BitReader, BitWriter}, + memory::ByteBufferPtr, +}; + +/// Computes max buffer size for level encoder/decoder based on encoding, max +/// repetition/definition level and number of total buffered values (includes null +/// values). +#[inline] +pub fn max_buffer_size(encoding: Encoding, max_level: i16, num_buffered_values: usize) -> usize { + let bit_width = log2(max_level as u64 + 1) as u8; + match encoding { + Encoding::RLE => { + RleEncoder::max_buffer_size(bit_width, num_buffered_values) + + RleEncoder::min_buffer_size(bit_width) + } + Encoding::BIT_PACKED => ceil((num_buffered_values * bit_width as usize) as i64, 8) as usize, + _ => panic!("Unsupported encoding type {}", encoding), + } +} + +/// Encoder for definition/repetition levels. +/// Currently only supports RLE and BIT_PACKED (dev/null) encoding, including v2. +pub enum LevelEncoder { + RLE(RleEncoder), + RLE_V2(RleEncoder), + BIT_PACKED(u8, BitWriter), +} + +impl LevelEncoder { + /// Creates new level encoder based on encoding, max level and underlying byte buffer. + /// For bit packed encoding it is assumed that buffer is already allocated with + /// `levels::max_buffer_size` method. + /// + /// Used to encode levels for Data Page v1. + /// + /// Panics, if encoding is not supported. + pub fn v1(encoding: Encoding, max_level: i16, byte_buffer: Vec) -> Self { + let bit_width = log2(max_level as u64 + 1) as u8; + match encoding { + Encoding::RLE => LevelEncoder::RLE(RleEncoder::new_from_buf( + bit_width, + byte_buffer, + mem::size_of::(), + )), + Encoding::BIT_PACKED => { + // Here we set full byte buffer without adjusting for num_buffered_values, + // because byte buffer will already be allocated with size from + // `max_buffer_size()` method. + LevelEncoder::BIT_PACKED(bit_width, BitWriter::new_from_buf(byte_buffer, 0)) + } + _ => panic!("Unsupported encoding type {}", encoding), + } + } + + /// Creates new level encoder based on RLE encoding. Used to encode Data Page v2 + /// repetition and definition levels. + pub fn v2(max_level: i16, byte_buffer: Vec) -> Self { + let bit_width = log2(max_level as u64 + 1) as u8; + LevelEncoder::RLE_V2(RleEncoder::new_from_buf(bit_width, byte_buffer, 0)) + } + + /// Put/encode levels vector into this level encoder. + /// Returns number of encoded values that are less than or equal to length of the input + /// buffer. + /// + /// RLE and BIT_PACKED level encoders return Err() when internal buffer overflows or + /// flush fails. + #[inline] + pub fn put(&mut self, buffer: &[i16]) -> Result { + let mut num_encoded = 0; + match *self { + LevelEncoder::RLE(ref mut encoder) | LevelEncoder::RLE_V2(ref mut encoder) => { + for value in buffer { + if !encoder.put(*value as u64)? { + return Err(general_err!("RLE buffer is full")); + } + num_encoded += 1; + } + encoder.flush()?; + } + LevelEncoder::BIT_PACKED(bit_width, ref mut encoder) => { + for value in buffer { + if !encoder.put_value(*value as u64, bit_width as usize) { + return Err(general_err!("Not enough bytes left")); + } + num_encoded += 1; + } + encoder.flush(); + } + } + Ok(num_encoded) + } + + /// Finalizes level encoder, flush all intermediate buffers and return resulting + /// encoded buffer. Returned buffer is already truncated to encoded bytes only. 
+ #[inline] + pub fn consume(self) -> Result> { + match self { + LevelEncoder::RLE(encoder) => { + let mut encoded_data = encoder.consume()?; + // Account for the buffer offset + let encoded_len = encoded_data.len() - mem::size_of::(); + let len = (encoded_len as i32).to_le(); + let len_bytes = len.as_bytes(); + encoded_data[0..len_bytes.len()].copy_from_slice(len_bytes); + Ok(encoded_data) + } + LevelEncoder::RLE_V2(encoder) => encoder.consume(), + LevelEncoder::BIT_PACKED(_, encoder) => Ok(encoder.consume()), + } + } +} + +/// Decoder for definition/repetition levels. +/// Currently only supports RLE and BIT_PACKED encoding for Data Page v1 and +/// RLE for Data Page v2. +pub enum LevelDecoder { + RLE(Option, RleDecoder), + RLE_V2(Option, RleDecoder), + BIT_PACKED(Option, u8, BitReader), +} + +impl LevelDecoder { + /// Creates new level decoder based on encoding and max definition/repetition level. + /// This method only initializes level decoder, `set_data` method must be called + /// before reading any value. + /// + /// Used to encode levels for Data Page v1. + /// + /// Panics if encoding is not supported + pub fn v1(encoding: Encoding, max_level: i16) -> Self { + let bit_width = log2(max_level as u64 + 1) as u8; + match encoding { + Encoding::RLE => LevelDecoder::RLE(None, RleDecoder::new(bit_width)), + Encoding::BIT_PACKED => { + LevelDecoder::BIT_PACKED(None, bit_width, BitReader::from(Vec::new())) + } + _ => panic!("Unsupported encoding type {}", encoding), + } + } + + /// Creates new level decoder based on RLE encoding. + /// Used to decode Data Page v2 repetition and definition levels. + /// + /// To set data for this decoder, use `set_data_range` method. + pub fn v2(max_level: i16) -> Self { + let bit_width = log2(max_level as u64 + 1) as u8; + LevelDecoder::RLE_V2(None, RleDecoder::new(bit_width)) + } + + /// Sets data for this level decoder, and returns total number of bytes set. + /// This is used for Data Page v1 levels. + /// + /// `data` is encoded data as byte buffer, `num_buffered_values` represents total + /// number of values that is expected. + /// + /// Both RLE and BIT_PACKED level decoders set `num_buffered_values` as total number of + /// values that they can return and track num values. + #[inline] + pub fn set_data(&mut self, num_buffered_values: usize, data: ByteBufferPtr) -> usize { + match *self { + LevelDecoder::RLE(ref mut num_values, ref mut decoder) => { + *num_values = Some(num_buffered_values); + let i32_size = mem::size_of::(); + let data_size = read_num_bytes!(i32, i32_size, data.as_ref()) as usize; + decoder.set_data(data.range(i32_size, data_size)); + i32_size + data_size + } + LevelDecoder::BIT_PACKED(ref mut num_values, bit_width, ref mut decoder) => { + *num_values = Some(num_buffered_values); + // Set appropriate number of bytes: if max size is larger than buffer - set full + // buffer + let num_bytes = ceil((num_buffered_values * bit_width as usize) as i64, 8); + let data_size = cmp::min(num_bytes as usize, data.len()); + decoder.reset(data.range(data.start(), data_size)); + data_size + } + _ => panic!(), + } + } + + /// Sets byte array explicitly when start position `start` and length `len` are known + /// in advance. Only supported by RLE level decoder and used for Data Page v2 levels. + /// Returns number of total bytes set for this decoder (len). 
+ #[inline] + pub fn set_data_range( + &mut self, + num_buffered_values: usize, + data: &ByteBufferPtr, + start: usize, + len: usize, + ) -> usize { + match *self { + LevelDecoder::RLE_V2(ref mut num_values, ref mut decoder) => { + decoder.set_data(data.range(start, len)); + *num_values = Some(num_buffered_values); + len + } + _ => panic!("set_data_range() method is only supported by RLE v2 encoding type"), + } + } + + /// Returns true if data is set for decoder, false otherwise. + #[inline] + pub fn is_data_set(&self) -> bool { + match self { + LevelDecoder::RLE(ref num_values, _) => num_values.is_some(), + LevelDecoder::RLE_V2(ref num_values, _) => num_values.is_some(), + LevelDecoder::BIT_PACKED(ref num_values, ..) => num_values.is_some(), + } + } + + /// Decodes values and puts them into `buffer`. + /// Returns number of values that were successfully decoded (less than or equal to + /// buffer length). + #[inline] + pub fn get(&mut self, buffer: &mut [i16]) -> Result { + assert!(self.is_data_set(), "No data set for decoding"); + match *self { + LevelDecoder::RLE(ref mut num_values, ref mut decoder) + | LevelDecoder::RLE_V2(ref mut num_values, ref mut decoder) => { + // Max length we can read + let len = cmp::min(num_values.unwrap(), buffer.len()); + let values_read = decoder.get_batch::(&mut buffer[0..len])?; + *num_values = num_values.map(|len| len - values_read); + Ok(values_read) + } + LevelDecoder::BIT_PACKED(ref mut num_values, bit_width, ref mut decoder) => { + // When extracting values from bit reader, it might return more values than left + // because of padding to a full byte, we use num_values to track precise number + // of values. + let len = cmp::min(num_values.unwrap(), buffer.len()); + let values_read = decoder.get_batch::(&mut buffer[..len], bit_width as usize); + *num_values = num_values.map(|len| len - values_read); + Ok(values_read) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::util::test_common::random_numbers_range; + + fn test_internal_roundtrip(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { + let size = max_buffer_size(enc, max_level, levels.len()); + let mut encoder = if v2 { + LevelEncoder::v2(max_level, vec![0; size]) + } else { + LevelEncoder::v1(enc, max_level, vec![0; size]) + }; + encoder.put(&levels).expect("put() should be OK"); + let encoded_levels = encoder.consume().expect("consume() should be OK"); + + let byte_buf = ByteBufferPtr::new(encoded_levels); + let mut decoder; + if v2 { + decoder = LevelDecoder::v2(max_level); + decoder.set_data_range(levels.len(), &byte_buf, 0, byte_buf.len()); + } else { + decoder = LevelDecoder::v1(enc, max_level); + decoder.set_data(levels.len(), byte_buf); + }; + + let mut buffer = vec![0; levels.len()]; + let num_decoded = decoder.get(&mut buffer).expect("get() should be OK"); + assert_eq!(num_decoded, levels.len()); + assert_eq!(buffer, levels); + } + + // Performs incremental read until all bytes are read + fn test_internal_roundtrip_incremental( + enc: Encoding, + levels: &[i16], + max_level: i16, + v2: bool, + ) { + let size = max_buffer_size(enc, max_level, levels.len()); + let mut encoder = if v2 { + LevelEncoder::v2(max_level, vec![0; size]) + } else { + LevelEncoder::v1(enc, max_level, vec![0; size]) + }; + encoder.put(&levels).expect("put() should be OK"); + let encoded_levels = encoder.consume().expect("consume() should be OK"); + + let byte_buf = ByteBufferPtr::new(encoded_levels); + let mut decoder; + if v2 { + decoder = LevelDecoder::v2(max_level); + 
decoder.set_data_range(levels.len(), &byte_buf, 0, byte_buf.len()); + } else { + decoder = LevelDecoder::v1(enc, max_level); + decoder.set_data(levels.len(), byte_buf); + } + + let mut buffer = vec![0; levels.len() * 2]; + let mut total_decoded = 0; + let mut safe_stop = levels.len() * 2; // still terminate in case of issues in the code + while safe_stop > 0 { + safe_stop -= 1; + let num_decoded = decoder + .get(&mut buffer[total_decoded..total_decoded + 1]) + .expect("get() should be OK"); + if num_decoded == 0 { + break; + } + total_decoded += num_decoded; + } + assert!( + safe_stop > 0, + "Failed to read values incrementally, reached safe stop" + ); + assert_eq!(total_decoded, levels.len()); + assert_eq!(&buffer[0..levels.len()], levels); + } + + // Tests encoding/decoding of values when output buffer is larger than number of + // encoded values + fn test_internal_roundtrip_underflow(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { + let size = max_buffer_size(enc, max_level, levels.len()); + let mut encoder = if v2 { + LevelEncoder::v2(max_level, vec![0; size]) + } else { + LevelEncoder::v1(enc, max_level, vec![0; size]) + }; + // Encode only one value + let num_encoded = encoder.put(&levels[0..1]).expect("put() should be OK"); + let encoded_levels = encoder.consume().expect("consume() should be OK"); + assert_eq!(num_encoded, 1); + + let byte_buf = ByteBufferPtr::new(encoded_levels); + let mut decoder; + // Set one encoded value as `num_buffered_values` + if v2 { + decoder = LevelDecoder::v2(max_level); + decoder.set_data_range(1, &byte_buf, 0, byte_buf.len()); + } else { + decoder = LevelDecoder::v1(enc, max_level); + decoder.set_data(1, byte_buf); + } + + let mut buffer = vec![0; levels.len()]; + let num_decoded = decoder.get(&mut buffer).expect("get() should be OK"); + assert_eq!(num_decoded, num_encoded); + assert_eq!(buffer[0..num_decoded], levels[0..num_decoded]); + } + + // Tests when encoded values are larger than encoder's buffer + fn test_internal_roundtrip_overflow(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { + let size = max_buffer_size(enc, max_level, levels.len()); + let mut encoder = if v2 { + LevelEncoder::v2(max_level, vec![0; size]) + } else { + LevelEncoder::v1(enc, max_level, vec![0; size]) + }; + let mut found_err = false; + // Insert a large number of values, so we run out of space + for _ in 0..100 { + match encoder.put(&levels) { + Err(err) => { + assert!(format!("{}", err).contains("Not enough bytes left")); + found_err = true; + break; + } + Ok(_) => {} + } + } + if !found_err { + panic!("Failed test: no buffer overflow"); + } + } + + #[test] + fn test_roundtrip_one() { + let levels = vec![0, 1, 1, 1, 1, 0, 0, 0, 0, 1]; + let max_level = 1; + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip() { + let levels = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let max_level = 10; + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_incremental() { + let levels = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let max_level = 10; + test_internal_roundtrip_incremental(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip_incremental(Encoding::BIT_PACKED, 
&levels, max_level, false); + test_internal_roundtrip_incremental(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_all_zeros() { + let levels = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let max_level = 1; + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_random() { + // This test is mainly for bit packed level encoder/decoder + let mut levels = Vec::new(); + let max_level = 5; + random_numbers_range::(120, 0, max_level, &mut levels); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_underflow() { + let levels = vec![1, 1, 2, 3, 2, 1, 1, 2, 3, 1]; + let max_level = 3; + test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip_underflow(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_roundtrip_overflow() { + let levels = vec![1, 1, 2, 3, 2, 1, 1, 2, 3, 1]; + let max_level = 3; + test_internal_roundtrip_overflow(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip_overflow(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip_overflow(Encoding::RLE, &levels, max_level, true); + } + + #[test] + fn test_rle_decoder_set_data_range() { + // Buffer containing both repetition and definition levels + let buffer = ByteBufferPtr::new(vec![5, 198, 2, 5, 42, 168, 10, 0, 2, 3, 36, 73]); + + let max_rep_level = 1; + let mut decoder = LevelDecoder::v2(max_rep_level); + assert_eq!(decoder.set_data_range(10, &buffer, 0, 3), 3); + let mut result = vec![0; 10]; + let num_decoded = decoder.get(&mut result).expect("get() should be OK"); + assert_eq!(num_decoded, 10); + assert_eq!(result, vec![0, 1, 1, 0, 0, 0, 1, 1, 0, 1]); + + let max_def_level = 2; + let mut decoder = LevelDecoder::v2(max_def_level); + assert_eq!(decoder.set_data_range(10, &buffer, 3, 5), 5); + let mut result = vec![0; 10]; + let num_decoded = decoder.get(&mut result).expect("get() should be OK"); + assert_eq!(num_decoded, 10); + assert_eq!(result, vec![2, 2, 2, 0, 0, 2, 2, 2, 2, 2]); + } + + #[test] + #[should_panic(expected = "set_data_range() method is only supported by RLE v2 encoding type")] + fn test_bit_packed_decoder_set_data_range() { + // Buffer containing both repetition and definition levels + let buffer = ByteBufferPtr::new(vec![1, 2, 3, 4, 5]); + let max_level = 1; + let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_level); + decoder.set_data_range(10, &buffer, 0, 3); + } + + #[test] + fn test_bit_packed_decoder_set_data() { + // Test the maximum size that is assigned based on number of values and buffer length + let buffer = ByteBufferPtr::new(vec![1, 2, 3, 4, 5]); + let max_level = 1; + let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_level); + // This should reset to entire buffer + assert_eq!(decoder.set_data(1024, buffer.all()), buffer.len()); + // This should set smallest num bytes + assert_eq!(decoder.set_data(3, buffer.all()), 1); + } + + #[test] + #[should_panic(expected = "No data set for decoding")] + fn test_rle_level_decoder_get_no_set_data() { + // `get()` normally panics because bit_reader 
is not set for RLE decoding + // we have explicit check now in set_data + let max_rep_level = 2; + let mut decoder = LevelDecoder::v1(Encoding::RLE, max_rep_level); + let mut buffer = vec![0; 16]; + decoder.get(&mut buffer).unwrap(); + } + + #[test] + #[should_panic(expected = "No data set for decoding")] + fn test_bit_packed_level_decoder_get_no_set_data() { + let max_rep_level = 2; + let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_rep_level); + let mut buffer = vec![0; 16]; + decoder.get(&mut buffer).unwrap(); + } +} diff --git a/rust/src/parquet/encodings/mod.rs b/rust/src/parquet/encodings/mod.rs new file mode 100644 index 0000000000000..33b1e233d8931 --- /dev/null +++ b/rust/src/parquet/encodings/mod.rs @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod decoding; +pub mod encoding; +pub mod levels; +mod rle; diff --git a/rust/src/parquet/encodings/rle.rs b/rust/src/parquet/encodings/rle.rs new file mode 100644 index 0000000000000..5b56c2a250495 --- /dev/null +++ b/rust/src/parquet/encodings/rle.rs @@ -0,0 +1,839 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
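+
+// A minimal usage sketch (mirroring `test_rle_specific_roundtrip` below; the
+// buffer length and the sample values are illustrative): encode a short run of
+// 0/1 values with the RLE/bit-packing hybrid encoder and decode them back in a
+// single batch.
+//
+//     let bit_width = 1;
+//     let buffer_len = RleEncoder::min_buffer_size(bit_width);
+//     let mut encoder = RleEncoder::new(bit_width, buffer_len);
+//     for v in &[0u64, 1, 1, 1, 0] {
+//         assert!(encoder.put(*v).expect("put() should be OK"));
+//     }
+//     let buffer = encoder.consume().expect("consume() should be OK");
+//     let mut decoder = RleDecoder::new(bit_width);
+//     decoder.set_data(ByteBufferPtr::new(buffer));
+//     let mut values = vec![0i64; 5];
+//     assert_eq!(decoder.get_batch(&mut values).expect("get_batch() should be OK"), 5);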
+ +use std::{ + cmp, + mem::{size_of, transmute_copy}, +}; + +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::util::{ + bit_util::{self, BitReader, BitWriter}, + memory::ByteBufferPtr, +}; + +/// Rle/Bit-Packing Hybrid Encoding +/// The grammar for this encoding looks like the following (copied verbatim +/// from https://github.com/Parquet/parquet-format/blob/master/Encodings.md): +/// +/// rle-bit-packed-hybrid: +/// length := length of the in bytes stored as 4 bytes little endian +/// encoded-data := * +/// run := | +/// bit-packed-run := +/// bit-packed-header := varint-encode( << 1 | 1) +/// we always bit-pack a multiple of 8 values at a time, so we only store the number of +/// values / 8 +/// bit-pack-count := (number of values in this run) / 8 +/// bit-packed-values := *see 1 below* +/// rle-run := +/// rle-header := varint-encode( (number of times repeated) << 1) +/// repeated-value := value that is repeated, using a fixed-width of +/// round-up-to-next-byte(bit-width) + +/// Maximum groups per bit-packed run. Current value is 64. +const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6; +const MAX_VALUES_PER_BIT_PACKED_RUN: usize = MAX_GROUPS_PER_BIT_PACKED_RUN * 8; +const MAX_WRITER_BUF_SIZE: usize = 1 << 10; + +/// A RLE/Bit-Packing hybrid encoder. +// TODO: tracking memory usage +pub struct RleEncoder { + // Number of bits needed to encode the value. Must be in the range of [0, 64]. + bit_width: u8, + + // Underlying writer which holds an internal buffer. + bit_writer: BitWriter, + + // If this is true, the buffer is full and subsequent `put()` calls will fail. + buffer_full: bool, + + // The maximum byte size a single run can take. + max_run_byte_size: usize, + + // Buffered values for bit-packed runs. + buffered_values: [u64; 8], + + // Number of current buffered values. Must be less than 8. + num_buffered_values: usize, + + // The current (also last) value that was written and the count of how many + // times in a row that value has been seen. + current_value: u64, + + // The number of repetitions for `current_value`. If this gets too high we'd + // switch to use RLE encoding. + repeat_count: usize, + + // Number of bit-packed values in the current run. This doesn't include values + // in `buffered_values`. + bit_packed_count: usize, + + // The position of the indicator byte in the `bit_writer`. + indicator_byte_pos: i64, +} + +impl RleEncoder { + pub fn new(bit_width: u8, buffer_len: usize) -> Self { + let buffer = vec![0; buffer_len]; + RleEncoder::new_from_buf(bit_width, buffer, 0) + } + + /// Initialize the encoder from existing `buffer` and the starting offset `start`. + pub fn new_from_buf(bit_width: u8, buffer: Vec, start: usize) -> Self { + assert!(bit_width <= 64, "bit_width ({}) out of range.", bit_width); + let max_run_byte_size = RleEncoder::min_buffer_size(bit_width); + assert!( + buffer.len() >= max_run_byte_size, + "buffer length {} must be greater than {}", + buffer.len(), + max_run_byte_size + ); + let bit_writer = BitWriter::new_from_buf(buffer, start); + RleEncoder { + bit_width, + bit_writer, + buffer_full: false, + max_run_byte_size, + buffered_values: [0; 8], + num_buffered_values: 0, + current_value: 0, + repeat_count: 0, + bit_packed_count: 0, + indicator_byte_pos: -1, + } + } + + /// Returns the minimum buffer size needed to use the encoder for `bit_width`. + /// This is the maximum length of a single run for `bit_width`. 
+ pub fn min_buffer_size(bit_width: u8) -> usize { + let max_bit_packed_run_size = 1 + bit_util::ceil( + (MAX_VALUES_PER_BIT_PACKED_RUN * bit_width as usize) as i64, + 8, + ); + let max_rle_run_size = + bit_util::MAX_VLQ_BYTE_LEN + bit_util::ceil(bit_width as i64, 8) as usize; + ::std::cmp::max(max_bit_packed_run_size as usize, max_rle_run_size) + } + + /// Returns the maximum buffer size takes to encode `num_values` values with + /// `bit_width`. + pub fn max_buffer_size(bit_width: u8, num_values: usize) -> usize { + // First the maximum size for bit-packed run + let bytes_per_run = bit_width; + let num_runs = bit_util::ceil(num_values as i64, 8) as usize; + let bit_packed_max_size = num_runs + num_runs * bytes_per_run as usize; + + // Second the maximum size for RLE run + let min_rle_run_size = 1 + bit_util::ceil(bit_width as i64, 8) as usize; + let rle_max_size = bit_util::ceil(num_values as i64, 8) as usize * min_rle_run_size; + ::std::cmp::max(bit_packed_max_size, rle_max_size) as usize + } + + /// Encodes `value`, which must be representable with `bit_width` bits. + /// Returns true if the value fits in buffer, false if it doesn't, or + /// error if something is wrong. + #[inline] + pub fn put(&mut self, value: u64) -> Result { + // This function buffers 8 values at a time. After seeing 8 values, it + // decides whether the current run should be encoded in bit-packed or RLE. + if self.buffer_full { + // The value cannot fit in the current buffer. + return Ok(false); + } + if self.current_value == value { + self.repeat_count += 1; + if self.repeat_count > 8 { + // A continuation of last value. No need to buffer. + return Ok(true); + } + } else { + if self.repeat_count >= 8 { + // The current RLE run has ended and we've gathered enough. Flush first. + assert_eq!(self.bit_packed_count, 0); + self.flush_rle_run()?; + } + self.repeat_count = 1; + self.current_value = value; + } + + self.buffered_values[self.num_buffered_values] = value; + self.num_buffered_values += 1; + if self.num_buffered_values == 8 { + // Buffered values are full. Flush them. + assert_eq!(self.bit_packed_count % 8, 0); + self.flush_buffered_values()?; + } + + Ok(true) + } + + #[inline] + pub fn buffer(&self) -> &[u8] { + self.bit_writer.buffer() + } + + #[inline] + pub fn len(&self) -> usize { + self.bit_writer.bytes_written() + } + + #[inline] + pub fn consume(mut self) -> Result> { + self.flush()?; + Ok(self.bit_writer.consume()) + } + + /// Borrow equivalent of the `consume` method. + /// Call `clear()` after invoking this method. + #[inline] + pub fn flush_buffer(&mut self) -> Result<&[u8]> { + self.flush()?; + Ok(self.bit_writer.flush_buffer()) + } + + /// Clears the internal state so this encoder can be reused (e.g., after becoming full). + #[inline] + pub fn clear(&mut self) { + self.bit_writer.clear(); + self.buffer_full = false; + self.num_buffered_values = 0; + self.current_value = 0; + self.repeat_count = 0; + self.bit_packed_count = 0; + self.indicator_byte_pos = -1; + } + + /// Flushes all remaining values and return the final byte buffer maintained by the + /// internal writer. + #[inline] + pub fn flush(&mut self) -> Result<()> { + if self.bit_packed_count > 0 || self.repeat_count > 0 || self.num_buffered_values > 0 { + let all_repeat = self.bit_packed_count == 0 + && (self.repeat_count == self.num_buffered_values || self.num_buffered_values == 0); + if self.repeat_count > 0 && all_repeat { + self.flush_rle_run()?; + } else { + // Buffer the last group of bit-packed values to 8 by padding with 0s. 
+ if self.num_buffered_values > 0 { + while self.num_buffered_values < 8 { + self.buffered_values[self.num_buffered_values] = 0; + self.num_buffered_values += 1; + } + } + self.bit_packed_count += self.num_buffered_values; + self.flush_bit_packed_run(true)?; + self.repeat_count = 0; + } + } + Ok(()) + } + + #[inline] + fn flush_rle_run(&mut self) -> Result<()> { + assert!(self.repeat_count > 0); + let indicator_value = self.repeat_count << 1 | 0; + let mut result = self.bit_writer.put_vlq_int(indicator_value as u64); + result &= self.bit_writer.put_aligned( + self.current_value, + bit_util::ceil(self.bit_width as i64, 8) as usize, + ); + if !result { + return Err(general_err!("Failed to write RLE run")); + } + self.num_buffered_values = 0; + self.repeat_count = 0; + Ok(()) + } + + #[inline] + fn flush_bit_packed_run(&mut self, update_indicator_byte: bool) -> Result<()> { + if self.indicator_byte_pos < 0 { + self.indicator_byte_pos = self.bit_writer.skip(1)? as i64; + } + + // Write all buffered values as bit-packed literals + for i in 0..self.num_buffered_values { + let _ = self + .bit_writer + .put_value(self.buffered_values[i], self.bit_width as usize); + } + self.num_buffered_values = 0; + if update_indicator_byte { + // Write the indicator byte to the reserved position in `bit_writer` + let num_groups = self.bit_packed_count / 8; + let indicator_byte = ((num_groups << 1) | 1) as u8; + if !self.bit_writer.put_aligned_offset( + indicator_byte, + 1, + self.indicator_byte_pos as usize, + ) { + return Err(general_err!("Not enough space to write indicator byte")); + } + self.indicator_byte_pos = -1; + self.bit_packed_count = 0; + } + Ok(()) + } + + #[inline] + fn flush_buffered_values(&mut self) -> Result<()> { + if self.repeat_count >= 8 { + self.num_buffered_values = 0; + if self.bit_packed_count > 0 { + // In this case we choose RLE encoding. Flush the current buffered values + // as bit-packed encoding. + assert_eq!(self.bit_packed_count % 8, 0); + self.flush_bit_packed_run(true)? + } + return Ok(()); + } + + self.bit_packed_count += self.num_buffered_values; + let num_groups = self.bit_packed_count / 8; + if num_groups + 1 >= MAX_GROUPS_PER_BIT_PACKED_RUN { + // We've reached the maximum value that can be hold in a single bit-packed run. + assert!(self.indicator_byte_pos >= 0); + self.flush_bit_packed_run(true)?; + } else { + self.flush_bit_packed_run(false)?; + } + self.repeat_count = 0; + Ok(()) + } +} + +/// A RLE/Bit-Packing hybrid decoder. +pub struct RleDecoder { + // Number of bits used to encode the value. Must be between [0, 64]. + bit_width: u8, + + // Bit reader loaded with input buffer. + bit_reader: Option, + + // Buffer used when `bit_reader` is not `None`, for batch reading. 
+ index_buf: Option<[i32; 1024]>, + + // The remaining number of values in RLE for this run + rle_left: u32, + + // The remaining number of values in Bit-Packing for this run + bit_packed_left: u32, + + // The current value for the case of RLE mode + current_value: Option, +} + +impl RleDecoder { + pub fn new(bit_width: u8) -> Self { + RleDecoder { + bit_width, + rle_left: 0, + bit_packed_left: 0, + bit_reader: None, + index_buf: None, + current_value: None, + } + } + + pub fn set_data(&mut self, data: ByteBufferPtr) { + if let Some(ref mut bit_reader) = self.bit_reader { + bit_reader.reset(data); + } else { + self.bit_reader = Some(BitReader::new(data)); + self.index_buf = Some([0; 1024]); + } + + let _ = self.reload(); + } + + #[inline] + pub fn get(&mut self) -> Result> { + assert!(size_of::() <= 8); + + while self.rle_left <= 0 && self.bit_packed_left <= 0 { + if !self.reload() { + return Ok(None); + } + } + + let value = if self.rle_left > 0 { + let rle_value = unsafe { + transmute_copy::( + self.current_value + .as_mut() + .expect("current_value should be Some"), + ) + }; + self.rle_left -= 1; + rle_value + } else { + // self.bit_packed_left > 0 + let bit_reader = self.bit_reader.as_mut().expect("bit_reader should be Some"); + let bit_packed_value = bit_reader + .get_value(self.bit_width as usize) + .ok_or(eof_err!("Not enough data for 'bit_packed_value'"))?; + self.bit_packed_left -= 1; + bit_packed_value + }; + + Ok(Some(value)) + } + + #[inline] + pub fn get_batch(&mut self, buffer: &mut [T]) -> Result { + assert!(self.bit_reader.is_some()); + assert!(size_of::() <= 8); + + let mut values_read = 0; + while values_read < buffer.len() { + if self.rle_left > 0 { + assert!(self.current_value.is_some()); + let num_values = cmp::min(buffer.len() - values_read, self.rle_left as usize); + for i in 0..num_values { + let repeated_value = + unsafe { transmute_copy::(self.current_value.as_mut().unwrap()) }; + buffer[values_read + i] = repeated_value; + } + self.rle_left -= num_values as u32; + values_read += num_values; + } else if self.bit_packed_left > 0 { + assert!(self.bit_reader.is_some()); + let mut num_values = + cmp::min(buffer.len() - values_read, self.bit_packed_left as usize); + if let Some(ref mut bit_reader) = self.bit_reader { + num_values = bit_reader.get_batch::( + &mut buffer[values_read..values_read + num_values], + self.bit_width as usize, + ); + self.bit_packed_left -= num_values as u32; + values_read += num_values; + } + } else { + if !self.reload() { + break; + } + } + } + + Ok(values_read) + } + + #[inline] + pub fn get_batch_with_dict( + &mut self, + dict: &[T], + buffer: &mut [T], + max_values: usize, + ) -> Result + where + T: Default + Clone, + { + assert!(buffer.len() >= max_values); + + let mut values_read = 0; + while values_read < max_values { + if self.rle_left > 0 { + assert!(self.current_value.is_some()); + let num_values = cmp::min(max_values - values_read, self.rle_left as usize); + let dict_idx = self.current_value.unwrap() as usize; + for i in 0..num_values { + buffer[values_read + i] = dict[dict_idx].clone(); + } + self.rle_left -= num_values as u32; + values_read += num_values; + } else if self.bit_packed_left > 0 { + assert!(self.bit_reader.is_some()); + let mut num_values = + cmp::min(max_values - values_read, self.bit_packed_left as usize); + if let Some(ref mut bit_reader) = self.bit_reader { + let mut index_buf = self.index_buf.unwrap(); + num_values = cmp::min(num_values, index_buf.len()); + loop { + num_values = bit_reader.get_batch::( + &mut 
index_buf[..num_values], + self.bit_width as usize, + ); + for i in 0..num_values { + buffer[values_read + i] = dict[index_buf[i] as usize].clone(); + } + self.bit_packed_left -= num_values as u32; + values_read += num_values; + if num_values < index_buf.len() { + break; + } + } + } + } else { + if !self.reload() { + break; + } + } + } + + Ok(values_read) + } + + #[inline] + fn reload(&mut self) -> bool { + assert!(self.bit_reader.is_some()); + if let Some(ref mut bit_reader) = self.bit_reader { + if let Some(indicator_value) = bit_reader.get_vlq_int() { + if indicator_value & 1 == 1 { + self.bit_packed_left = ((indicator_value >> 1) * 8) as u32; + } else { + self.rle_left = (indicator_value >> 1) as u32; + let value_width = bit_util::ceil(self.bit_width as i64, 8); + self.current_value = bit_reader.get_aligned::(value_width as usize); + assert!(self.current_value.is_some()); + } + return true; + } else { + return false; + } + } + return false; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use rand::{ + self, + distributions::{Distribution, Standard}, + thread_rng, Rng, SeedableRng, + }; + + use crate::parquet::util::memory::ByteBufferPtr; + + const MAX_WIDTH: usize = 32; + + #[test] + fn test_rle_decode_int32() { + // Test data: 0-7 with bit width 3 + // 00000011 10001000 11000110 11111010 + let data = ByteBufferPtr::new(vec![0x03, 0x88, 0xC6, 0xFA]); + let mut decoder: RleDecoder = RleDecoder::new(3); + decoder.set_data(data); + let mut buffer = vec![0; 8]; + let expected = vec![0, 1, 2, 3, 4, 5, 6, 7]; + let result = decoder.get_batch::(&mut buffer); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + } + + #[test] + fn test_rle_consume_flush_buffer() { + let data = vec![1, 1, 1, 2, 2, 3, 3, 3]; + let mut encoder1 = RleEncoder::new(3, 256); + let mut encoder2 = RleEncoder::new(3, 256); + for value in data { + encoder1.put(value as u64).unwrap(); + encoder2.put(value as u64).unwrap(); + } + let res1 = encoder1.flush_buffer().unwrap(); + let res2 = encoder2.consume().unwrap(); + assert_eq!(res1, &res2[..]); + } + + #[test] + fn test_rle_decode_bool() { + // RLE test data: 50 1s followed by 50 0s + // 01100100 00000001 01100100 00000000 + let data1 = ByteBufferPtr::new(vec![0x64, 0x01, 0x64, 0x00]); + + // Bit-packing test data: alternating 1s and 0s, 100 total + // 100 / 8 = 13 groups + // 00011011 10101010 ... 
00001010 + let data2 = ByteBufferPtr::new(vec![ + 0x1B, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x0A, + ]); + + let mut decoder: RleDecoder = RleDecoder::new(1); + decoder.set_data(data1); + let mut buffer = vec![false; 100]; + let mut expected = vec![]; + for i in 0..100 { + if i < 50 { + expected.push(true); + } else { + expected.push(false); + } + } + let result = decoder.get_batch::(&mut buffer); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + + decoder.set_data(data2); + let mut buffer = vec![false; 100]; + let mut expected = vec![]; + for i in 0..100 { + if i % 2 == 0 { + expected.push(false); + } else { + expected.push(true); + } + } + let result = decoder.get_batch::(&mut buffer); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + } + + #[test] + fn test_rle_decode_with_dict_int32() { + // Test RLE encoding: 3 0s followed by 4 1s followed by 5 2s + // 00000110 00000000 00001000 00000001 00001010 00000010 + let dict = vec![10, 20, 30]; + let data = ByteBufferPtr::new(vec![0x06, 0x00, 0x08, 0x01, 0x0A, 0x02]); + let mut decoder: RleDecoder = RleDecoder::new(3); + decoder.set_data(data); + let mut buffer = vec![0; 12]; + let expected = vec![10, 10, 10, 20, 20, 20, 20, 30, 30, 30, 30, 30]; + let result = decoder.get_batch_with_dict::(&dict, &mut buffer, 12); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + + // Test bit-pack encoding: 345345345455 (2 groups: 8 and 4) + // 011 100 101 011 100 101 011 100 101 100 101 101 + // 00000011 01100011 11000111 10001110 00000011 01100101 00001011 + let dict = vec!["aaa", "bbb", "ccc", "ddd", "eee", "fff"]; + let data = ByteBufferPtr::new(vec![0x03, 0x63, 0xC7, 0x8E, 0x03, 0x65, 0x0B]); + let mut decoder: RleDecoder = RleDecoder::new(3); + decoder.set_data(data); + let mut buffer = vec![""; 12]; + let expected = vec![ + "ddd", "eee", "fff", "ddd", "eee", "fff", "ddd", "eee", "fff", "eee", "fff", "fff", + ]; + let result = + decoder.get_batch_with_dict::<&str>(dict.as_slice(), buffer.as_mut_slice(), 12); + assert!(result.is_ok()); + assert_eq!(buffer, expected); + } + + fn validate_rle( + values: &[i64], + bit_width: u8, + expected_encoding: Option<&[u8]>, + expected_len: i32, + ) { + let buffer_len = 64 * 1024; + let mut encoder = RleEncoder::new(bit_width, buffer_len); + for v in values { + let result = encoder.put(*v as u64); + assert!(result.is_ok()); + } + let buffer = ByteBufferPtr::new(encoder.consume().expect("Expect consume() OK")); + if expected_len != -1 { + assert_eq!(buffer.len(), expected_len as usize); + } + match expected_encoding { + Some(b) => assert_eq!(buffer.as_ref(), b), + _ => (), + } + + // Verify read + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(buffer.all()); + for v in values { + let val: i64 = decoder + .get() + .expect("get() should be OK") + .expect("get() should return more value"); + assert_eq!(val, *v); + } + + // Verify batch read + decoder.set_data(buffer); + let mut values_read: Vec = vec![0; values.len()]; + decoder + .get_batch(&mut values_read[..]) + .expect("get_batch() should be OK"); + assert_eq!(&values_read[..], values); + } + + #[test] + fn test_rle_specific_sequences() { + let mut expected_buffer = Vec::new(); + let mut values = Vec::new(); + for _ in 0..50 { + values.push(0); + } + for _ in 0..50 { + values.push(1); + } + expected_buffer.push(50 << 1); + expected_buffer.push(0); + expected_buffer.push(50 << 1); + expected_buffer.push(1); + + for width in 1..9 { + validate_rle(&values[..], width, 
Some(&expected_buffer[..]), 4); + } + for width in 9..MAX_WIDTH + 1 { + validate_rle( + &values[..], + width as u8, + None, + 2 * (1 + bit_util::ceil(width as i64, 8) as i32), + ); + } + + // Test 100 0's and 1's alternating + values.clear(); + expected_buffer.clear(); + for i in 0..101 { + values.push(i % 2); + } + let num_groups = bit_util::ceil(100, 8) as u8; + expected_buffer.push(((num_groups << 1) as u8) | 1); + for _ in 1..(100 / 8) + 1 { + expected_buffer.push(0b10101010); + } + // For the last 4 0 and 1's, padded with 0. + expected_buffer.push(0b00001010); + validate_rle( + &values, + 1, + Some(&expected_buffer[..]), + 1 + num_groups as i32, + ); + for width in 2..MAX_WIDTH + 1 { + let num_values = bit_util::ceil(100, 8) * 8; + validate_rle( + &values, + width as u8, + None, + 1 + bit_util::ceil(width as i64 * num_values, 8) as i32, + ); + } + } + + // `validate_rle` on `num_vals` with width `bit_width`. If `value` is -1, that value + // is used, otherwise alternating values are used. + fn test_rle_values(bit_width: usize, num_vals: usize, value: i32) { + let mod_val = if bit_width == 64 { + 1 + } else { + 1u64 << bit_width + }; + let mut values: Vec = vec![]; + for v in 0..num_vals { + let val = if value == -1 { + v as i64 % mod_val as i64 + } else { + value as i64 + }; + values.push(val); + } + validate_rle(&values, bit_width as u8, None, -1); + } + + #[test] + fn test_values() { + for width in 1..MAX_WIDTH + 1 { + test_rle_values(width, 1, -1); + test_rle_values(width, 1024, -1); + test_rle_values(width, 1024, 0); + test_rle_values(width, 1024, 1); + } + } + + #[test] + fn test_rle_specific_roundtrip() { + let bit_width = 1; + let buffer_len = RleEncoder::min_buffer_size(bit_width); + let values: Vec = vec![0, 1, 1, 1, 1, 0, 0, 0, 0, 1]; + let mut encoder = RleEncoder::new(bit_width, buffer_len); + for v in &values { + assert!(encoder.put(*v as u64).expect("put() should be OK")); + } + let buffer = encoder.consume().expect("consume() should be OK"); + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(ByteBufferPtr::new(buffer)); + let mut actual_values: Vec = vec![0; values.len()]; + decoder + .get_batch(&mut actual_values) + .expect("get_batch() should be OK"); + assert_eq!(actual_values, values); + } + + fn test_round_trip(values: &[i32], bit_width: u8) { + let buffer_len = 64 * 1024; + let mut encoder = RleEncoder::new(bit_width, buffer_len); + for v in values { + let result = encoder.put(*v as u64).expect("put() should be OK"); + assert!(result, "put() should not return false"); + } + + let buffer = ByteBufferPtr::new(encoder.consume().expect("consume() should be OK")); + + // Verify read + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(buffer.all()); + for v in values { + let val = decoder + .get::() + .expect("get() should be OK") + .expect("get() should return value"); + assert_eq!(val, *v); + } + + // Verify batch read + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(buffer); + let mut values_read: Vec = vec![0; values.len()]; + decoder + .get_batch(&mut values_read[..]) + .expect("get_batch() should be OK"); + assert_eq!(&values_read[..], values); + } + + #[test] + fn test_random() { + let seed_len = 32; + let niters = 50; + let ngroups = 1000; + let max_group_size = 15; + let mut values = vec![]; + + for _ in 0..niters { + values.clear(); + let mut rng = thread_rng(); + let seed_vec: Vec = Standard.sample_iter(&mut rng).take(seed_len).collect(); + let mut seed = [0u8; 32]; + 
seed.copy_from_slice(&seed_vec[0..seed_len]); + let mut gen = rand::StdRng::from_seed(seed); + + let mut parity = false; + for _ in 0..ngroups { + let mut group_size = gen.gen_range::(1, 20); + if group_size > max_group_size { + group_size = 1; + } + for _ in 0..group_size { + values.push(parity as i32); + } + parity = !parity; + } + let bit_width = bit_util::num_required_bits(values.len() as u64); + assert!(bit_width < 64); + test_round_trip(&values[..], bit_width as u8); + } + } +} diff --git a/rust/src/parquet/errors.rs b/rust/src/parquet/errors.rs new file mode 100644 index 0000000000000..a5532c1eb66dc --- /dev/null +++ b/rust/src/parquet/errors.rs @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Common Parquet errors and macros. + +use std::{cell, convert, io, result}; + +use quick_error::quick_error; +use snap; +use thrift; + +quick_error! { + /// Set of errors that can be produced during different operations in Parquet. + #[derive(Debug, PartialEq)] + pub enum ParquetError { + /// General Parquet error. + /// Returned when code violates normal workflow of working with Parquet files. + General(message: String) { + display("Parquet error: {}", message) + description(message) + from(e: io::Error) -> (format!("underlying IO error: {}", e)) + from(e: snap::Error) -> (format!("underlying snap error: {}", e)) + from(e: thrift::Error) -> (format!("underlying Thrift error: {}", e)) + from(e: cell::BorrowMutError) -> (format!("underlying borrow error: {}", e)) + } + /// "Not yet implemented" Parquet error. + /// Returned when functionality is not yet available. + NYI(message: String) { + display("NYI: {}", message) + description(message) + } + /// "End of file" Parquet error. + /// Returned when IO related failures occur, e.g. when there are not enough bytes to + /// decode. + EOF(message: String) { + display("EOF: {}", message) + description(message) + } + } +} + +/// A specialized `Result` for Parquet errors. +pub type Result = result::Result; + +// ---------------------------------------------------------------------- +// Conversion from `ParquetError` to other types of `Error`s + +impl convert::From for io::Error { + fn from(e: ParquetError) -> Self { + io::Error::new(io::ErrorKind::Other, e) + } +} + +// ---------------------------------------------------------------------- +// Convenient macros for different errors + +macro_rules! general_err { + ($fmt:expr) => (ParquetError::General($fmt.to_owned())); + ($fmt:expr, $($args:expr),*) => (ParquetError::General(format!($fmt, $($args),*))); + ($e:expr, $fmt:expr) => (ParquetError::General($fmt.to_owned(), $e)); + ($e:ident, $fmt:expr, $($args:tt),*) => ( + ParquetError::General(&format!($fmt, $($args),*), $e)); +} + +macro_rules! 
nyi_err { + ($fmt:expr) => (ParquetError::NYI($fmt.to_owned())); + ($fmt:expr, $($args:expr),*) => (ParquetError::NYI(format!($fmt, $($args),*))); +} + +macro_rules! eof_err { + ($fmt:expr) => (ParquetError::EOF($fmt.to_owned())); + ($fmt:expr, $($args:expr),*) => (ParquetError::EOF(format!($fmt, $($args),*))); +} diff --git a/rust/src/parquet/file/metadata.rs b/rust/src/parquet/file/metadata.rs new file mode 100644 index 0000000000000..7f2442506f67f --- /dev/null +++ b/rust/src/parquet/file/metadata.rs @@ -0,0 +1,736 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains information about available Parquet metadata. +//! +//! The hierarchy of metadata is as follows: +//! +//! [`ParquetMetaData`](struct.ParquetMetaData.html) contains +//! [`FileMetaData`](struct.FileMetaData.html) and zero or more +//! [`RowGroupMetaData`](struct.RowGroupMetaData.html) for each row group. +//! +//! [`FileMetaData`](struct.FileMetaData.html) includes file version, application specific +//! metadata. +//! +//! Each [`RowGroupMetaData`](struct.RowGroupMetaData.html) contains information about row +//! group and one or more [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) for +//! each column chunk. +//! +//! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information about column +//! chunk (primitive leaf column), including encoding/compression, number of values, etc. + +use std::rc::Rc; + +use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup}; + +use crate::parquet::basic::{ColumnOrder, Compression, Encoding, Type}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::statistics::{self, Statistics}; +use crate::parquet::schema::types::{ + ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, + Type as SchemaType, TypePtr, +}; + +/// Reference counted pointer for [`ParquetMetaData`]. +pub type ParquetMetaDataPtr = Rc; + +/// Global Parquet metadata. +pub struct ParquetMetaData { + file_metadata: FileMetaDataPtr, + row_groups: Vec, +} + +impl ParquetMetaData { + /// Creates Parquet metadata from file metadata and a list of row group metadata `Rc`s + /// for each available row group. + pub fn new(file_metadata: FileMetaData, row_group_ptrs: Vec) -> Self { + ParquetMetaData { + file_metadata: Rc::new(file_metadata), + row_groups: row_group_ptrs, + } + } + + /// Returns file metadata as reference counted clone. + pub fn file_metadata(&self) -> FileMetaDataPtr { + self.file_metadata.clone() + } + + /// Returns number of row groups in this file. + pub fn num_row_groups(&self) -> usize { + self.row_groups.len() + } + + /// Returns row group metadata for `i`th position. + /// Position should be less than number of row groups `num_row_groups`. 
+ pub fn row_group(&self, i: usize) -> RowGroupMetaDataPtr { + self.row_groups[i].clone() + } + + /// Returns slice of row group reference counted pointers in this file. + pub fn row_groups(&self) -> &[RowGroupMetaDataPtr] { + &self.row_groups.as_slice() + } +} + +/// Reference counted pointer for [`FileMetaData`]. +pub type FileMetaDataPtr = Rc; + +/// Metadata for a Parquet file. +pub struct FileMetaData { + version: i32, + num_rows: i64, + created_by: Option, + schema: TypePtr, + schema_descr: SchemaDescPtr, + column_orders: Option>, +} + +impl FileMetaData { + /// Creates new file metadata. + pub fn new( + version: i32, + num_rows: i64, + created_by: Option, + schema: TypePtr, + schema_descr: SchemaDescPtr, + column_orders: Option>, + ) -> Self { + FileMetaData { + version, + num_rows, + created_by, + schema, + schema_descr, + column_orders, + } + } + + /// Returns version of this file. + pub fn version(&self) -> i32 { + self.version + } + + /// Returns number of rows in the file. + pub fn num_rows(&self) -> i64 { + self.num_rows + } + + /// String message for application that wrote this file. + /// + /// This should have the following format: + /// ` version (build )`. + /// + /// ```shell + /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b) + /// ``` + pub fn created_by(&self) -> &Option { + &self.created_by + } + + /// Returns Parquet ['Type`] that describes schema in this file. + pub fn schema(&self) -> &SchemaType { + self.schema.as_ref() + } + + /// Returns a reference to schema descriptor. + pub fn schema_descr(&self) -> &SchemaDescriptor { + &self.schema_descr + } + + /// Returns reference counted clone for schema descriptor. + pub fn schema_descr_ptr(&self) -> SchemaDescPtr { + self.schema_descr.clone() + } + + /// Column (sort) order used for `min` and `max` values of each column in this file. + /// + /// Each column order corresponds to one column, determined by its position in the list, + /// matching the position of the column in the schema. + /// + /// When `None` is returned, there are no column orders available, and each column + /// should be assumed to have undefined (legacy) column order. + pub fn column_orders(&self) -> Option<&Vec> { + self.column_orders.as_ref() + } + + /// Returns column order for `i`th column in this file. + /// If column orders are not available, returns undefined (legacy) column order. + pub fn column_order(&self, i: usize) -> ColumnOrder { + self.column_orders + .as_ref() + .map(|data| data[i]) + .unwrap_or(ColumnOrder::UNDEFINED) + } +} + +/// Reference counted pointer for [`RowGroupMetaData`]. +pub type RowGroupMetaDataPtr = Rc; + +/// Metadata for a row group. +pub struct RowGroupMetaData { + columns: Vec, + num_rows: i64, + total_byte_size: i64, + schema_descr: SchemaDescPtr, +} + +impl RowGroupMetaData { + /// Returns builer for row group metadata. + pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder { + RowGroupMetaDataBuilder::new(schema_descr) + } + + /// Number of columns in this row group. + pub fn num_columns(&self) -> usize { + self.columns.len() + } + + /// Returns column chunk metadata for `i`th column. + pub fn column(&self, i: usize) -> &ColumnChunkMetaData { + &self.columns[i] + } + + /// Returns slice of column chunk metadata [`Rc`] pointers. + pub fn columns(&self) -> &[ColumnChunkMetaDataPtr] { + &self.columns + } + + /// Number of rows in this row group. 
+ pub fn num_rows(&self) -> i64 { + self.num_rows + } + + /// Total byte size of all uncompressed column data in this row group. + pub fn total_byte_size(&self) -> i64 { + self.total_byte_size + } + + /// Returns reference to a schema descriptor. + pub fn schema_descr(&self) -> &SchemaDescriptor { + self.schema_descr.as_ref() + } + + /// Returns reference counted clone of schema descriptor. + pub fn schema_descr_ptr(&self) -> SchemaDescPtr { + self.schema_descr.clone() + } + + /// Method to convert from Thrift. + pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result { + assert_eq!(schema_descr.num_columns(), rg.columns.len()); + let total_byte_size = rg.total_byte_size; + let num_rows = rg.num_rows; + let mut columns = vec![]; + for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) { + let cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; + columns.push(Rc::new(cc)); + } + Ok(RowGroupMetaData { + columns, + num_rows, + total_byte_size, + schema_descr, + }) + } + + /// Method to convert to Thrift. + pub fn to_thrift(&self) -> RowGroup { + RowGroup { + columns: self.columns().into_iter().map(|v| v.to_thrift()).collect(), + total_byte_size: self.total_byte_size, + num_rows: self.num_rows, + sorting_columns: None, + } + } +} + +/// Builder for row group metadata. +pub struct RowGroupMetaDataBuilder { + columns: Vec, + schema_descr: SchemaDescPtr, + num_rows: i64, + total_byte_size: i64, +} + +impl RowGroupMetaDataBuilder { + /// Creates new builder from schema descriptor. + fn new(schema_descr: SchemaDescPtr) -> Self { + Self { + columns: Vec::with_capacity(schema_descr.num_columns()), + schema_descr, + num_rows: 0, + total_byte_size: 0, + } + } + + /// Sets number of rows in this row group. + pub fn set_num_rows(mut self, value: i64) -> Self { + self.num_rows = value; + self + } + + /// Sets total size in bytes for this row group. + pub fn set_total_byte_size(mut self, value: i64) -> Self { + self.total_byte_size = value; + self + } + + /// Sets column metadata for this row group. + pub fn set_column_metadata(mut self, value: Vec) -> Self { + self.columns = value; + self + } + + /// Builds row group metadata. + pub fn build(self) -> Result { + if self.schema_descr.num_columns() != self.columns.len() { + return Err(general_err!( + "Column length mismatch: {} != {}", + self.schema_descr.num_columns(), + self.columns.len() + )); + } + + Ok(RowGroupMetaData { + columns: self.columns, + num_rows: self.num_rows, + total_byte_size: self.total_byte_size, + schema_descr: self.schema_descr, + }) + } +} + +/// Reference counted pointer for [`ColumnChunkMetaData`]. +pub type ColumnChunkMetaDataPtr = Rc; + +/// Metadata for a column chunk. +pub struct ColumnChunkMetaData { + column_type: Type, + column_path: ColumnPath, + column_descr: ColumnDescPtr, + encodings: Vec, + file_path: Option, + file_offset: i64, + num_values: i64, + compression: Compression, + total_compressed_size: i64, + total_uncompressed_size: i64, + data_page_offset: i64, + index_page_offset: Option, + dictionary_page_offset: Option, + statistics: Option, +} + +/// Represents common operations for a column chunk. +impl ColumnChunkMetaData { + /// Returns builder for column chunk metadata. + pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder { + ColumnChunkMetaDataBuilder::new(column_descr) + } + + /// File where the column chunk is stored. + /// + /// If not set, assumed to belong to the same file as the metadata. + /// This path is relative to the current file. 
+ pub fn file_path(&self) -> Option<&String> { + self.file_path.as_ref() + } + + /// Byte offset in `file_path()`. + pub fn file_offset(&self) -> i64 { + self.file_offset + } + + /// Type of this column. Must be primitive. + pub fn column_type(&self) -> Type { + self.column_type + } + + /// Path (or identifier) of this column. + pub fn column_path(&self) -> &ColumnPath { + &self.column_path + } + + /// Descriptor for this column. + pub fn column_descr(&self) -> &ColumnDescriptor { + self.column_descr.as_ref() + } + + /// Reference counted clone of descriptor for this column. + pub fn column_descr_ptr(&self) -> ColumnDescPtr { + self.column_descr.clone() + } + + /// All encodings used for this column. + pub fn encodings(&self) -> &Vec { + &self.encodings + } + + /// Total number of values in this column chunk. + pub fn num_values(&self) -> i64 { + self.num_values + } + + /// Compression for this column. + pub fn compression(&self) -> Compression { + self.compression + } + + /// Returns the total compressed data size of this column chunk. + pub fn compressed_size(&self) -> i64 { + self.total_compressed_size + } + + /// Returns the total uncompressed data size of this column chunk. + pub fn uncompressed_size(&self) -> i64 { + self.total_uncompressed_size + } + + /// Returns the offset for the column data. + pub fn data_page_offset(&self) -> i64 { + self.data_page_offset + } + + /// Returns `true` if this column chunk contains a index page, `false` otherwise. + pub fn has_index_page(&self) -> bool { + self.index_page_offset.is_some() + } + + /// Returns the offset for the index page. + pub fn index_page_offset(&self) -> Option { + self.index_page_offset + } + + /// Returns `true` if this column chunk contains a dictionary page, `false` otherwise. + pub fn has_dictionary_page(&self) -> bool { + self.dictionary_page_offset.is_some() + } + + /// Returns the offset for the dictionary page, if any. + pub fn dictionary_page_offset(&self) -> Option { + self.dictionary_page_offset + } + + /// Returns statistics that are set for this column chunk, + /// or `None` if no statistics are available. + pub fn statistics(&self) -> Option<&Statistics> { + self.statistics.as_ref() + } + + /// Method to convert from Thrift. + pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { + if cc.meta_data.is_none() { + return Err(general_err!("Expected to have column metadata")); + } + let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap(); + let column_type = Type::from(col_metadata.type_); + let column_path = ColumnPath::new(col_metadata.path_in_schema); + let encodings = col_metadata + .encodings + .drain(0..) 
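As an illustration of the chunk-level accessors above (a sketch, not part of the patch), the following walks a row group and reports the on-disk compression ratio and dictionary usage of each column chunk.

```rust
use arrow::parquet::file::metadata::RowGroupMetaData;

fn print_chunk_stats(rg: &RowGroupMetaData) {
    for i in 0..rg.num_columns() {
        let chunk = rg.column(i);
        // Guard against an empty chunk to avoid dividing by zero.
        let ratio = if chunk.compressed_size() > 0 {
            chunk.uncompressed_size() as f64 / chunk.compressed_size() as f64
        } else {
            1.0
        };
        println!(
            "column {} ({:?}, {:?}): {} values, {:.2}x compression, dictionary page: {}",
            i,
            chunk.column_type(),
            chunk.compression(),
            chunk.num_values(),
            ratio,
            chunk.has_dictionary_page()
        );
    }
}
```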
+ .map(Encoding::from) + .collect(); + let compression = Compression::from(col_metadata.codec); + let file_path = cc.file_path; + let file_offset = cc.file_offset; + let num_values = col_metadata.num_values; + let total_compressed_size = col_metadata.total_compressed_size; + let total_uncompressed_size = col_metadata.total_uncompressed_size; + let data_page_offset = col_metadata.data_page_offset; + let index_page_offset = col_metadata.index_page_offset; + let dictionary_page_offset = col_metadata.dictionary_page_offset; + let statistics = statistics::from_thrift(column_type, col_metadata.statistics); + let result = ColumnChunkMetaData { + column_type, + column_path, + column_descr, + encodings, + file_path, + file_offset, + num_values, + compression, + total_compressed_size, + total_uncompressed_size, + data_page_offset, + index_page_offset, + dictionary_page_offset, + statistics, + }; + Ok(result) + } + + /// Method to convert to Thrift. + pub fn to_thrift(&self) -> ColumnChunk { + let column_metadata = ColumnMetaData { + type_: self.column_type.into(), + encodings: self.encodings().into_iter().map(|&v| v.into()).collect(), + path_in_schema: Vec::from(self.column_path.as_ref()), + codec: self.compression.into(), + num_values: self.num_values, + total_uncompressed_size: self.total_uncompressed_size, + total_compressed_size: self.total_compressed_size, + key_value_metadata: None, + data_page_offset: self.data_page_offset, + index_page_offset: self.index_page_offset, + dictionary_page_offset: self.dictionary_page_offset, + statistics: statistics::to_thrift(self.statistics.as_ref()), + encoding_stats: None, + }; + + ColumnChunk { + file_path: self.file_path().map(|v| v.clone()), + file_offset: self.file_offset, + meta_data: Some(column_metadata), + offset_index_offset: None, + offset_index_length: None, + column_index_offset: None, + column_index_length: None, + } + } +} + +/// Builder for column chunk metadata. +pub struct ColumnChunkMetaDataBuilder { + column_descr: ColumnDescPtr, + encodings: Vec, + file_path: Option, + file_offset: i64, + num_values: i64, + compression: Compression, + total_compressed_size: i64, + total_uncompressed_size: i64, + data_page_offset: i64, + index_page_offset: Option, + dictionary_page_offset: Option, + statistics: Option, +} + +impl ColumnChunkMetaDataBuilder { + /// Creates new column chunk metadata builder. + fn new(column_descr: ColumnDescPtr) -> Self { + Self { + column_descr, + encodings: Vec::new(), + file_path: None, + file_offset: 0, + num_values: 0, + compression: Compression::UNCOMPRESSED, + total_compressed_size: 0, + total_uncompressed_size: 0, + data_page_offset: 0, + index_page_offset: None, + dictionary_page_offset: None, + statistics: None, + } + } + + /// Sets list of encodings for this column chunk. + pub fn set_encodings(mut self, encodings: Vec) -> Self { + self.encodings = encodings; + self + } + + /// Sets optional file path for this column chunk. + pub fn set_file_path(mut self, value: String) -> Self { + self.file_path = Some(value); + self + } + + /// Sets file offset in bytes. + pub fn set_file_offset(mut self, value: i64) -> Self { + self.file_offset = value; + self + } + + /// Sets number of values. + pub fn set_num_values(mut self, value: i64) -> Self { + self.num_values = value; + self + } + + /// Sets compression. + pub fn set_compression(mut self, value: Compression) -> Self { + self.compression = value; + self + } + + /// Sets total compressed size in bytes. 
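A hedged helper sketch showing how the offsets above combine into the byte range a column chunk occupies in the file; this mirrors the logic used by `SerializedRowGroupReader::get_column_page_reader` later in this patch and is not part of the change itself.

```rust
use arrow::parquet::file::metadata::ColumnChunkMetaData;

/// (start, length) of the column chunk within the file: the chunk begins at
/// the dictionary page when present, otherwise at the first data page.
fn byte_range(chunk: &ColumnChunkMetaData) -> (i64, i64) {
    let start = chunk
        .dictionary_page_offset()
        .unwrap_or_else(|| chunk.data_page_offset());
    (start, chunk.compressed_size())
}
```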
+ pub fn set_total_compressed_size(mut self, value: i64) -> Self { + self.total_compressed_size = value; + self + } + + /// Sets total uncompressed size in bytes. + pub fn set_total_uncompressed_size(mut self, value: i64) -> Self { + self.total_uncompressed_size = value; + self + } + + /// Sets data page offset in bytes. + pub fn set_data_page_offset(mut self, value: i64) -> Self { + self.data_page_offset = value; + self + } + + /// Sets optional dictionary page ofset in bytes. + pub fn set_dictionary_page_offset(mut self, value: Option) -> Self { + self.dictionary_page_offset = value; + self + } + + /// Sets optional index page offset in bytes. + pub fn set_index_page_offset(mut self, value: Option) -> Self { + self.index_page_offset = value; + self + } + + /// Sets statistics for this column chunk. + pub fn set_statistics(mut self, value: Statistics) -> Self { + self.statistics = Some(value); + self + } + + /// Builds column chunk metadata. + pub fn build(self) -> Result { + Ok(ColumnChunkMetaData { + column_type: self.column_descr.physical_type(), + column_path: self.column_descr.path().clone(), + column_descr: self.column_descr, + encodings: self.encodings, + file_path: self.file_path, + file_offset: self.file_offset, + num_values: self.num_values, + compression: self.compression, + total_compressed_size: self.total_compressed_size, + total_uncompressed_size: self.total_uncompressed_size, + data_page_offset: self.data_page_offset, + index_page_offset: self.index_page_offset, + dictionary_page_offset: self.dictionary_page_offset, + statistics: self.statistics, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_row_group_metadata_thrift_conversion() { + let schema_descr = get_test_schema_descr(); + + let mut columns = vec![]; + for ptr in schema_descr.columns() { + let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap(); + columns.push(Rc::new(column)); + } + let row_group_meta = RowGroupMetaData::builder(schema_descr.clone()) + .set_num_rows(1000) + .set_total_byte_size(2000) + .set_column_metadata(columns) + .build() + .unwrap(); + + let row_group_exp = row_group_meta.to_thrift(); + let row_group_res = + RowGroupMetaData::from_thrift(schema_descr.clone(), row_group_exp.clone()) + .unwrap() + .to_thrift(); + + assert_eq!(row_group_res, row_group_exp); + } + + #[test] + fn test_row_group_metadata_thrift_conversion_empty() { + let schema_descr = get_test_schema_descr(); + + let row_group_meta = RowGroupMetaData::builder(schema_descr.clone()).build(); + + assert!(row_group_meta.is_err()); + if let Err(e) = row_group_meta { + assert_eq!( + e.to_string(), + "Parquet error: Column length mismatch: 2 != 0" + ); + } + } + + #[test] + fn test_column_chunk_metadata_thrift_conversion() { + let column_descr = get_test_schema_descr().column(0); + + let col_metadata = ColumnChunkMetaData::builder(column_descr.clone()) + .set_encodings(vec![Encoding::PLAIN, Encoding::RLE]) + .set_file_path("file_path".to_owned()) + .set_file_offset(100) + .set_num_values(1000) + .set_compression(Compression::SNAPPY) + .set_total_compressed_size(2000) + .set_total_uncompressed_size(3000) + .set_data_page_offset(4000) + .set_dictionary_page_offset(Some(5000)) + .build() + .unwrap(); + + let col_chunk_exp = col_metadata.to_thrift(); + + let col_chunk_res = + ColumnChunkMetaData::from_thrift(column_descr.clone(), col_chunk_exp.clone()) + .unwrap() + .to_thrift(); + + assert_eq!(col_chunk_res, col_chunk_exp); + } + + #[test] + fn 
test_column_chunk_metadata_thrift_conversion_empty() { + let column_descr = get_test_schema_descr().column(0); + + let col_metadata = ColumnChunkMetaData::builder(column_descr.clone()) + .build() + .unwrap(); + + let col_chunk_exp = col_metadata.to_thrift(); + let col_chunk_res = + ColumnChunkMetaData::from_thrift(column_descr.clone(), col_chunk_exp.clone()) + .unwrap() + .to_thrift(); + + assert_eq!(col_chunk_res, col_chunk_exp); + } + + /// Returns sample schema descriptor so we can create column metadata. + fn get_test_schema_descr() -> SchemaDescPtr { + let schema = SchemaType::group_type_builder("schema") + .with_fields(&mut vec![ + Rc::new( + SchemaType::primitive_type_builder("a", Type::INT32) + .build() + .unwrap(), + ), + Rc::new( + SchemaType::primitive_type_builder("b", Type::INT32) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(); + + Rc::new(SchemaDescriptor::new(Rc::new(schema))) + } +} diff --git a/rust/src/parquet/file/mod.rs b/rust/src/parquet/file/mod.rs new file mode 100644 index 0000000000000..ebaebbad0bb6f --- /dev/null +++ b/rust/src/parquet/file/mod.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Main entrypoint for working with Parquet API. +//! +//! Provides access to file and row group readers and writers, record API, metadata, etc. +//! +//! See [`reader::SerializedFileReader`](reader/struct.SerializedFileReader.html) or +//! [`writer::SerializedFileWriter`](writer/struct.SerializedFileWriter.html) for a +//! starting reference, [`metadata::ParquetMetaData`](metadata/index.html) for file +//! metadata, and [`statistics`](statistics/index.html) for working with statistics. +//! +//! # Example of writing a new file +//! +//! ```rust +//! use std::{fs, path::Path, rc::Rc}; +//! +//! use arrow::parquet::{ +//! file::{ +//! properties::WriterProperties, +//! writer::{FileWriter, SerializedFileWriter}, +//! }, +//! schema::parser::parse_message_type, +//! }; +//! +//! let path = Path::new("target/debug/examples/sample.parquet"); +//! +//! let message_type = " +//! message schema { +//! REQUIRED INT32 b; +//! } +//! "; +//! let schema = Rc::new(parse_message_type(message_type).unwrap()); +//! let props = Rc::new(WriterProperties::builder().build()); +//! let file = fs::File::create(&path).unwrap(); +//! let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); +//! let mut row_group_writer = writer.next_row_group().unwrap(); +//! while let Some(mut col_writer) = row_group_writer.next_column().unwrap() { +//! // ... write values to a column writer +//! row_group_writer.close_column(col_writer).unwrap(); +//! } +//! writer.close_row_group(row_group_writer).unwrap(); +//! writer.close().unwrap(); +//! +//! let bytes = fs::read(&path).unwrap(); +//! 
assert_eq!(&bytes[0..4], &[b'P', b'A', b'R', b'1']); +//! ``` +//! # Example of reading an existing file +//! +//! ```rust +//! use arrow::parquet::file::reader::{FileReader, SerializedFileReader}; +//! use std::{fs::File, path::Path}; +//! +//! let path = Path::new("target/debug/examples/sample.parquet"); +//! if let Ok(file) = File::open(&path) { +//! let file = File::open(&path).unwrap(); +//! let reader = SerializedFileReader::new(file).unwrap(); +//! +//! let parquet_metadata = reader.metadata(); +//! assert_eq!(parquet_metadata.num_row_groups(), 1); +//! +//! let row_group_reader = reader.get_row_group(0).unwrap(); +//! assert_eq!(row_group_reader.num_columns(), 1); +//! } +//! ``` + +pub mod metadata; +pub mod properties; +pub mod reader; +pub mod statistics; +pub mod writer; + +const FOOTER_SIZE: usize = 8; +const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1']; diff --git a/rust/src/parquet/file/properties.rs b/rust/src/parquet/file/properties.rs new file mode 100644 index 0000000000000..911ec55733490 --- /dev/null +++ b/rust/src/parquet/file/properties.rs @@ -0,0 +1,648 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Writer properties. +//! +//! # Usage +//! +//! ```rust +//! use arrow::parquet::{ +//! basic::{Compression, Encoding}, +//! file::properties::*, +//! schema::types::ColumnPath, +//! }; +//! +//! // Create properties with default configuration. +//! let props = WriterProperties::builder().build(); +//! +//! // Use properties builder to set certain options and assemble the configuration. +//! let props = WriterProperties::builder() +//! .set_writer_version(WriterVersion::PARQUET_1_0) +//! .set_encoding(Encoding::PLAIN) +//! .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED) +//! .set_compression(Compression::SNAPPY) +//! .build(); +//! +//! assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0); +//! assert_eq!( +//! props.encoding(&ColumnPath::from("col1")), +//! Some(Encoding::DELTA_BINARY_PACKED) +//! ); +//! assert_eq!( +//! props.encoding(&ColumnPath::from("col2")), +//! Some(Encoding::PLAIN) +//! ); +//! 
``` + +use std::{collections::HashMap, rc::Rc}; + +use crate::parquet::basic::{Compression, Encoding}; +use crate::parquet::schema::types::ColumnPath; + +const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; +const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; +const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0; +const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; +const DEFAULT_DICTIONARY_ENABLED: bool = true; +const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; +const DEFAULT_STATISTICS_ENABLED: bool = true; +const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; +const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 128 * 1024 * 1024; +const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY"); + +/// Parquet writer version. +/// +/// Basic constant, which is not part of the Thrift definition. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum WriterVersion { + PARQUET_1_0, + PARQUET_2_0, +} + +impl WriterVersion { + /// Returns writer version as `i32`. + pub fn as_num(&self) -> i32 { + match self { + WriterVersion::PARQUET_1_0 => 1, + WriterVersion::PARQUET_2_0 => 2, + } + } +} + +/// Reference counted writer properties. +pub type WriterPropertiesPtr = Rc; + +/// Writer properties. +/// +/// It is created as an immutable data structure, use [`WriterPropertiesBuilder`] to +/// assemble the properties. +#[derive(Debug, Clone)] +pub struct WriterProperties { + data_pagesize_limit: usize, + dictionary_pagesize_limit: usize, + write_batch_size: usize, + max_row_group_size: usize, + writer_version: WriterVersion, + created_by: String, + default_column_properties: ColumnProperties, + column_properties: HashMap, +} + +impl WriterProperties { + /// Returns builder for writer properties with default values. + pub fn builder() -> WriterPropertiesBuilder { + WriterPropertiesBuilder::with_defaults() + } + + /// Returns data page size limit. + pub fn data_pagesize_limit(&self) -> usize { + self.data_pagesize_limit + } + + /// Returns dictionary page size limit. + pub fn dictionary_pagesize_limit(&self) -> usize { + self.dictionary_pagesize_limit + } + + /// Returns configured batch size for writes. + /// + /// When writing a batch of data, this setting allows to split it internally into + /// smaller batches so we can better estimate the size of a page currently being + /// written. + pub fn write_batch_size(&self) -> usize { + self.write_batch_size + } + + /// Returns max size for a row group. + pub fn max_row_group_size(&self) -> usize { + self.max_row_group_size + } + + /// Returns configured writer version. + pub fn writer_version(&self) -> WriterVersion { + self.writer_version + } + + /// Returns `created_by` string. + pub fn created_by(&self) -> &str { + &self.created_by + } + + /// Returns encoding for a data page, when dictionary encoding is enabled. + /// This is not configurable. + #[inline] + pub fn dictionary_data_page_encoding(&self) -> Encoding { + // PLAIN_DICTIONARY encoding is deprecated in writer version 1. + // Dictionary values are encoded using RLE_DICTIONARY encoding. + Encoding::RLE_DICTIONARY + } + + /// Returns encoding for dictionary page, when dictionary encoding is enabled. + /// This is not configurable. + #[inline] + pub fn dictionary_page_encoding(&self) -> Encoding { + // PLAIN_DICTIONARY is deprecated in writer version 1. + // Dictionary is encoded using plain encoding. + Encoding::PLAIN + } + + /// Returns encoding for a column, if set. + /// In case when dictionary is enabled, returns fallback encoding. 
+ /// + /// If encoding is not set, then column writer will choose the best encoding + /// based on the column type. + pub fn encoding(&self, col: &ColumnPath) -> Option { + self.column_properties + .get(col) + .and_then(|c| c.encoding()) + .or_else(|| self.default_column_properties.encoding()) + } + + /// Returns compression codec for a column. + pub fn compression(&self, col: &ColumnPath) -> Compression { + self.column_properties + .get(col) + .and_then(|c| c.compression()) + .or_else(|| self.default_column_properties.compression()) + .unwrap_or(DEFAULT_COMPRESSION) + } + + /// Returns `true` if dictionary encoding is enabled for a column. + pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool { + self.column_properties + .get(col) + .and_then(|c| c.dictionary_enabled()) + .or_else(|| self.default_column_properties.dictionary_enabled()) + .unwrap_or(DEFAULT_DICTIONARY_ENABLED) + } + + /// Returns `true` if statistics are enabled for a column. + pub fn statistics_enabled(&self, col: &ColumnPath) -> bool { + self.column_properties + .get(col) + .and_then(|c| c.statistics_enabled()) + .or_else(|| self.default_column_properties.statistics_enabled()) + .unwrap_or(DEFAULT_STATISTICS_ENABLED) + } + + /// Returns max size for statistics. + /// Only applicable if statistics are enabled. + pub fn max_statistics_size(&self, col: &ColumnPath) -> usize { + self.column_properties + .get(col) + .and_then(|c| c.max_statistics_size()) + .or_else(|| self.default_column_properties.max_statistics_size()) + .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE) + } +} + +/// Writer properties builder. +pub struct WriterPropertiesBuilder { + data_pagesize_limit: usize, + dictionary_pagesize_limit: usize, + write_batch_size: usize, + max_row_group_size: usize, + writer_version: WriterVersion, + created_by: String, + default_column_properties: ColumnProperties, + column_properties: HashMap, +} + +impl WriterPropertiesBuilder { + /// Returns default state of the builder. + fn with_defaults() -> Self { + Self { + data_pagesize_limit: DEFAULT_PAGE_SIZE, + dictionary_pagesize_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, + write_batch_size: DEFAULT_WRITE_BATCH_SIZE, + max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE, + writer_version: DEFAULT_WRITER_VERSION, + created_by: DEFAULT_CREATED_BY.to_string(), + default_column_properties: ColumnProperties::new(), + column_properties: HashMap::new(), + } + } + + /// Finalizes the configuration and returns immutable writer properties struct. + pub fn build(self) -> WriterProperties { + WriterProperties { + data_pagesize_limit: self.data_pagesize_limit, + dictionary_pagesize_limit: self.dictionary_pagesize_limit, + write_batch_size: self.write_batch_size, + max_row_group_size: self.max_row_group_size, + writer_version: self.writer_version, + created_by: self.created_by, + default_column_properties: self.default_column_properties, + column_properties: self.column_properties, + } + } + + // ---------------------------------------------------------------------- + // Writer properies related to a file + + /// Sets writer version. + pub fn set_writer_version(mut self, value: WriterVersion) -> Self { + self.writer_version = value; + self + } + + /// Sets data page size limit. + pub fn set_data_pagesize_limit(mut self, value: usize) -> Self { + self.data_pagesize_limit = value; + self + } + + /// Sets dictionary page size limit. + pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self { + self.dictionary_pagesize_limit = value; + self + } + + /// Sets write batch size. 
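The per-column getters above resolve values in three steps: an explicit per-column setting, then the global value set on the builder, then a built-in default. A short sketch of that resolution order (illustrative only; the column names are placeholders):

```rust
use arrow::parquet::basic::{Compression, Encoding};
use arrow::parquet::file::properties::WriterProperties;
use arrow::parquet::schema::types::ColumnPath;

fn main() {
    let props = WriterProperties::builder()
        // Global defaults, applied to any column without an override.
        .set_encoding(Encoding::DELTA_BINARY_PACKED)
        .set_compression(Compression::GZIP)
        // Per-column override, takes precedence over the globals above.
        .set_column_compression(ColumnPath::from("id"), Compression::SNAPPY)
        .build();

    // Per-column setting wins.
    assert_eq!(props.compression(&ColumnPath::from("id")), Compression::SNAPPY);
    // No override for this column, so the global default applies.
    assert_eq!(props.compression(&ColumnPath::from("other")), Compression::GZIP);
    // Encoding was only set globally, so every column sees it.
    assert_eq!(
        props.encoding(&ColumnPath::from("id")),
        Some(Encoding::DELTA_BINARY_PACKED)
    );
}
```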
+ pub fn set_write_batch_size(mut self, value: usize) -> Self { + self.write_batch_size = value; + self + } + + /// Sets max size for a row group. + pub fn set_max_row_group_size(mut self, value: usize) -> Self { + self.max_row_group_size = value; + self + } + + /// Sets "created by" property. + pub fn set_created_by(mut self, value: String) -> Self { + self.created_by = value; + self + } + + // ---------------------------------------------------------------------- + // Setters for any column (global) + + /// Sets encoding for any column. + /// + /// If dictionary is not enabled, this is treated as a primary encoding for all columns. + /// In case when dictionary is enabled for any column, this value is considered to + /// be a fallback encoding for that column. + /// + /// Panics if user tries to set dictionary encoding here, regardless of dictinoary + /// encoding flag being set. + pub fn set_encoding(mut self, value: Encoding) -> Self { + self.default_column_properties.set_encoding(value); + self + } + + /// Sets compression codec for any column. + pub fn set_compression(mut self, value: Compression) -> Self { + self.default_column_properties.set_compression(value); + self + } + + /// Sets flag to enable/disable dictionary encoding for any column. + /// + /// Use this method to set dictionary encoding, instead of explicitly specifying + /// encoding in `set_encoding` method. + pub fn set_dictionary_enabled(mut self, value: bool) -> Self { + self.default_column_properties.set_dictionary_enabled(value); + self + } + + /// Sets flag to enable/disable statistics for any column. + pub fn set_statistics_enabled(mut self, value: bool) -> Self { + self.default_column_properties.set_statistics_enabled(value); + self + } + + /// Sets max statistics size for any column. + /// Applicable only if statistics are enabled. + pub fn set_max_statistics_size(mut self, value: usize) -> Self { + self.default_column_properties + .set_max_statistics_size(value); + self + } + + // ---------------------------------------------------------------------- + // Setters for a specific column + + /// Helper method to get existing or new mutable reference of column properties. + #[inline] + fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties { + self.column_properties + .entry(col) + .or_insert(ColumnProperties::new()) + } + + /// Sets encoding for a column. + /// Takes precedence over globally defined settings. + /// + /// If dictionary is not enabled, this is treated as a primary encoding for this column. + /// In case when dictionary is enabled for this column, either through global defaults + /// or explicitly, this value is considered to be a fallback encoding for this column. + /// + /// Panics if user tries to set dictionary encoding here, regardless of dictinoary + /// encoding flag being set. + pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self { + self.get_mut_props(col).set_encoding(value); + self + } + + /// Sets compression codec for a column. + /// Takes precedence over globally defined settings. + pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self { + self.get_mut_props(col).set_compression(value); + self + } + + /// Sets flag to enable/disable dictionary encoding for a column. + /// Takes precedence over globally defined settings. 
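Because dictionary encoding is driven by the dedicated flag rather than by `set_encoding`, enabling it while still choosing a fallback encoding looks like the sketch below (illustrative; the column name is a placeholder). Passing `PLAIN_DICTIONARY` or `RLE_DICTIONARY` to `set_encoding` or `set_column_encoding` panics, as the tests later in this file demonstrate.

```rust
use arrow::parquet::basic::Encoding;
use arrow::parquet::file::properties::WriterProperties;
use arrow::parquet::schema::types::ColumnPath;

fn main() {
    let props = WriterProperties::builder()
        // Turn dictionary encoding on (here globally)...
        .set_dictionary_enabled(true)
        // ...and pick the fallback encoding used when dictionary encoding
        // cannot be applied to this column.
        .set_column_encoding(ColumnPath::from("col"), Encoding::PLAIN)
        .build();

    assert!(props.dictionary_enabled(&ColumnPath::from("col")));
    assert_eq!(props.encoding(&ColumnPath::from("col")), Some(Encoding::PLAIN));
}
```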
+ pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self { + self.get_mut_props(col).set_dictionary_enabled(value); + self + } + + /// Sets flag to enable/disable statistics for a column. + /// Takes precedence over globally defined settings. + pub fn set_column_statistics_enabled(mut self, col: ColumnPath, value: bool) -> Self { + self.get_mut_props(col).set_statistics_enabled(value); + self + } + + /// Sets max size for statistics for a column. + /// Takes precedence over globally defined settings. + pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self { + self.get_mut_props(col).set_max_statistics_size(value); + self + } +} + +/// Container for column properties that can be changed as part of writer. +/// +/// If a field is `None`, it means that no specific value has been set for this column, +/// so some subsequent or default value must be used. +#[derive(Debug, Clone, PartialEq)] +struct ColumnProperties { + encoding: Option, + codec: Option, + dictionary_enabled: Option, + statistics_enabled: Option, + max_statistics_size: Option, +} + +impl ColumnProperties { + /// Initialise column properties with default values. + fn new() -> Self { + Self { + encoding: None, + codec: None, + dictionary_enabled: None, + statistics_enabled: None, + max_statistics_size: None, + } + } + + /// Sets encoding for this column. + /// + /// If dictionary is not enabled, this is treated as a primary encoding for a column. + /// In case when dictionary is enabled for a column, this value is considered to + /// be a fallback encoding. + /// + /// Panics if user tries to set dictionary encoding here, regardless of dictinoary + /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary + /// for a column. + fn set_encoding(&mut self, value: Encoding) { + if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY { + panic!("Dictionary encoding can not be used as fallback encoding"); + } + self.encoding = Some(value); + } + + /// Sets compression codec for this column. + fn set_compression(&mut self, value: Compression) { + self.codec = Some(value); + } + + /// Sets whether or not dictionary encoding is enabled for this column. + fn set_dictionary_enabled(&mut self, enabled: bool) { + self.dictionary_enabled = Some(enabled); + } + + /// Sets whether or not statistics are enabled for this column. + fn set_statistics_enabled(&mut self, enabled: bool) { + self.statistics_enabled = Some(enabled); + } + + /// Sets max size for statistics for this column. + fn set_max_statistics_size(&mut self, value: usize) { + self.max_statistics_size = Some(value); + } + + /// Returns optional encoding for this column. + fn encoding(&self) -> Option { + self.encoding + } + + /// Returns optional compression codec for this column. + fn compression(&self) -> Option { + self.codec + } + + /// Returns `Some(true)` if dictionary encoding is enabled for this column, if disabled + /// then returns `Some(false)`. If result is `None`, then no setting has been provided. + fn dictionary_enabled(&self) -> Option { + self.dictionary_enabled + } + + /// Returns `Some(true)` if statistics are enabled for this column, if disabled then + /// returns `Some(false)`. If result is `None`, then no setting has been provided. + fn statistics_enabled(&self) -> Option { + self.statistics_enabled + } + + /// Returns optional max size in bytes for statistics. 
+ fn max_statistics_size(&self) -> Option { + self.max_statistics_size + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_writer_version() { + assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1); + assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2); + } + + #[test] + fn test_writer_properties_default_settings() { + let props = WriterProperties::builder().build(); + assert_eq!(props.data_pagesize_limit(), DEFAULT_PAGE_SIZE); + assert_eq!( + props.dictionary_pagesize_limit(), + DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT + ); + assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE); + assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE); + assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION); + assert_eq!(props.created_by(), DEFAULT_CREATED_BY); + assert_eq!(props.encoding(&ColumnPath::from("col")), None); + assert_eq!( + props.compression(&ColumnPath::from("col")), + DEFAULT_COMPRESSION + ); + assert_eq!( + props.dictionary_enabled(&ColumnPath::from("col")), + DEFAULT_DICTIONARY_ENABLED + ); + assert_eq!( + props.statistics_enabled(&ColumnPath::from("col")), + DEFAULT_STATISTICS_ENABLED + ); + assert_eq!( + props.max_statistics_size(&ColumnPath::from("col")), + DEFAULT_MAX_STATISTICS_SIZE + ); + } + + #[test] + fn test_writer_properties_dictionary_encoding() { + // dictionary encoding is not configurable, and it should be the same for both + // writer version 1 and 2. + for version in vec![WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] { + let props = WriterProperties::builder() + .set_writer_version(version) + .build(); + assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN); + assert_eq!( + props.dictionary_data_page_encoding(), + Encoding::RLE_DICTIONARY + ); + } + } + + #[test] + #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")] + fn test_writer_properties_panic_when_plain_dictionary_is_fallback() { + // Should panic when user specifies dictionary encoding as fallback encoding. + WriterProperties::builder() + .set_encoding(Encoding::PLAIN_DICTIONARY) + .build(); + } + + #[test] + #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")] + fn test_writer_properties_panic_when_rle_dictionary_is_fallback() { + // Should panic when user specifies dictionary encoding as fallback encoding. 
+ WriterProperties::builder() + .set_encoding(Encoding::RLE_DICTIONARY) + .build(); + } + + #[test] + #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")] + fn test_writer_properties_panic_when_dictionary_is_enabled() { + WriterProperties::builder() + .set_dictionary_enabled(true) + .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY) + .build(); + } + + #[test] + #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")] + fn test_writer_properties_panic_when_dictionary_is_disabled() { + WriterProperties::builder() + .set_dictionary_enabled(false) + .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY) + .build(); + } + + #[test] + fn test_writer_properties_builder() { + let props = WriterProperties::builder() + // file settings + .set_writer_version(WriterVersion::PARQUET_2_0) + .set_data_pagesize_limit(10) + .set_dictionary_pagesize_limit(20) + .set_write_batch_size(30) + .set_max_row_group_size(40) + .set_created_by("default".to_owned()) + // global column settings + .set_encoding(Encoding::DELTA_BINARY_PACKED) + .set_compression(Compression::GZIP) + .set_dictionary_enabled(false) + .set_statistics_enabled(false) + .set_max_statistics_size(50) + // specific column settings + .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) + .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY) + .set_column_dictionary_enabled(ColumnPath::from("col"), true) + .set_column_statistics_enabled(ColumnPath::from("col"), true) + .set_column_max_statistics_size(ColumnPath::from("col"), 123) + .build(); + + assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0); + assert_eq!(props.data_pagesize_limit(), 10); + assert_eq!(props.dictionary_pagesize_limit(), 20); + assert_eq!(props.write_batch_size(), 30); + assert_eq!(props.max_row_group_size(), 40); + assert_eq!(props.created_by(), "default"); + + assert_eq!( + props.encoding(&ColumnPath::from("a")), + Some(Encoding::DELTA_BINARY_PACKED) + ); + assert_eq!(props.compression(&ColumnPath::from("a")), Compression::GZIP); + assert_eq!(props.dictionary_enabled(&ColumnPath::from("a")), false); + assert_eq!(props.statistics_enabled(&ColumnPath::from("a")), false); + assert_eq!(props.max_statistics_size(&ColumnPath::from("a")), 50); + + assert_eq!( + props.encoding(&ColumnPath::from("col")), + Some(Encoding::RLE) + ); + assert_eq!( + props.compression(&ColumnPath::from("col")), + Compression::SNAPPY + ); + assert_eq!(props.dictionary_enabled(&ColumnPath::from("col")), true); + assert_eq!(props.statistics_enabled(&ColumnPath::from("col")), true); + assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123); + } + + #[test] + fn test_writer_properties_builder_partial_defaults() { + let props = WriterProperties::builder() + .set_encoding(Encoding::DELTA_BINARY_PACKED) + .set_compression(Compression::GZIP) + .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) + .build(); + + assert_eq!( + props.encoding(&ColumnPath::from("col")), + Some(Encoding::RLE) + ); + assert_eq!( + props.compression(&ColumnPath::from("col")), + Compression::GZIP + ); + assert_eq!( + props.dictionary_enabled(&ColumnPath::from("col")), + DEFAULT_DICTIONARY_ENABLED + ); + } +} diff --git a/rust/src/parquet/file/reader.rs b/rust/src/parquet/file/reader.rs new file mode 100644 index 0000000000000..c2e5dd176dac5 --- /dev/null +++ b/rust/src/parquet/file/reader.rs @@ -0,0 +1,899 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains file reader API and provides methods to access file metadata, row group +//! readers to read individual column chunks, or access record iterator. + +use std::{ + convert::TryFrom, + fs::File, + io::{BufReader, Cursor, Read, Seek, SeekFrom}, + path::Path, + rc::Rc, +}; + +use byteorder::{ByteOrder, LittleEndian}; +use parquet_format::{ + ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData, PageHeader, PageType, +}; +use thrift::protocol::TCompactInputProtocol; + +use crate::parquet::basic::{ColumnOrder, Compression, Encoding, Type}; +use crate::parquet::column::{ + page::{Page, PageReader}, + reader::{ColumnReader, ColumnReaderImpl}, +}; +use crate::parquet::compression::{create_codec, Codec}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::{metadata::*, statistics, FOOTER_SIZE, PARQUET_MAGIC}; +use crate::parquet::record::reader::RowIter; +use crate::parquet::schema::types::{self, SchemaDescriptor, Type as SchemaType}; +use crate::parquet::util::{io::FileSource, memory::ByteBufferPtr}; + +// ---------------------------------------------------------------------- +// APIs for file & row group readers + +/// Parquet file reader API. With this, user can get metadata information about the +/// Parquet file, can get reader for each row group, and access record iterator. +pub trait FileReader { + /// Get metadata information about this file. + fn metadata(&self) -> ParquetMetaDataPtr; + + /// Get the total number of row groups for this file. + fn num_row_groups(&self) -> usize; + + /// Get the `i`th row group reader. Note this doesn't do bound check. + fn get_row_group(&self, i: usize) -> Result>; + + /// Get full iterator of `Row`s from a file (over all row groups). + /// + /// Iterator will automatically load the next row group to advance. + /// + /// Projected schema can be a subset of or equal to the file schema, when it is None, + /// full file schema is assumed. + fn get_row_iter(&self, projection: Option) -> Result; +} + +/// Parquet row group reader API. With this, user can get metadata information about the +/// row group, as well as readers for each individual column chunk. +pub trait RowGroupReader { + /// Get metadata information about this row group. + fn metadata(&self) -> RowGroupMetaDataPtr; + + /// Get the total number of column chunks in this row group. + fn num_columns(&self) -> usize; + + /// Get page reader for the `i`th column chunk. + fn get_column_page_reader(&self, i: usize) -> Result>; + + /// Get value reader for the `i`th column chunk. + fn get_column_reader(&self, i: usize) -> Result; + + /// Get iterator of `Row`s from this row group. + /// + /// Projected schema can be a subset of or equal to the file schema, when it is None, + /// full file schema is assumed. 
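A minimal end-to-end sketch of the two traits above (not part of the patch): open a file, look at the metadata, and iterate fully assembled records across all row groups via `get_row_iter`. It assumes a local `data.parquet` and relies on the `Display` impl on `Row` from the record module.

```rust
use std::fs::File;
use std::path::Path;

use arrow::parquet::file::reader::{FileReader, SerializedFileReader};

fn main() {
    let path = Path::new("data.parquet");
    let file = File::open(&path).unwrap();
    let reader = SerializedFileReader::new(file).unwrap();

    // File-level view: metadata plus one reader per row group.
    println!("row groups: {}", reader.num_row_groups());

    // Record-level view: rows are assembled across all row groups; passing
    // None uses the full file schema as the projection.
    let mut iter = reader.get_row_iter(None).unwrap();
    while let Some(record) = iter.next() {
        println!("{}", record);
    }
}
```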
+ fn get_row_iter(&self, projection: Option) -> Result; +} + +// ---------------------------------------------------------------------- +// Serialized impl for file & row group readers + +/// Length should return the amount of bytes that implementor contains. +/// It's mainly used to read the metadata, which is at the end of the source. +pub trait Length { + /// Returns the amount of bytes of the inner source. + fn len(&self) -> u64; +} + +/// TryClone tries to clone the type and should maintain the `Seek` position of the given +/// instance. +pub trait TryClone: Sized { + /// Clones the type returning a new instance or an error if it's not possible + /// to clone it. + fn try_clone(&self) -> Result; +} + +impl Length for File { + fn len(&self) -> u64 { + self.metadata().map(|m| m.len()).unwrap_or(0u64) + } +} + +impl TryClone for File { + fn try_clone(&self) -> Result { + self.try_clone().map_err(|e| e.into()) + } +} + +impl<'a> Length for Cursor<&'a [u8]> { + fn len(&self) -> u64 { + self.get_ref().len() as u64 + } +} + +impl<'a> TryClone for Cursor<&'a [u8]> { + fn try_clone(&self) -> Result { + Ok(self.clone()) + } +} + +/// ParquetReader is the interface which needs to be fulfilled to be able to parse a +/// parquet source. +pub trait ParquetReader: Read + Seek + Length + TryClone {} +impl ParquetReader for T {} + +/// A serialized implementation for Parquet [`FileReader`]. +pub struct SerializedFileReader { + buf: BufReader, + metadata: ParquetMetaDataPtr, +} + +impl SerializedFileReader { + /// Creates file reader from a Parquet file. + /// Returns error if Parquet file does not exist or is corrupt. + pub fn new(reader: R) -> Result { + let mut buf = BufReader::new(reader); + let metadata = Self::parse_metadata(&mut buf)?; + Ok(Self { + buf, + metadata: Rc::new(metadata), + }) + } + + // Layout of Parquet file + // +---------------------------+---+-----+ + // | Rest of file | B | A | + // +---------------------------+---+-----+ + // where A: parquet footer, B: parquet metadata. + // + fn parse_metadata(buf: &mut BufReader) -> Result { + let file_size = buf.get_ref().len(); + if file_size < (FOOTER_SIZE as u64) { + return Err(general_err!( + "Invalid Parquet file. Size is smaller than footer" + )); + } + let mut footer_buffer: [u8; FOOTER_SIZE] = [0; FOOTER_SIZE]; + buf.seek(SeekFrom::End(-(FOOTER_SIZE as i64)))?; + buf.read_exact(&mut footer_buffer)?; + if footer_buffer[4..] != PARQUET_MAGIC { + return Err(general_err!("Invalid Parquet file. Corrupt footer")); + } + let metadata_len = LittleEndian::read_i32(&footer_buffer[0..4]) as i64; + if metadata_len < 0 { + return Err(general_err!( + "Invalid Parquet file. Metadata length is less than zero ({})", + metadata_len + )); + } + let metadata_start: i64 = file_size as i64 - FOOTER_SIZE as i64 - metadata_len; + if metadata_start < 0 { + return Err(general_err!( + "Invalid Parquet file. 
Metadata start is less than zero ({})", + metadata_start + )); + } + buf.seek(SeekFrom::Start(metadata_start as u64))?; + let metadata_buf = buf.take(metadata_len as u64).into_inner(); + + // TODO: row group filtering + let mut prot = TCompactInputProtocol::new(metadata_buf); + let mut t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) + .map_err(|e| ParquetError::General(format!("Could not parse metadata: {}", e)))?; + let schema = types::from_thrift(&mut t_file_metadata.schema)?; + let schema_descr = Rc::new(SchemaDescriptor::new(schema.clone())); + let mut row_groups = Vec::new(); + for rg in t_file_metadata.row_groups { + row_groups.push(Rc::new(RowGroupMetaData::from_thrift( + schema_descr.clone(), + rg, + )?)); + } + let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr); + + let file_metadata = FileMetaData::new( + t_file_metadata.version, + t_file_metadata.num_rows, + t_file_metadata.created_by, + schema, + schema_descr, + column_orders, + ); + Ok(ParquetMetaData::new(file_metadata, row_groups)) + } + + /// Parses column orders from Thrift definition. + /// If no column orders are defined, returns `None`. + fn parse_column_orders( + t_column_orders: Option>, + schema_descr: &SchemaDescriptor, + ) -> Option> { + match t_column_orders { + Some(orders) => { + // Should always be the case + assert_eq!( + orders.len(), + schema_descr.num_columns(), + "Column order length mismatch" + ); + let mut res = Vec::new(); + for (i, column) in schema_descr.columns().iter().enumerate() { + match orders[i] { + TColumnOrder::TYPEORDER(_) => { + let sort_order = ColumnOrder::get_sort_order( + column.logical_type(), + column.physical_type(), + ); + res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); + } + } + } + Some(res) + } + None => None, + } + } +} + +impl FileReader for SerializedFileReader { + fn metadata(&self) -> ParquetMetaDataPtr { + self.metadata.clone() + } + + fn num_row_groups(&self) -> usize { + self.metadata.num_row_groups() + } + + fn get_row_group(&self, i: usize) -> Result> { + let row_group_metadata = self.metadata.row_group(i); + // Row groups should be processed sequentially. + let f = self.buf.get_ref().try_clone()?; + Ok(Box::new(SerializedRowGroupReader::new( + f, + row_group_metadata, + ))) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_file(projection, self) + } +} + +impl TryFrom for SerializedFileReader { + type Error = ParquetError; + + fn try_from(file: File) -> Result { + Self::new(file) + } +} + +impl<'a> TryFrom<&'a Path> for SerializedFileReader { + type Error = ParquetError; + + fn try_from(path: &Path) -> Result { + let file = File::open(path)?; + Self::try_from(file) + } +} + +impl TryFrom for SerializedFileReader { + type Error = ParquetError; + + fn try_from(path: String) -> Result { + Self::try_from(Path::new(&path)) + } +} + +impl<'a> TryFrom<&'a str> for SerializedFileReader { + type Error = ParquetError; + + fn try_from(path: &str) -> Result { + Self::try_from(Path::new(&path)) + } +} + +/// A serialized implementation for Parquet [`RowGroupReader`]. +pub struct SerializedRowGroupReader { + buf: BufReader, + metadata: RowGroupMetaDataPtr, +} + +impl SerializedRowGroupReader { + /// Creates new row group reader from a file and row group metadata. 
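The `TryFrom` impls above make the reader constructible straight from a `File`, `&Path`, `String`, or `&str`; a brief sketch (the path is a placeholder):

```rust
use std::convert::TryFrom;

use arrow::parquet::file::reader::{FileReader, SerializedFileReader};

fn main() {
    // Construct directly from a path string; any failure surfaces as a
    // ParquetError rather than a panic.
    match SerializedFileReader::try_from("data.parquet") {
        Ok(reader) => println!("{} row group(s)", reader.num_row_groups()),
        Err(e) => println!("could not open file: {}", e),
    }
}
```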
+ fn new(file: R, metadata: RowGroupMetaDataPtr) -> Self { + let buf = BufReader::new(file); + Self { buf, metadata } + } +} + +impl RowGroupReader for SerializedRowGroupReader { + fn metadata(&self) -> RowGroupMetaDataPtr { + self.metadata.clone() + } + + fn num_columns(&self) -> usize { + self.metadata.num_columns() + } + + // TODO: fix PARQUET-816 + fn get_column_page_reader(&self, i: usize) -> Result> { + let col = self.metadata.column(i); + let mut col_start = col.data_page_offset(); + if col.has_dictionary_page() { + col_start = col.dictionary_page_offset().unwrap(); + } + let col_length = col.compressed_size(); + let file_chunk = FileSource::new(self.buf.get_ref(), col_start as u64, col_length as usize); + let page_reader = SerializedPageReader::new( + file_chunk, + col.num_values(), + col.compression(), + col.column_descr().physical_type(), + )?; + Ok(Box::new(page_reader)) + } + + fn get_column_reader(&self, i: usize) -> Result { + let schema_descr = self.metadata.schema_descr(); + let col_descr = schema_descr.column(i); + let col_page_reader = self.get_column_page_reader(i)?; + let col_reader = match col_descr.physical_type() { + Type::BOOLEAN => { + ColumnReader::BoolColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT32 => { + ColumnReader::Int32ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT64 => { + ColumnReader::Int64ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT96 => { + ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::FLOAT => { + ColumnReader::FloatColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::DOUBLE => { + ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::BYTE_ARRAY => ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new( + col_descr, + col_page_reader, + )), + Type::FIXED_LEN_BYTE_ARRAY => ColumnReader::FixedLenByteArrayColumnReader( + ColumnReaderImpl::new(col_descr, col_page_reader), + ), + }; + Ok(col_reader) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_row_group(projection, self) + } +} + +/// A serialized implementation for Parquet [`PageReader`]. +pub struct SerializedPageReader { + // The file source buffer which references exactly the bytes for the column trunk + // to be read by this page reader. + buf: T, + + // The compression codec for this column chunk. Only set for non-PLAIN codec. + decompressor: Option>, + + // The number of values we have seen so far. + seen_num_values: i64, + + // The number of total values in this column chunk. + total_num_values: i64, + + // Column chunk type. + physical_type: Type, +} + +impl SerializedPageReader { + /// Creates a new serialized page reader from file source. + pub fn new( + buf: T, + total_num_values: i64, + compression: Compression, + physical_type: Type, + ) -> Result { + let decompressor = create_codec(compression)?; + let result = Self { + buf, + total_num_values, + seen_num_values: 0, + decompressor, + physical_type, + }; + Ok(result) + } + + /// Reads Page header from Thrift. 
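For illustration (not part of the patch), the loop below drives the page reader for every column chunk of a row group, the same way the tests at the end of this file do; it assumes a boxed `RowGroupReader` obtained from `FileReader::get_row_group`.

```rust
use arrow::parquet::column::page::{Page, PageReader};
use arrow::parquet::file::reader::RowGroupReader;

fn count_pages(row_group: Box<dyn RowGroupReader>) -> usize {
    let mut pages = 0;
    for i in 0..row_group.num_columns() {
        let mut page_reader = row_group.get_column_page_reader(i).unwrap();
        // get_next_page returns Ok(None) once the column chunk is exhausted.
        while let Some(page) = page_reader.get_next_page().unwrap() {
            match page {
                Page::DictionaryPage { num_values, .. } => {
                    println!("column {}: dictionary page, {} values", i, num_values)
                }
                Page::DataPage { num_values, .. }
                | Page::DataPageV2 { num_values, .. } => {
                    println!("column {}: data page, {} values", i, num_values)
                }
            }
            pages += 1;
        }
    }
    pages
}
```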
+ fn read_page_header(&mut self) -> Result { + let mut prot = TCompactInputProtocol::new(&mut self.buf); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + Ok(page_header) + } +} + +impl PageReader for SerializedPageReader { + fn get_next_page(&mut self) -> Result> { + while self.seen_num_values < self.total_num_values { + let page_header = self.read_page_header()?; + + // When processing data page v2, depending on enabled compression for the page, we + // should account for uncompressed data ('offset') of repetition and definition + // levels. + // + // We always use 0 offset for other pages other than v2, `true` flag means that + // compression will be applied if decompressor is defined + let mut offset: usize = 0; + let mut can_decompress = true; + + if let Some(ref header_v2) = page_header.data_page_header_v2 { + offset = (header_v2.definition_levels_byte_length + + header_v2.repetition_levels_byte_length) as usize; + // When is_compressed flag is missing the page is considered compressed + can_decompress = header_v2.is_compressed.unwrap_or(true); + } + + let compressed_len = page_header.compressed_page_size as usize - offset; + let uncompressed_len = page_header.uncompressed_page_size as usize - offset; + // We still need to read all bytes from buffered stream + let mut buffer = vec![0; offset + compressed_len]; + self.buf.read_exact(&mut buffer)?; + + // TODO: page header could be huge because of statistics. We should set a maximum + // page header size and abort if that is exceeded. + if let Some(decompressor) = self.decompressor.as_mut() { + if can_decompress { + let mut decompressed_buffer = Vec::with_capacity(uncompressed_len); + let decompressed_size = + decompressor.decompress(&buffer[offset..], &mut decompressed_buffer)?; + if decompressed_size != uncompressed_len { + return Err(general_err!( + "Actual decompressed size doesn't match the expected one ({} vs {})", + decompressed_size, + uncompressed_len + )); + } + if offset == 0 { + buffer = decompressed_buffer; + } else { + // Prepend saved offsets to the buffer + buffer.truncate(offset); + buffer.append(&mut decompressed_buffer); + } + } + } + + let result = match page_header.type_ { + PageType::DICTIONARY_PAGE => { + assert!(page_header.dictionary_page_header.is_some()); + let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); + let is_sorted = dict_header.is_sorted.unwrap_or(false); + Page::DictionaryPage { + buf: ByteBufferPtr::new(buffer), + num_values: dict_header.num_values as u32, + encoding: Encoding::from(dict_header.encoding), + is_sorted, + } + } + PageType::DATA_PAGE => { + assert!(page_header.data_page_header.is_some()); + let header = page_header.data_page_header.unwrap(); + self.seen_num_values += header.num_values as i64; + Page::DataPage { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + def_level_encoding: Encoding::from(header.definition_level_encoding), + rep_level_encoding: Encoding::from(header.repetition_level_encoding), + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + PageType::DATA_PAGE_V2 => { + assert!(page_header.data_page_header_v2.is_some()); + let header = page_header.data_page_header_v2.unwrap(); + let is_compressed = header.is_compressed.unwrap_or(true); + self.seen_num_values += header.num_values as i64; + Page::DataPageV2 { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + 
num_nulls: header.num_nulls as u32, + num_rows: header.num_rows as u32, + def_levels_byte_len: header.definition_levels_byte_length as u32, + rep_levels_byte_len: header.repetition_levels_byte_length as u32, + is_compressed, + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + _ => { + // For unknown page type (e.g., INDEX_PAGE), skip and read next. + continue; + } + }; + return Ok(Some(result)); + } + + // We are at the end of this column chunk and no more page left. Return None. + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use parquet_format::TypeDefinedOrder; + + use crate::parquet::basic::SortOrder; + use crate::parquet::util::test_common::{get_temp_file, get_test_file, get_test_path}; + + #[test] + fn test_file_reader_metadata_size_smaller_than_footer() { + let test_file = get_temp_file("corrupt-1.parquet", &[]); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_err()); + assert_eq!( + reader_result.err().unwrap(), + general_err!("Invalid Parquet file. Size is smaller than footer") + ); + } + + // #[test] + // fn test_cursor_and_file_has_the_same_behaviour() { + // let path = get_test_path("alltypes_plain.parquet"); + // let buffer = include_bytes!(path); + // let cursor = Cursor::new(buffer.as_ref()); + + // let read_from_file = + // SerializedFileReader::new(File::open("testdata/alltypes_plain.parquet").unwrap()) + // .unwrap(); + // let read_from_cursor = SerializedFileReader::new(cursor).unwrap(); + + // let file_iter = read_from_file.get_row_iter(None).unwrap(); + // let cursor_iter = read_from_cursor.get_row_iter(None).unwrap(); + + // assert!(file_iter.eq(cursor_iter)); + // } + + #[test] + fn test_file_reader_metadata_corrupt_footer() { + let test_file = get_temp_file("corrupt-2.parquet", &[1, 2, 3, 4, 5, 6, 7, 8]); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_err()); + assert_eq!( + reader_result.err().unwrap(), + general_err!("Invalid Parquet file. Corrupt footer") + ); + } + + #[test] + fn test_file_reader_metadata_invalid_length() { + let test_file = get_temp_file("corrupt-3.parquet", &[0, 0, 0, 255, b'P', b'A', b'R', b'1']); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_err()); + assert_eq!( + reader_result.err().unwrap(), + general_err!("Invalid Parquet file. Metadata length is less than zero (-16777216)") + ); + } + + #[test] + fn test_file_reader_metadata_invalid_start() { + let test_file = get_temp_file("corrupt-4.parquet", &[255, 0, 0, 0, b'P', b'A', b'R', b'1']); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_err()); + assert_eq!( + reader_result.err().unwrap(), + general_err!("Invalid Parquet file. Metadata start is less than zero (-255)") + ); + } + + #[test] + fn test_file_reader_column_orders_parse() { + // Define simple schema, we do not need to provide logical types. 
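A small worked check (editorial illustration) of the footer layout that the corrupt-file tests above rely on: the last 8 bytes of a Parquet file are a 4-byte little-endian metadata length followed by the `PAR1` magic, so the byte pattern used in `test_file_reader_metadata_invalid_length` decodes to the negative length quoted in the expected error. It uses the same `byteorder` calls as `parse_metadata`.

```rust
use byteorder::{ByteOrder, LittleEndian};

fn main() {
    // Footer bytes from test_file_reader_metadata_invalid_length:
    // [metadata length (LE i32)] [magic "PAR1"]
    let footer = [0u8, 0, 0, 255, b'P', b'A', b'R', b'1'];
    assert_eq!(&footer[4..], &b"PAR1"[..]);

    // 0xFF00_0000 read as a signed 32-bit integer is negative, which
    // parse_metadata rejects: "Metadata length is less than zero (-16777216)".
    let metadata_len = LittleEndian::read_i32(&footer[0..4]);
    assert_eq!(metadata_len, -16_777_216);
}
```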
+ let mut fields = vec![ + Rc::new( + SchemaType::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + ), + Rc::new( + SchemaType::primitive_type_builder("col2", Type::FLOAT) + .build() + .unwrap(), + ), + ]; + let schema = SchemaType::group_type_builder("schema") + .with_fields(&mut fields) + .build() + .unwrap(); + let schema_descr = SchemaDescriptor::new(Rc::new(schema)); + + let t_column_orders = Some(vec![ + TColumnOrder::TYPEORDER(TypeDefinedOrder::new()), + TColumnOrder::TYPEORDER(TypeDefinedOrder::new()), + ]); + + assert_eq!( + SerializedFileReader::::parse_column_orders(t_column_orders, &schema_descr), + Some(vec![ + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) + ]) + ); + + // Test when no column orders are defined. + assert_eq!( + SerializedFileReader::::parse_column_orders(None, &schema_descr), + None + ); + } + + #[test] + #[should_panic(expected = "Column order length mismatch")] + fn test_file_reader_column_orders_len_mismatch() { + let schema = SchemaType::group_type_builder("schema").build().unwrap(); + let schema_descr = SchemaDescriptor::new(Rc::new(schema)); + + let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); + + SerializedFileReader::::parse_column_orders(t_column_orders, &schema_descr); + } + + #[test] + fn test_file_reader_try_from() { + // Valid file path + let test_file = get_test_file("alltypes_plain.parquet"); + let test_path_buf = get_test_path("alltypes_plain.parquet"); + let test_path = test_path_buf.as_path(); + let test_path_str = test_path.to_str().unwrap(); + + let reader = SerializedFileReader::try_from(test_file); + assert!(reader.is_ok()); + + let reader = SerializedFileReader::try_from(test_path); + assert!(reader.is_ok()); + + let reader = SerializedFileReader::try_from(test_path_str); + assert!(reader.is_ok()); + + let reader = SerializedFileReader::try_from(test_path_str.to_string()); + assert!(reader.is_ok()); + + // Invalid file path + let test_path = Path::new("invalid.parquet"); + let test_path_str = test_path.to_str().unwrap(); + + let reader = SerializedFileReader::try_from(test_path); + assert!(reader.is_err()); + + let reader = SerializedFileReader::try_from(test_path_str); + assert!(reader.is_err()); + + let reader = SerializedFileReader::try_from(test_path_str.to_string()); + assert!(reader.is_err()); + } + + #[test] + fn test_reuse_file_chunk() { + // This test covers the case of maintaining the correct start position in a file + // stream for each column reader after initializing and moving to the next one + // (without necessarily reading the entire column). 
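Because `Length` and `TryClone` are implemented for `Cursor<&[u8]>` earlier in this file, in-memory buffers can back the reader as well; a hedged sketch of that usage (the commented-out cursor test above hints at the same idea, and the file name is a placeholder):

```rust
use std::io::Cursor;

use arrow::parquet::file::reader::{FileReader, SerializedFileReader};

fn main() {
    // Any type implementing Read + Seek + Length + TryClone can back the
    // reader, so a byte slice wrapped in a Cursor works like a file.
    let bytes = std::fs::read("data.parquet").unwrap();
    let cursor = Cursor::new(bytes.as_slice());
    let reader = SerializedFileReader::new(cursor).unwrap();
    println!("{} row group(s)", reader.metadata().num_row_groups());
}
```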
+ let test_file = get_test_file("alltypes_plain.parquet"); + let reader = SerializedFileReader::new(test_file).unwrap(); + let row_group = reader.get_row_group(0).unwrap(); + + let mut page_readers = Vec::new(); + for i in 0..row_group.num_columns() { + page_readers.push(row_group.get_column_page_reader(i).unwrap()); + } + + // Now buffer each col reader, we do not expect any failures like: + // General("underlying Thrift error: end of file") + for mut page_reader in page_readers { + assert!(page_reader.get_next_page().is_ok()); + } + } + + #[test] + fn test_file_reader() { + let test_file = get_test_file("alltypes_plain.parquet"); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_ok()); + let reader = reader_result.unwrap(); + + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)" + ); + assert_eq!(file_metadata.num_rows(), 8); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + // Test contents in row group metadata + let row_group_metadata = metadata.row_group(0); + assert_eq!(row_group_metadata.num_columns(), 11); + assert_eq!(row_group_metadata.num_rows(), 8); + assert_eq!(row_group_metadata.total_byte_size(), 671); + // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 32); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert_eq!(is_sorted, false); + true + } + Page::DataPage { + buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics, + } => { + assert_eq!(buf.len(), 11); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert_eq!(def_level_encoding, Encoding::RLE); + assert_eq!(rep_level_encoding, Encoding::BIT_PACKED); + assert!(statistics.is_none()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } + + #[test] + fn test_file_reader_datapage_v2() { + let test_file = get_test_file("datapage_v2.snappy.parquet"); + let reader_result = SerializedFileReader::new(test_file); + assert!(reader_result.is_ok()); + let reader = reader_result.unwrap(); + + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // 
Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)" + ); + assert_eq!(file_metadata.num_rows(), 5); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + let row_group_metadata = metadata.row_group(0); + + // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 7); + assert_eq!(num_values, 1); + assert_eq!(encoding, Encoding::PLAIN); + assert_eq!(is_sorted, false); + true + } + Page::DataPageV2 { + buf, + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed, + statistics, + } => { + assert_eq!(buf.len(), 4); + assert_eq!(num_values, 5); + assert_eq!(encoding, Encoding::RLE_DICTIONARY); + assert_eq!(num_nulls, 1); + assert_eq!(num_rows, 5); + assert_eq!(def_levels_byte_len, 2); + assert_eq!(rep_levels_byte_len, 0); + assert_eq!(is_compressed, true); + assert!(statistics.is_some()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } +} diff --git a/rust/src/parquet/file/statistics.rs b/rust/src/parquet/file/statistics.rs new file mode 100644 index 0000000000000..ff4d731857f16 --- /dev/null +++ b/rust/src/parquet/file/statistics.rs @@ -0,0 +1,692 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains definitions for working with Parquet statistics. +//! +//! Though some common methods are available on enum, use pattern match to extract +//! actual min and max values from statistics, see below: +//! +//! ```rust +//! use arrow::parquet::file::statistics::Statistics; +//! +//! 
let stats = Statistics::int32(Some(1), Some(10), None, 3, true); +//! assert_eq!(stats.null_count(), 3); +//! assert!(stats.has_min_max_set()); +//! assert!(stats.is_min_max_deprecated()); +//! +//! match stats { +//! Statistics::Int32(ref typed) => { +//! assert_eq!(*typed.min(), 1); +//! assert_eq!(*typed.max(), 10); +//! } +//! _ => {} +//! } +//! ``` + +use std::{cmp, fmt}; + +use byteorder::{ByteOrder, LittleEndian}; +use parquet_format::Statistics as TStatistics; + +use crate::parquet::basic::Type; +use crate::parquet::data_type::*; + +// Macro to generate methods create Statistics. +macro_rules! statistics_new_func { + ($func:ident, $vtype:ty, $stat:ident) => { + pub fn $func( + min: $vtype, + max: $vtype, + distinct: Option, + nulls: u64, + is_deprecated: bool, + ) -> Self { + Statistics::$stat(TypedStatistics::new( + min, + max, + distinct, + nulls, + is_deprecated, + )) + } + }; +} + +// Macro to generate getter functions for Statistics. +macro_rules! statistics_enum_func { + ($self:ident, $func:ident) => {{ + match *$self { + Statistics::Boolean(ref typed) => typed.$func(), + Statistics::Int32(ref typed) => typed.$func(), + Statistics::Int64(ref typed) => typed.$func(), + Statistics::Int96(ref typed) => typed.$func(), + Statistics::Float(ref typed) => typed.$func(), + Statistics::Double(ref typed) => typed.$func(), + Statistics::ByteArray(ref typed) => typed.$func(), + Statistics::FixedLenByteArray(ref typed) => typed.$func(), + } + }}; +} + +/// Converts Thrift definition into `Statistics`. +pub fn from_thrift(physical_type: Type, thrift_stats: Option) -> Option { + match thrift_stats { + Some(stats) => { + // Number of nulls recorded, when it is not available, we just mark it as 0. + let null_count = stats.null_count.unwrap_or(0); + assert!( + null_count >= 0, + "Statistics null count is negative ({})", + null_count + ); + + // Generic null count. + let null_count = null_count as u64; + // Generic distinct count (count of distinct values occurring) + let distinct_count = stats.distinct_count.map(|value| value as u64); + // Whether or not statistics use deprecated min/max fields. + let old_format = stats.min_value.is_none() && stats.max_value.is_none(); + // Generic min value as bytes. + let min = if old_format { + stats.min + } else { + stats.min_value + }; + // Generic max value as bytes. + let max = if old_format { + stats.max + } else { + stats.max_value + }; + + // Values are encoded using PLAIN encoding definition, except that + // variable-length byte arrays do not include a length prefix. + // + // Instead of using actual decoder, we manually convert values. + let res = match physical_type { + Type::BOOLEAN => Statistics::boolean( + min.map(|data| data[0] != 0), + max.map(|data| data[0] != 0), + distinct_count, + null_count, + old_format, + ), + Type::INT32 => Statistics::int32( + min.map(|data| LittleEndian::read_i32(&data)), + max.map(|data| LittleEndian::read_i32(&data)), + distinct_count, + null_count, + old_format, + ), + Type::INT64 => Statistics::int64( + min.map(|data| LittleEndian::read_i64(&data)), + max.map(|data| LittleEndian::read_i64(&data)), + distinct_count, + null_count, + old_format, + ), + Type::INT96 => { + // INT96 statistics may not be correct, because comparison is signed + // byte-wise, not actual timestamps. It is recommended to ignore min/max + // statistics for INT96 columns. 
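+                // Each INT96 statistic is a 12-byte value; the unsafe blocks below
+                // reinterpret those bytes in place as three u32 words (host byte
+                // order) before wrapping them in `Int96`.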
+ let min = min.map(|data| { + assert_eq!(data.len(), 12); + unsafe { + let raw = ::std::slice::from_raw_parts(data.as_ptr() as *mut u32, 3); + Int96::from(Vec::from(raw)) + } + }); + let max = max.map(|data| { + assert_eq!(data.len(), 12); + unsafe { + let raw = ::std::slice::from_raw_parts(data.as_ptr() as *mut u32, 3); + Int96::from(Vec::from(raw)) + } + }); + Statistics::int96(min, max, distinct_count, null_count, old_format) + } + Type::FLOAT => Statistics::float( + min.map(|data| LittleEndian::read_f32(&data)), + max.map(|data| LittleEndian::read_f32(&data)), + distinct_count, + null_count, + old_format, + ), + Type::DOUBLE => Statistics::double( + min.map(|data| LittleEndian::read_f64(&data)), + max.map(|data| LittleEndian::read_f64(&data)), + distinct_count, + null_count, + old_format, + ), + Type::BYTE_ARRAY => Statistics::byte_array( + min.map(|data| ByteArray::from(data)), + max.map(|data| ByteArray::from(data)), + distinct_count, + null_count, + old_format, + ), + Type::FIXED_LEN_BYTE_ARRAY => Statistics::fixed_len_byte_array( + min.map(|data| ByteArray::from(data)), + max.map(|data| ByteArray::from(data)), + distinct_count, + null_count, + old_format, + ), + }; + + Some(res) + } + None => None, + } +} + +// Convert Statistics into Thrift definition. +pub fn to_thrift(stats: Option<&Statistics>) -> Option { + if stats.is_none() { + return None; + } + + let stats = stats.unwrap(); + + let mut thrift_stats = TStatistics { + max: None, + min: None, + null_count: if stats.has_nulls() { + Some(stats.null_count() as i64) + } else { + None + }, + distinct_count: stats.distinct_count().map(|value| value as i64), + max_value: None, + min_value: None, + }; + + // Get min/max if set. + let (min, max) = if stats.has_min_max_set() { + ( + Some(stats.min_bytes().to_vec()), + Some(stats.max_bytes().to_vec()), + ) + } else { + (None, None) + }; + + if stats.is_min_max_deprecated() { + thrift_stats.min = min; + thrift_stats.max = max; + } else { + thrift_stats.min_value = min; + thrift_stats.max_value = max; + } + + Some(thrift_stats) +} + +/// Statistics for a column chunk and data page. +#[derive(Debug, PartialEq)] +pub enum Statistics { + Boolean(TypedStatistics), + Int32(TypedStatistics), + Int64(TypedStatistics), + Int96(TypedStatistics), + Float(TypedStatistics), + Double(TypedStatistics), + ByteArray(TypedStatistics), + FixedLenByteArray(TypedStatistics), +} + +impl Statistics { + statistics_new_func![boolean, Option, Boolean]; + + statistics_new_func![int32, Option, Int32]; + + statistics_new_func![int64, Option, Int64]; + + statistics_new_func![int96, Option, Int96]; + + statistics_new_func![float, Option, Float]; + + statistics_new_func![double, Option, Double]; + + statistics_new_func![byte_array, Option, ByteArray]; + + statistics_new_func![fixed_len_byte_array, Option, FixedLenByteArray]; + + /// Returns `true` if statistics have old `min` and `max` fields set. + /// This means that the column order is likely to be undefined, which, for old files + /// could mean a signed sort order of values. + /// + /// Refer to [`ColumnOrder`](`::basic::ColumnOrder`) and + /// [`SortOrder`](`::basic::SortOrder`) for more information. + pub fn is_min_max_deprecated(&self) -> bool { + statistics_enum_func![self, is_min_max_deprecated] + } + + /// Returns optional value of number of distinct values occurring. + /// When it is `None`, the value should be ignored. 
+ pub fn distinct_count(&self) -> Option { + statistics_enum_func![self, distinct_count] + } + + /// Returns number of null values for the column. + /// Note that this includes all nulls when column is part of the complex type. + pub fn null_count(&self) -> u64 { + statistics_enum_func![self, null_count] + } + + /// Returns `true` if statistics collected any null values, `false` otherwise. + pub fn has_nulls(&self) -> bool { + self.null_count() > 0 + } + + /// Returns `true` if min value and max value are set. + /// Normally both min/max values will be set to `Some(value)` or `None`. + pub fn has_min_max_set(&self) -> bool { + statistics_enum_func![self, has_min_max_set] + } + + /// Returns slice of bytes that represent min value. + /// Panics if min value is not set. + pub fn min_bytes(&self) -> &[u8] { + statistics_enum_func![self, min_bytes] + } + + /// Returns slice of bytes that represent max value. + /// Panics if max value is not set. + pub fn max_bytes(&self) -> &[u8] { + statistics_enum_func![self, max_bytes] + } + + /// Returns physical type associated with statistics. + pub fn physical_type(&self) -> Type { + match self { + Statistics::Boolean(_) => Type::BOOLEAN, + Statistics::Int32(_) => Type::INT32, + Statistics::Int64(_) => Type::INT64, + Statistics::Int96(_) => Type::INT96, + Statistics::Float(_) => Type::FLOAT, + Statistics::Double(_) => Type::DOUBLE, + Statistics::ByteArray(_) => Type::BYTE_ARRAY, + Statistics::FixedLenByteArray(_) => Type::FIXED_LEN_BYTE_ARRAY, + } + } +} + +impl fmt::Display for Statistics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Statistics::Boolean(typed) => write!(f, "{}", typed), + Statistics::Int32(typed) => write!(f, "{}", typed), + Statistics::Int64(typed) => write!(f, "{}", typed), + Statistics::Int96(typed) => write!(f, "{}", typed), + Statistics::Float(typed) => write!(f, "{}", typed), + Statistics::Double(typed) => write!(f, "{}", typed), + Statistics::ByteArray(typed) => write!(f, "{}", typed), + Statistics::FixedLenByteArray(typed) => write!(f, "{}", typed), + } + } +} + +/// Typed implementation for [`Statistics`]. +pub struct TypedStatistics { + min: Option, + max: Option, + // Distinct count could be omitted in some cases + distinct_count: Option, + null_count: u64, + is_min_max_deprecated: bool, +} + +impl TypedStatistics { + /// Creates new typed statistics. + pub fn new( + min: Option, + max: Option, + distinct_count: Option, + null_count: u64, + is_min_max_deprecated: bool, + ) -> Self { + Self { + min, + max, + distinct_count, + null_count, + is_min_max_deprecated, + } + } + + /// Returns min value of the statistics. + /// + /// Panics if min value is not set, e.g. all values are `null`. + /// Use `has_min_max_set` method to check that. + pub fn min(&self) -> &T::T { + self.min.as_ref().unwrap() + } + + /// Returns max value of the statistics. + /// + /// Panics if max value is not set, e.g. all values are `null`. + /// Use `has_min_max_set` method to check that. + pub fn max(&self) -> &T::T { + self.max.as_ref().unwrap() + } + + /// Returns min value as bytes of the statistics. + /// + /// Panics if min value is not set, use `has_min_max_set` method to check + /// if values are set. + pub fn min_bytes(&self) -> &[u8] { + self.min().as_bytes() + } + + /// Returns max value as bytes of the statistics. + /// + /// Panics if max value is not set, use `has_min_max_set` method to check + /// if values are set. 
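+    // A minimal guard for the panicking min/max accessors, shown here with the public
+    // `Statistics` wrapper; the values are placeholders:
+    //
+    //     let stats = Statistics::int32(Some(1), Some(10), None, 0, false);
+    //     if stats.has_min_max_set() {
+    //         let min = stats.min_bytes();
+    //         let max = stats.max_bytes();
+    //     }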
+ pub fn max_bytes(&self) -> &[u8] { + self.max().as_bytes() + } + + /// Whether or not min and max values are set. + /// Normally both min/max values will be set to `Some(value)` or `None`. + fn has_min_max_set(&self) -> bool { + self.min.is_some() && self.max.is_some() + } + + /// Returns optional value of number of distinct values occurring. + fn distinct_count(&self) -> Option { + self.distinct_count + } + + /// Returns null count. + fn null_count(&self) -> u64 { + self.null_count + } + + /// Returns `true` if statistics were created using old min/max fields. + fn is_min_max_deprecated(&self) -> bool { + self.is_min_max_deprecated + } +} + +impl fmt::Display for TypedStatistics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{")?; + write!(f, "min: ")?; + match self.min { + Some(ref value) => self.value_fmt(f, value)?, + None => write!(f, "N/A")?, + } + write!(f, ", max: ")?; + match self.max { + Some(ref value) => self.value_fmt(f, value)?, + None => write!(f, "N/A")?, + } + write!(f, ", distinct_count: ")?; + match self.distinct_count { + Some(value) => write!(f, "{}", value)?, + None => write!(f, "N/A")?, + } + write!(f, ", null_count: {}", self.null_count)?; + write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?; + write!(f, "}}") + } +} + +impl fmt::Debug for TypedStatistics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \ + min_max_deprecated: {}}}", + self.min, self.max, self.distinct_count, self.null_count, self.is_min_max_deprecated + ) + } +} + +impl cmp::PartialEq for TypedStatistics { + fn eq(&self, other: &TypedStatistics) -> bool { + self.min == other.min + && self.max == other.max + && self.distinct_count == other.distinct_count + && self.null_count == other.null_count + && self.is_min_max_deprecated == other.is_min_max_deprecated + } +} + +/// Trait to provide a specific write format for values. +/// For example, we should display vector slices for byte array types, and original +/// values for other types. 
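+// Note: the `default fn` in the blanket impl below relies on the unstable
+// `specialization` feature, so this module is expected to build only on a
+// nightly toolchain.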
+trait ValueDisplay { + fn value_fmt(&self, f: &mut fmt::Formatter, value: &T::T) -> fmt::Result; +} + +impl ValueDisplay for TypedStatistics { + default fn value_fmt(&self, f: &mut fmt::Formatter, value: &T::T) -> fmt::Result { + write!(f, "{:?}", value) + } +} + +impl ValueDisplay for TypedStatistics { + fn value_fmt(&self, f: &mut fmt::Formatter, value: &Int96) -> fmt::Result { + write!(f, "{:?}", value.data()) + } +} + +impl ValueDisplay for TypedStatistics { + fn value_fmt(&self, f: &mut fmt::Formatter, value: &ByteArray) -> fmt::Result { + write!(f, "{:?}", value.data()) + } +} + +impl ValueDisplay for TypedStatistics { + fn value_fmt(&self, f: &mut fmt::Formatter, value: &ByteArray) -> fmt::Result { + write!(f, "{:?}", value.data()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_statistics_min_max_bytes() { + let stats = Statistics::int32(Some(-123), Some(234), None, 1, false); + assert!(stats.has_min_max_set()); + assert_eq!(stats.min_bytes(), (-123).as_bytes()); + assert_eq!(stats.max_bytes(), 234.as_bytes()); + + let stats = Statistics::byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![3, 4, 5])), + None, + 1, + true, + ); + assert!(stats.has_min_max_set()); + assert_eq!(stats.min_bytes(), &[1, 2, 3]); + assert_eq!(stats.max_bytes(), &[3, 4, 5]); + } + + #[test] + #[should_panic(expected = "Statistics null count is negative (-10)")] + fn test_statistics_negative_null_count() { + let thrift_stats = TStatistics { + max: None, + min: None, + null_count: Some(-10), + distinct_count: None, + max_value: None, + min_value: None, + }; + + from_thrift(Type::INT32, Some(thrift_stats)); + } + + #[test] + fn test_statistics_thrift_none() { + assert_eq!(from_thrift(Type::INT32, None), None); + assert_eq!(from_thrift(Type::BYTE_ARRAY, None), None); + } + + #[test] + fn test_statistics_debug() { + let stats = Statistics::int32(Some(1), Some(12), None, 12, true); + assert_eq!( + format!("{:?}", stats), + "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \ + min_max_deprecated: true})" + ); + + let stats = Statistics::int32(None, None, None, 7, false); + assert_eq!( + format!("{:?}", stats), + "Int32({min: None, max: None, distinct_count: None, null_count: 7, \ + min_max_deprecated: false})" + ) + } + + #[test] + fn test_statistics_display() { + let stats = Statistics::int32(Some(1), Some(12), None, 12, true); + assert_eq!( + format!("{}", stats), + "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true}" + ); + + let stats = Statistics::int64(None, None, None, 7, false); + assert_eq!( + format!("{}", stats), + "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \ + false}" + ); + + let stats = Statistics::int96( + Some(Int96::from(vec![1, 0, 0])), + Some(Int96::from(vec![2, 3, 4])), + None, + 3, + true, + ); + assert_eq!( + format!("{}", stats), + "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \ + min_max_deprecated: true}" + ); + + let stats = Statistics::byte_array( + Some(ByteArray::from(vec![1u8])), + Some(ByteArray::from(vec![2u8])), + Some(5), + 7, + false, + ); + assert_eq!( + format!("{}", stats), + "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false}" + ); + } + + #[test] + fn test_statistics_partial_eq() { + let expected = Statistics::int32(Some(12), Some(45), None, 11, true); + + assert!(Statistics::int32(Some(12), Some(45), None, 11, true) == expected); + assert!(Statistics::int32(Some(11), Some(45), 
None, 11, true) != expected); + assert!(Statistics::int32(Some(12), Some(44), None, 11, true) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, 23, true) != expected); + assert!(Statistics::int32(Some(12), Some(45), None, 11, false) != expected); + + assert!( + Statistics::int32(Some(12), Some(45), None, 11, false) + != Statistics::int64(Some(12), Some(45), None, 11, false) + ); + + assert!( + Statistics::boolean(Some(false), Some(true), None, 0, true) + != Statistics::double(Some(1.2), Some(4.5), None, 0, true) + ); + + assert!( + Statistics::byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![1, 2, 3])), + None, + 0, + true + ) != Statistics::fixed_len_byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![1, 2, 3])), + None, + 0, + true + ) + ); + } + + #[test] + fn test_statistics_from_thrift() { + // Helper method to check statistics conversion. + fn check_stats(stats: Statistics) { + let tpe = stats.physical_type(); + let thrift_stats = to_thrift(Some(&stats)); + assert_eq!(from_thrift(tpe, thrift_stats), Some(stats)); + } + + check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); + check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); + check_stats(Statistics::boolean(Some(false), Some(true), None, 0, false)); + check_stats(Statistics::boolean(Some(true), Some(true), None, 7, true)); + check_stats(Statistics::boolean(Some(false), Some(false), None, 7, true)); + check_stats(Statistics::boolean(None, None, None, 7, true)); + + check_stats(Statistics::int32(Some(-100), Some(500), None, 7, true)); + check_stats(Statistics::int32(Some(-100), Some(500), None, 0, false)); + check_stats(Statistics::int32(None, None, None, 7, true)); + + check_stats(Statistics::int64(Some(-100), Some(200), None, 7, true)); + check_stats(Statistics::int64(Some(-100), Some(200), None, 0, false)); + check_stats(Statistics::int64(None, None, None, 7, true)); + + check_stats(Statistics::float(Some(1.2), Some(3.4), None, 7, true)); + check_stats(Statistics::float(Some(1.2), Some(3.4), None, 0, false)); + check_stats(Statistics::float(None, None, None, 7, true)); + + check_stats(Statistics::double(Some(1.2), Some(3.4), None, 7, true)); + check_stats(Statistics::double(Some(1.2), Some(3.4), None, 0, false)); + check_stats(Statistics::double(None, None, None, 7, true)); + + check_stats(Statistics::byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![3, 4, 5])), + None, + 7, + true, + )); + check_stats(Statistics::byte_array(None, None, None, 7, true)); + + check_stats(Statistics::fixed_len_byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![3, 4, 5])), + None, + 7, + true, + )); + check_stats(Statistics::fixed_len_byte_array(None, None, None, 7, true)); + } +} diff --git a/rust/src/parquet/file/writer.rs b/rust/src/parquet/file/writer.rs new file mode 100644 index 0000000000000..1e0c11641f9a4 --- /dev/null +++ b/rust/src/parquet/file/writer.rs @@ -0,0 +1,936 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains file writer API, and provides methods to write row groups and columns by +//! using row group writers and column writers respectively. + +use std::{ + fs::File, + io::{Seek, SeekFrom, Write}, + rc::Rc, +}; + +use byteorder::{ByteOrder, LittleEndian}; +use parquet_format as parquet; +use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol}; + +use crate::parquet::basic::PageType; +use crate::parquet::column::{ + page::{CompressedPage, Page, PageWriteSpec, PageWriter}, + writer::{get_column_writer, ColumnWriter}, +}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::{ + metadata::*, properties::WriterPropertiesPtr, statistics::to_thrift as statistics_to_thrift, + FOOTER_SIZE, PARQUET_MAGIC, +}; +use crate::parquet::schema::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; +use crate::parquet::util::io::{FileSink, Position}; + +// ---------------------------------------------------------------------- +// APIs for file & row group writers + +/// Parquet file writer API. +/// Provides methods to write row groups sequentially. +/// +/// The main workflow should be as following: +/// - Create file writer, this will open a new file and potentially write some metadata. +/// - Request a new row group writer by calling `next_row_group`. +/// - Once finished writing row group, close row group writer by passing it into +/// `close_row_group` method - this will finalise row group metadata and update metrics. +/// - Write subsequent row groups, if necessary. +/// - After all row groups have been written, close the file writer using `close` method. +pub trait FileWriter { + /// Creates new row group from this file writer. + /// In case of IO error or Thrift error, returns `Err`. + /// + /// There is no limit on a number of row groups in a file; however, row groups have + /// to be written sequentially. Every time the next row group is requested, the + /// previous row group must be finalised and closed using `close_row_group` method. + fn next_row_group(&mut self) -> Result>; + + /// Finalises and closes row group that was created using `next_row_group` method. + /// After calling this method, the next row group is available for writes. + fn close_row_group(&mut self, row_group_writer: Box) -> Result<()>; + + /// Closes and finalises file writer. + /// + /// All row groups must be appended before this method is called. + /// No writes are allowed after this point. + /// + /// Can be called multiple times. It is up to implementation to either result in no-op, + /// or return an `Err` for subsequent calls. + fn close(&mut self) -> Result<()>; +} + +/// Parquet row group writer API. +/// Provides methods to access column writers in an iterator-like fashion, order is +/// guaranteed to match the order of schema leaves (column descriptors). +/// +/// All columns should be written sequentially; the main workflow is: +/// - Request the next column using `next_column` method - this will return `None` if no +/// more columns are available to write. 
+/// - Once done writing a column, close column writer with `close_column` method - this +/// will finalise column chunk metadata and update row group metrics. +/// - Once all columns have been written, close row group writer with `close` method - +/// it will return row group metadata and is no-op on already closed row group. +pub trait RowGroupWriter { + /// Returns the next column writer, if available; otherwise returns `None`. + /// In case of any IO error or Thrift error, or if row group writer has already been + /// closed returns `Err`. + /// + /// To request the next column writer, the previous one must be finalised and closed + /// using `close_column`. + fn next_column(&mut self) -> Result>; + + /// Closes column writer that was created using `next_column` method. + /// This should be called before requesting the next column writer. + fn close_column(&mut self, column_writer: ColumnWriter) -> Result<()>; + + /// Closes this row group writer and returns row group metadata. + /// After calling this method row group writer must not be used. + /// + /// It is recommended to call this method before requesting another row group, but it + /// will be closed automatically before returning a new row group. + /// + /// Can be called multiple times. In subsequent calls will result in no-op and return + /// already created row group metadata. + fn close(&mut self) -> Result; +} + +// ---------------------------------------------------------------------- +// Serialized impl for file & row group writers + +/// A serialized implementation for Parquet [`FileWriter`]. +/// See documentation on file writer for more information. +pub struct SerializedFileWriter { + file: File, + schema: TypePtr, + descr: SchemaDescPtr, + props: WriterPropertiesPtr, + total_num_rows: u64, + row_groups: Vec, + previous_writer_closed: bool, + is_closed: bool, +} + +impl SerializedFileWriter { + /// Creates new file writer. + pub fn new(mut file: File, schema: TypePtr, properties: WriterPropertiesPtr) -> Result { + Self::start_file(&mut file)?; + Ok(Self { + file, + schema: schema.clone(), + descr: Rc::new(SchemaDescriptor::new(schema)), + props: properties, + total_num_rows: 0, + row_groups: Vec::new(), + previous_writer_closed: true, + is_closed: false, + }) + } + + /// Writes magic bytes at the beginning of the file. + fn start_file(file: &mut File) -> Result<()> { + file.write(&PARQUET_MAGIC)?; + Ok(()) + } + + /// Finalises active row group writer, otherwise no-op. + fn finalise_row_group_writer( + &mut self, + mut row_group_writer: Box, + ) -> Result<()> { + let row_group_metadata = row_group_writer.close()?; + self.row_groups.push(row_group_metadata); + Ok(()) + } + + /// Assembles and writes metadata at the end of the file. 
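+    // For reference, the overall write path described in the `FileWriter` and
+    // `RowGroupWriter` docs above looks roughly like this; `file`, `schema`, `props`
+    // and the batch values are placeholders:
+    //
+    //     let mut writer = SerializedFileWriter::new(file, schema, props)?;
+    //     let mut row_group_writer = writer.next_row_group()?;
+    //     while let Some(mut col_writer) = row_group_writer.next_column()? {
+    //         // write batches through the typed variant of `col_writer` ...
+    //         row_group_writer.close_column(col_writer)?;
+    //     }
+    //     writer.close_row_group(row_group_writer)?;
+    //     writer.close()?;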
+ fn write_metadata(&mut self) -> Result<()> { + let file_metadata = parquet::FileMetaData { + version: self.props.writer_version().as_num(), + schema: types::to_thrift(self.schema.as_ref())?, + num_rows: self.total_num_rows as i64, + row_groups: self + .row_groups + .as_slice() + .into_iter() + .map(|v| v.to_thrift()) + .collect(), + key_value_metadata: None, + created_by: Some(self.props.created_by().to_owned()), + column_orders: None, + }; + + // Write file metadata + let start_pos = self.file.seek(SeekFrom::Current(0))?; + { + let mut protocol = TCompactOutputProtocol::new(&mut self.file); + file_metadata.write_to_out_protocol(&mut protocol)?; + protocol.flush()?; + } + let end_pos = self.file.seek(SeekFrom::Current(0))?; + + // Write footer + let mut footer_buffer: [u8; FOOTER_SIZE] = [0; FOOTER_SIZE]; + let metadata_len = (end_pos - start_pos) as i32; + LittleEndian::write_i32(&mut footer_buffer, metadata_len); + (&mut footer_buffer[4..]).write(&PARQUET_MAGIC)?; + self.file.write(&footer_buffer)?; + Ok(()) + } + + #[inline] + fn assert_closed(&self) -> Result<()> { + if self.is_closed { + Err(general_err!("File writer is closed")) + } else { + Ok(()) + } + } + + #[inline] + fn assert_previous_writer_closed(&self) -> Result<()> { + if !self.previous_writer_closed { + Err(general_err!("Previous row group writer was not closed")) + } else { + Ok(()) + } + } +} + +impl FileWriter for SerializedFileWriter { + #[inline] + fn next_row_group(&mut self) -> Result> { + self.assert_closed()?; + self.assert_previous_writer_closed()?; + let row_group_writer = + SerializedRowGroupWriter::new(self.descr.clone(), self.props.clone(), &self.file); + self.previous_writer_closed = false; + Ok(Box::new(row_group_writer)) + } + + #[inline] + fn close_row_group(&mut self, row_group_writer: Box) -> Result<()> { + self.assert_closed()?; + let res = self.finalise_row_group_writer(row_group_writer); + self.previous_writer_closed = res.is_ok(); + res + } + + #[inline] + fn close(&mut self) -> Result<()> { + self.assert_closed()?; + self.assert_previous_writer_closed()?; + self.write_metadata()?; + self.is_closed = true; + Ok(()) + } +} + +/// A serialized implementation for Parquet [`RowGroupWriter`]. +/// Coordinates writing of a row group with column writers. +/// See documentation on row group writer for more information. +pub struct SerializedRowGroupWriter { + descr: SchemaDescPtr, + props: WriterPropertiesPtr, + file: File, + total_rows_written: Option, + total_bytes_written: u64, + column_index: usize, + previous_writer_closed: bool, + row_group_metadata: Option, + column_chunks: Vec, +} + +impl SerializedRowGroupWriter { + pub fn new(schema_descr: SchemaDescPtr, properties: WriterPropertiesPtr, file: &File) -> Self { + let num_columns = schema_descr.num_columns(); + Self { + descr: schema_descr, + props: properties, + file: file.try_clone().unwrap(), + total_rows_written: None, + total_bytes_written: 0, + column_index: 0, + previous_writer_closed: true, + row_group_metadata: None, + column_chunks: Vec::with_capacity(num_columns), + } + } + + /// Checks and finalises current column writer. 
+ fn finalise_column_writer(&mut self, writer: ColumnWriter) -> Result<()> { + let (bytes_written, rows_written, metadata) = match writer { + ColumnWriter::BoolColumnWriter(typed) => typed.close()?, + ColumnWriter::Int32ColumnWriter(typed) => typed.close()?, + ColumnWriter::Int64ColumnWriter(typed) => typed.close()?, + ColumnWriter::Int96ColumnWriter(typed) => typed.close()?, + ColumnWriter::FloatColumnWriter(typed) => typed.close()?, + ColumnWriter::DoubleColumnWriter(typed) => typed.close()?, + ColumnWriter::ByteArrayColumnWriter(typed) => typed.close()?, + ColumnWriter::FixedLenByteArrayColumnWriter(typed) => typed.close()?, + }; + + // Update row group writer metrics + self.total_bytes_written += bytes_written; + self.column_chunks.push(Rc::new(metadata)); + if let Some(rows) = self.total_rows_written { + if rows != rows_written { + return Err(general_err!( + "Incorrect number of rows, expected {} != {} rows", + rows, + rows_written + )); + } + } else { + self.total_rows_written = Some(rows_written); + } + + Ok(()) + } + + #[inline] + fn assert_closed(&self) -> Result<()> { + if self.row_group_metadata.is_some() { + Err(general_err!("Row group writer is closed")) + } else { + Ok(()) + } + } + + #[inline] + fn assert_previous_writer_closed(&self) -> Result<()> { + if !self.previous_writer_closed { + Err(general_err!("Previous column writer was not closed")) + } else { + Ok(()) + } + } +} + +impl RowGroupWriter for SerializedRowGroupWriter { + #[inline] + fn next_column(&mut self) -> Result> { + self.assert_closed()?; + self.assert_previous_writer_closed()?; + + if self.column_index >= self.descr.num_columns() { + return Ok(None); + } + let sink = FileSink::new(&self.file); + let page_writer = Box::new(SerializedPageWriter::new(sink)); + let column_writer = get_column_writer( + self.descr.column(self.column_index), + self.props.clone(), + page_writer, + ); + self.column_index += 1; + self.previous_writer_closed = false; + + Ok(Some(column_writer)) + } + + #[inline] + fn close_column(&mut self, column_writer: ColumnWriter) -> Result<()> { + let res = self.finalise_column_writer(column_writer); + self.previous_writer_closed = res.is_ok(); + res + } + + #[inline] + fn close(&mut self) -> Result { + if self.row_group_metadata.is_none() { + self.assert_previous_writer_closed()?; + + let row_group_metadata = RowGroupMetaData::builder(self.descr.clone()) + .set_column_metadata(self.column_chunks.clone()) + .set_total_byte_size(self.total_bytes_written as i64) + .set_num_rows(self.total_rows_written.unwrap_or(0) as i64) + .build()?; + + self.row_group_metadata = Some(Rc::new(row_group_metadata)); + } + + let metadata = self.row_group_metadata.as_ref().unwrap().clone(); + Ok(metadata) + } +} + +/// A serialized implementation for Parquet [`PageWriter`]. +/// Writes and serializes pages and metadata into output stream. +/// +/// `SerializedPageWriter` should not be used after calling `close()`. +pub struct SerializedPageWriter { + sink: T, +} + +impl SerializedPageWriter { + /// Creates new page writer. + pub fn new(sink: T) -> Self { + Self { sink } + } + + /// Serializes page header into Thrift. + /// Returns number of bytes that have been written into the sink. 
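+    // Note on the resulting layout: `write_page` below emits the Thrift-compact
+    // `PageHeader` first and then the page bytes, so every page occupies
+    // `[header][data]` in the sink, and the sizes recorded in `PageWriteSpec`
+    // include the header length.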
+ #[inline] + fn serialize_page_header(&mut self, header: parquet::PageHeader) -> Result { + let start_pos = self.sink.pos(); + { + let mut protocol = TCompactOutputProtocol::new(&mut self.sink); + header.write_to_out_protocol(&mut protocol)?; + protocol.flush()?; + } + Ok((self.sink.pos() - start_pos) as usize) + } + + /// Serializes column chunk into Thrift. + /// Returns Ok() if there are not errors serializing and writing data into the sink. + #[inline] + fn serialize_column_chunk(&mut self, chunk: parquet::ColumnChunk) -> Result<()> { + let mut protocol = TCompactOutputProtocol::new(&mut self.sink); + chunk.write_to_out_protocol(&mut protocol)?; + protocol.flush()?; + Ok(()) + } +} + +impl PageWriter for SerializedPageWriter { + fn write_page(&mut self, page: CompressedPage) -> Result { + let uncompressed_size = page.uncompressed_size(); + let compressed_size = page.compressed_size(); + let num_values = page.num_values(); + let encoding = page.encoding(); + let page_type = page.page_type(); + + let mut page_header = parquet::PageHeader { + type_: page_type.into(), + uncompressed_page_size: uncompressed_size as i32, + compressed_page_size: compressed_size as i32, + // TODO: Add support for crc checksum + crc: None, + data_page_header: None, + index_page_header: None, + dictionary_page_header: None, + data_page_header_v2: None, + }; + + match page.compressed_page() { + &Page::DataPage { + def_level_encoding, + rep_level_encoding, + ref statistics, + .. + } => { + let data_page_header = parquet::DataPageHeader { + num_values: num_values as i32, + encoding: encoding.into(), + definition_level_encoding: def_level_encoding.into(), + repetition_level_encoding: rep_level_encoding.into(), + statistics: statistics_to_thrift(statistics.as_ref()), + }; + page_header.data_page_header = Some(data_page_header); + } + &Page::DataPageV2 { + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed, + ref statistics, + .. + } => { + let data_page_header_v2 = parquet::DataPageHeaderV2 { + num_values: num_values as i32, + num_nulls: num_nulls as i32, + num_rows: num_rows as i32, + encoding: encoding.into(), + definition_levels_byte_length: def_levels_byte_len as i32, + repetition_levels_byte_length: rep_levels_byte_len as i32, + is_compressed: Some(is_compressed), + statistics: statistics_to_thrift(statistics.as_ref()), + }; + page_header.data_page_header_v2 = Some(data_page_header_v2); + } + &Page::DictionaryPage { is_sorted, .. 
} => { + let dictionary_page_header = parquet::DictionaryPageHeader { + num_values: num_values as i32, + encoding: encoding.into(), + is_sorted: Some(is_sorted), + }; + page_header.dictionary_page_header = Some(dictionary_page_header); + } + } + + let start_pos = self.sink.pos(); + + let header_size = self.serialize_page_header(page_header)?; + self.sink.write_all(page.data())?; + + let mut spec = PageWriteSpec::new(); + spec.page_type = page_type; + spec.uncompressed_size = uncompressed_size + header_size; + spec.compressed_size = compressed_size + header_size; + spec.offset = start_pos; + spec.bytes_written = self.sink.pos() - start_pos; + // Number of values is incremented for data pages only + if page_type == PageType::DATA_PAGE || page_type == PageType::DATA_PAGE_V2 { + spec.num_values = num_values; + } + + Ok(spec) + } + + fn write_metadata(&mut self, metadata: &ColumnChunkMetaData) -> Result<()> { + self.serialize_column_chunk(metadata.to_thrift()) + } + + fn close(&mut self) -> Result<()> { + self.sink.flush()?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::{error::Error, io::Cursor}; + + use crate::parquet::basic::{Compression, Encoding, Repetition, Type}; + use crate::parquet::column::page::PageReader; + use crate::parquet::compression::{create_codec, Codec}; + use crate::parquet::file::{ + properties::WriterProperties, + reader::{FileReader, SerializedFileReader, SerializedPageReader}, + statistics::{from_thrift, to_thrift, Statistics}, + }; + use crate::parquet::record::RowAccessor; + use crate::parquet::util::{memory::ByteBufferPtr, test_common::get_temp_file}; + + #[test] + fn test_file_writer_error_after_close() { + let file = get_temp_file("test_file_writer_error_after_close", &[]); + let schema = Rc::new(types::Type::group_type_builder("schema").build().unwrap()); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + writer.close().unwrap(); + { + let res = writer.next_row_group(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "File writer is closed"); + } + } + { + let res = writer.close(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "File writer is closed"); + } + } + } + + #[test] + fn test_row_group_writer_error_after_close() { + let file = get_temp_file("test_file_writer_row_group_error_after_close", &[]); + let schema = Rc::new(types::Type::group_type_builder("schema").build().unwrap()); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + row_group_writer.close().unwrap(); + + let res = row_group_writer.next_column(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "Row group writer is closed"); + } + } + + #[test] + fn test_row_group_writer_error_not_all_columns_written() { + let file = get_temp_file("test_row_group_writer_error_not_all_columns_written", &[]); + let schema = Rc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + let res = 
row_group_writer.close(); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!(err.description(), "Column length mismatch: 1 != 0"); + } + } + + #[test] + fn test_row_group_writer_num_records_mismatch() { + let file = get_temp_file("test_row_group_writer_num_records_mismatch", &[]); + let schema = Rc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![ + Rc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ), + Rc::new( + types::Type::primitive_type_builder("col2", Type::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + ); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + if let ColumnWriter::Int32ColumnWriter(ref mut typed) = col_writer { + typed.write_batch(&[1, 2, 3], None, None).unwrap(); + } + row_group_writer.close_column(col_writer).unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + if let ColumnWriter::Int32ColumnWriter(ref mut typed) = col_writer { + typed.write_batch(&[1, 2], None, None).unwrap(); + } + + let res = row_group_writer.close_column(col_writer); + assert!(res.is_err()); + if let Err(err) = res { + assert_eq!( + err.description(), + "Incorrect number of rows, expected 3 != 2 rows" + ); + } + } + + #[test] + fn test_file_writer_empty_file() { + let file = get_temp_file("test_file_writer_write_empty_file", &[]); + + let schema = Rc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ); + let props = Rc::new(WriterProperties::builder().build()); + let mut writer = + SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + assert_eq!(reader.get_row_iter(None).unwrap().count(), 0); + } + + #[test] + fn test_file_writer_empty_row_groups() { + let file = get_temp_file("test_file_writer_write_empty_row_groups", &[]); + test_file_roundtrip(file, vec![]); + } + + #[test] + fn test_file_writer_single_row_group() { + let file = get_temp_file("test_file_writer_write_single_row_group", &[]); + test_file_roundtrip(file, vec![vec![1, 2, 3, 4, 5]]); + } + + #[test] + fn test_file_writer_multiple_row_groups() { + let file = get_temp_file("test_file_writer_write_multiple_row_groups", &[]); + test_file_roundtrip( + file, + vec![ + vec![1, 2, 3, 4, 5], + vec![1, 2, 3], + vec![1], + vec![1, 2, 3, 4, 5, 6], + ], + ); + } + + #[test] + fn test_file_writer_multiple_large_row_groups() { + let file = get_temp_file("test_file_writer_multiple_large_row_groups", &[]); + test_file_roundtrip( + file, + vec![vec![123; 1024], vec![124; 1000], vec![125; 15], vec![]], + ); + } + + #[test] + fn test_page_writer_data_pages() { + let pages = vec![ + Page::DataPage { + buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5, 6, 7, 8]), + num_values: 10, + encoding: Encoding::DELTA_BINARY_PACKED, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + }, + Page::DataPageV2 { + buf: ByteBufferPtr::new(vec![4; 128]), + num_values: 10, + encoding: 
Encoding::DELTA_BINARY_PACKED, + num_nulls: 2, + num_rows: 12, + def_levels_byte_len: 24, + rep_levels_byte_len: 32, + is_compressed: false, + statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + }, + ]; + + test_page_roundtrip(&pages[..], Compression::SNAPPY, Type::INT32); + test_page_roundtrip(&pages[..], Compression::UNCOMPRESSED, Type::INT32); + } + + #[test] + fn test_page_writer_dict_pages() { + let pages = vec![ + Page::DictionaryPage { + buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5]), + num_values: 5, + encoding: Encoding::RLE_DICTIONARY, + is_sorted: false, + }, + Page::DataPage { + buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5, 6, 7, 8]), + num_values: 10, + encoding: Encoding::DELTA_BINARY_PACKED, + def_level_encoding: Encoding::RLE, + rep_level_encoding: Encoding::RLE, + statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), + }, + Page::DataPageV2 { + buf: ByteBufferPtr::new(vec![4; 128]), + num_values: 10, + encoding: Encoding::DELTA_BINARY_PACKED, + num_nulls: 2, + num_rows: 12, + def_levels_byte_len: 24, + rep_levels_byte_len: 32, + is_compressed: false, + statistics: None, + }, + ]; + + test_page_roundtrip(&pages[..], Compression::SNAPPY, Type::INT32); + test_page_roundtrip(&pages[..], Compression::UNCOMPRESSED, Type::INT32); + } + + /// Tests writing and reading pages. + /// Physical type is for statistics only, should match any defined statistics type in + /// pages. + fn test_page_roundtrip(pages: &[Page], codec: Compression, physical_type: Type) { + let mut compressed_pages = vec![]; + let mut total_num_values = 0i64; + let mut compressor = create_codec(codec).unwrap(); + + for page in pages { + let uncompressed_len = page.buffer().len(); + + let compressed_page = match page { + &Page::DataPage { + ref buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + ref statistics, + } => { + total_num_values += num_values as i64; + let output_buf = compress_helper(compressor.as_mut(), buf.data()); + + Page::DataPage { + buf: ByteBufferPtr::new(output_buf), + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())), + } + } + &Page::DataPageV2 { + ref buf, + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + ref statistics, + .. 
+ } => { + total_num_values += num_values as i64; + let offset = (def_levels_byte_len + rep_levels_byte_len) as usize; + let cmp_buf = compress_helper(compressor.as_mut(), &buf.data()[offset..]); + let mut output_buf = Vec::from(&buf.data()[..offset]); + output_buf.extend_from_slice(&cmp_buf[..]); + + Page::DataPageV2 { + buf: ByteBufferPtr::new(output_buf), + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed: compressor.is_some(), + statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())), + } + } + &Page::DictionaryPage { + ref buf, + num_values, + encoding, + is_sorted, + } => { + let output_buf = compress_helper(compressor.as_mut(), buf.data()); + + Page::DictionaryPage { + buf: ByteBufferPtr::new(output_buf), + num_values, + encoding, + is_sorted, + } + } + }; + + let compressed_page = CompressedPage::new(compressed_page, uncompressed_len); + compressed_pages.push(compressed_page); + } + + let mut buffer: Vec = vec![]; + let mut result_pages: Vec = vec![]; + { + let cursor = Cursor::new(&mut buffer); + let mut page_writer = SerializedPageWriter::new(cursor); + + for page in compressed_pages { + page_writer.write_page(page).unwrap(); + } + page_writer.close().unwrap(); + } + { + let mut page_reader = SerializedPageReader::new( + Cursor::new(&buffer), + total_num_values, + codec, + physical_type, + ) + .unwrap(); + + while let Some(page) = page_reader.get_next_page().unwrap() { + result_pages.push(page); + } + } + + assert_eq!(result_pages.len(), pages.len()); + for i in 0..result_pages.len() { + assert_page(&result_pages[i], &pages[i]); + } + } + + /// Helper function to compress a slice + fn compress_helper(compressor: Option<&mut Box>, data: &[u8]) -> Vec { + let mut output_buf = vec![]; + if let Some(cmpr) = compressor { + cmpr.compress(data, &mut output_buf).unwrap(); + } else { + output_buf.extend_from_slice(data); + } + output_buf + } + + /// Check if pages match. + fn assert_page(left: &Page, right: &Page) { + assert_eq!(left.page_type(), right.page_type()); + assert_eq!(left.buffer().data(), right.buffer().data()); + assert_eq!(left.num_values(), right.num_values()); + assert_eq!(left.encoding(), right.encoding()); + assert_eq!(to_thrift(left.statistics()), to_thrift(right.statistics())); + } + + /// File write-read roundtrip. + /// `data` consists of arrays of values for each row group. 
+ fn test_file_roundtrip(file: File, data: Vec>) { + let schema = Rc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ); + let props = Rc::new(WriterProperties::builder().build()); + let mut file_writer = + SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + + for subset in &data { + let mut row_group_writer = file_writer.next_row_group().unwrap(); + let col_writer = row_group_writer.next_column().unwrap(); + if let Some(mut writer) = col_writer { + match writer { + ColumnWriter::Int32ColumnWriter(ref mut typed) => { + typed.write_batch(&subset[..], None, None).unwrap(); + } + _ => { + unimplemented!(); + } + } + row_group_writer.close_column(writer).unwrap(); + } + file_writer.close_row_group(row_group_writer).unwrap(); + } + + file_writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + assert_eq!(reader.num_row_groups(), data.len()); + for i in 0..reader.num_row_groups() { + let row_group_reader = reader.get_row_group(i).unwrap(); + let iter = row_group_reader.get_row_iter(None).unwrap(); + let res = iter + .map(|elem| elem.get_int(0).unwrap()) + .collect::>(); + assert_eq!(res, data[i]); + } + } +} diff --git a/rust/src/parquet/mod.rs b/rust/src/parquet/mod.rs new file mode 100644 index 0000000000000..58cc7b13df6d6 --- /dev/null +++ b/rust/src/parquet/mod.rs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +pub mod errors; +pub mod basic; +pub mod data_type; + +// Exported for external use, such as benchmarks +pub use self::encodings::{decoding, encoding}; +pub use self::util::memory; + +#[macro_use] +mod util; +pub mod column; +pub mod compression; +mod encodings; +pub mod file; +pub mod record; +pub mod schema; diff --git a/rust/src/parquet/record/api.rs b/rust/src/parquet/record/api.rs new file mode 100644 index 0000000000000..d6e3ec19b76f6 --- /dev/null +++ b/rust/src/parquet/record/api.rs @@ -0,0 +1,1439 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains Row enum that is used to represent record in Rust. + +use std::fmt; + +use chrono::{Local, TimeZone}; +use num_bigint::{BigInt, Sign}; + +use crate::parquet::basic::{LogicalType, Type as PhysicalType}; +use crate::parquet::data_type::{ByteArray, Decimal, Int96}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::ColumnDescPtr; + +/// Macro as a shortcut to generate 'not yet implemented' panic error. +macro_rules! nyi { + ($column_descr:ident, $value:ident) => {{ + unimplemented!( + "Conversion for physical type {}, logical type {}, value {:?}", + $column_descr.physical_type(), + $column_descr.logical_type(), + $value + ); + }}; +} + +/// `Row` represents a nested Parquet record. +#[derive(Clone, Debug, PartialEq)] +pub struct Row { + fields: Vec<(String, Field)>, +} + +impl Row { + /// Get the number of fields in this row. + pub fn len(&self) -> usize { + self.fields.len() + } +} + +/// Trait for type-safe convenient access to fields within a Row. +pub trait RowAccessor { + fn get_bool(&self, i: usize) -> Result; + fn get_byte(&self, i: usize) -> Result; + fn get_short(&self, i: usize) -> Result; + fn get_int(&self, i: usize) -> Result; + fn get_long(&self, i: usize) -> Result; + fn get_ubyte(&self, i: usize) -> Result; + fn get_ushort(&self, i: usize) -> Result; + fn get_uint(&self, i: usize) -> Result; + fn get_ulong(&self, i: usize) -> Result; + fn get_float(&self, i: usize) -> Result; + fn get_double(&self, i: usize) -> Result; + fn get_timestamp(&self, i: usize) -> Result; + fn get_decimal(&self, i: usize) -> Result<&Decimal>; + fn get_string(&self, i: usize) -> Result<&String>; + fn get_bytes(&self, i: usize) -> Result<&ByteArray>; + fn get_group(&self, i: usize) -> Result<&Row>; + fn get_list(&self, i: usize) -> Result<&List>; + fn get_map(&self, i: usize) -> Result<&Map>; +} + +/// Macro to generate type-safe get_xxx methods for primitive types, +/// e.g. `get_bool`, `get_short`. +macro_rules! row_primitive_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<$TY> { + match self.fields[i].1 { + Field::$VARIANT(v) => Ok(v), + _ => Err(general_err!("Cannot access {} as {}", + self.fields[i].1.get_type_name(), stringify!($VARIANT))) + } + } + } +} + +/// Macro to generate type-safe get_xxx methods for reference types, +/// e.g. `get_list`, `get_map`. +macro_rules! 
row_complex_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<&$TY> { + match self.fields[i].1 { + Field::$VARIANT(ref v) => Ok(v), + _ => Err(general_err!("Cannot access {} as {}", + self.fields[i].1.get_type_name(), stringify!($VARIANT))) + } + } + } +} + +impl RowAccessor for Row { + row_primitive_accessor!(get_bool, Bool, bool); + + row_primitive_accessor!(get_byte, Byte, i8); + + row_primitive_accessor!(get_short, Short, i16); + + row_primitive_accessor!(get_int, Int, i32); + + row_primitive_accessor!(get_long, Long, i64); + + row_primitive_accessor!(get_ubyte, UByte, u8); + + row_primitive_accessor!(get_ushort, UShort, u16); + + row_primitive_accessor!(get_uint, UInt, u32); + + row_primitive_accessor!(get_ulong, ULong, u64); + + row_primitive_accessor!(get_float, Float, f32); + + row_primitive_accessor!(get_double, Double, f64); + + row_primitive_accessor!(get_timestamp, Timestamp, u64); + + row_complex_accessor!(get_decimal, Decimal, Decimal); + + row_complex_accessor!(get_string, Str, String); + + row_complex_accessor!(get_bytes, Bytes, ByteArray); + + row_complex_accessor!(get_group, Group, Row); + + row_complex_accessor!(get_list, ListInternal, List); + + row_complex_accessor!(get_map, MapInternal, Map); +} + +/// Constructs a `Row` from the list of `fields` and returns it. +#[inline] +pub fn make_row(fields: Vec<(String, Field)>) -> Row { + Row { fields } +} + +impl fmt::Display for Row { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{{")?; + for (i, &(ref key, ref value)) in self.fields.iter().enumerate() { + key.fmt(f)?; + write!(f, ": ")?; + value.fmt(f)?; + if i < self.fields.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, "}}") + } +} + +/// `List` represents a list which contains an array of elements. +#[derive(Clone, Debug, PartialEq)] +pub struct List { + elements: Vec, +} + +impl List { + /// Get the number of fields in this row + pub fn len(&self) -> usize { + self.elements.len() + } +} + +/// Constructs a `List` from the list of `fields` and returns it. +#[inline] +pub fn make_list(elements: Vec) -> List { + List { elements } +} + +/// Trait for type-safe access of an index for a `List`. +/// Note that the get_XXX methods do not do bound checking. +pub trait ListAccessor { + fn get_bool(&self, i: usize) -> Result; + fn get_byte(&self, i: usize) -> Result; + fn get_short(&self, i: usize) -> Result; + fn get_int(&self, i: usize) -> Result; + fn get_long(&self, i: usize) -> Result; + fn get_ubyte(&self, i: usize) -> Result; + fn get_ushort(&self, i: usize) -> Result; + fn get_uint(&self, i: usize) -> Result; + fn get_ulong(&self, i: usize) -> Result; + fn get_float(&self, i: usize) -> Result; + fn get_double(&self, i: usize) -> Result; + fn get_timestamp(&self, i: usize) -> Result; + fn get_decimal(&self, i: usize) -> Result<&Decimal>; + fn get_string(&self, i: usize) -> Result<&String>; + fn get_bytes(&self, i: usize) -> Result<&ByteArray>; + fn get_group(&self, i: usize) -> Result<&Row>; + fn get_list(&self, i: usize) -> Result<&List>; + fn get_map(&self, i: usize) -> Result<&Map>; +} + +/// Macro to generate type-safe get_xxx methods for primitive types, +/// e.g. get_bool, get_short +macro_rules! 
list_primitive_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<$TY> { + match self.elements[i] { + Field::$VARIANT(v) => Ok(v), + _ => Err(general_err!( + "Cannot access {} as {}", + self.elements[i].get_type_name(), stringify!($VARIANT)) + ) + } + } + } +} + +/// Macro to generate type-safe get_xxx methods for reference types +/// e.g. get_list, get_map +macro_rules! list_complex_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<&$TY> { + match self.elements[i] { + Field::$VARIANT(ref v) => Ok(v), + _ => Err(general_err!( + "Cannot access {} as {}", + self.elements[i].get_type_name(), stringify!($VARIANT)) + ) + } + } + } +} + +impl ListAccessor for List { + list_primitive_accessor!(get_bool, Bool, bool); + + list_primitive_accessor!(get_byte, Byte, i8); + + list_primitive_accessor!(get_short, Short, i16); + + list_primitive_accessor!(get_int, Int, i32); + + list_primitive_accessor!(get_long, Long, i64); + + list_primitive_accessor!(get_ubyte, UByte, u8); + + list_primitive_accessor!(get_ushort, UShort, u16); + + list_primitive_accessor!(get_uint, UInt, u32); + + list_primitive_accessor!(get_ulong, ULong, u64); + + list_primitive_accessor!(get_float, Float, f32); + + list_primitive_accessor!(get_double, Double, f64); + + list_primitive_accessor!(get_timestamp, Timestamp, u64); + + list_complex_accessor!(get_decimal, Decimal, Decimal); + + list_complex_accessor!(get_string, Str, String); + + list_complex_accessor!(get_bytes, Bytes, ByteArray); + + list_complex_accessor!(get_group, Group, Row); + + list_complex_accessor!(get_list, ListInternal, List); + + list_complex_accessor!(get_map, MapInternal, Map); +} + +/// `Map` represents a map which contains an list of key->value pairs. +#[derive(Clone, Debug, PartialEq)] +pub struct Map { + entries: Vec<(Field, Field)>, +} + +impl Map { + /// Get the number of fields in this row + pub fn len(&self) -> usize { + self.entries.len() + } +} + +/// Constructs a `Map` from the list of `entries` and returns it. +#[inline] +pub fn make_map(entries: Vec<(Field, Field)>) -> Map { + Map { entries } +} + +/// Trait for type-safe access of an index for a `Map` +pub trait MapAccessor { + fn get_keys<'a>(&'a self) -> Box; + fn get_values<'a>(&'a self) -> Box; +} + +struct MapList<'a> { + elements: Vec<&'a Field>, +} + +/// Macro to generate type-safe get_xxx methods for primitive types, +/// e.g. get_bool, get_short +macro_rules! 
map_list_primitive_accessor { + ($METHOD:ident, $VARIANT:ident, $TY:ty) => { + fn $METHOD(&self, i: usize) -> Result<$TY> { + match self.elements[i] { + Field::$VARIANT(v) => Ok(*v), + _ => Err(general_err!( + "Cannot access {} as {}", + self.elements[i].get_type_name(), stringify!($VARIANT)) + ) + } + } + } +} + +impl<'a> ListAccessor for MapList<'a> { + map_list_primitive_accessor!(get_bool, Bool, bool); + + map_list_primitive_accessor!(get_byte, Byte, i8); + + map_list_primitive_accessor!(get_short, Short, i16); + + map_list_primitive_accessor!(get_int, Int, i32); + + map_list_primitive_accessor!(get_long, Long, i64); + + map_list_primitive_accessor!(get_ubyte, UByte, u8); + + map_list_primitive_accessor!(get_ushort, UShort, u16); + + map_list_primitive_accessor!(get_uint, UInt, u32); + + map_list_primitive_accessor!(get_ulong, ULong, u64); + + map_list_primitive_accessor!(get_float, Float, f32); + + map_list_primitive_accessor!(get_double, Double, f64); + + map_list_primitive_accessor!(get_timestamp, Timestamp, u64); + + list_complex_accessor!(get_decimal, Decimal, Decimal); + + list_complex_accessor!(get_string, Str, String); + + list_complex_accessor!(get_bytes, Bytes, ByteArray); + + list_complex_accessor!(get_group, Group, Row); + + list_complex_accessor!(get_list, ListInternal, List); + + list_complex_accessor!(get_map, MapInternal, Map); +} + +impl MapAccessor for Map { + fn get_keys<'a>(&'a self) -> Box { + let map_list = MapList { + elements: self.entries.iter().map(|v| &v.0).collect(), + }; + Box::new(map_list) + } + + fn get_values<'a>(&'a self) -> Box { + let map_list = MapList { + elements: self.entries.iter().map(|v| &v.1).collect(), + }; + Box::new(map_list) + } +} + +/// API to represent a single field in a `Row`. +#[derive(Clone, Debug, PartialEq)] +pub enum Field { + // Primitive types + /// Null value. + Null, + /// Boolean value (`true`, `false`). + Bool(bool), + /// Signed integer INT_8. + Byte(i8), + /// Signed integer INT_16. + Short(i16), + /// Signed integer INT_32. + Int(i32), + /// Signed integer INT_64. + Long(i64), + // Unsigned integer UINT_8. + UByte(u8), + // Unsigned integer UINT_16. + UShort(u16), + // Unsigned integer UINT_32. + UInt(u32), + // Unsigned integer UINT_64. + ULong(u64), + /// IEEE 32-bit floating point value. + Float(f32), + /// IEEE 64-bit floating point value. + Double(f64), + /// Decimal value. + Decimal(Decimal), + /// UTF-8 encoded character string. + Str(String), + /// General binary value. + Bytes(ByteArray), + /// Date without a time of day, stores the number of days from the + /// Unix epoch, 1 January 1970. + Date(u32), + /// Milliseconds from the Unix epoch, 1 January 1970. + Timestamp(u64), + + // ---------------------------------------------------------------------- + // Complex types + /// Struct, child elements are tuples of field-value pairs. + Group(Row), + /// List of elements. + ListInternal(List), + /// List of key-value pairs. + MapInternal(Map), +} + +impl Field { + /// Get the type name. 
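As a quick illustration of the accessor traits above (a sketch, not part of this change), assuming `RowAccessor`, `ListAccessor` and `MapAccessor` are in scope (they are re-exported from `parquet::record`); the values are built by hand with `make_row`/`make_list`/`make_map` rather than read from a file:

    // Sketch only: typed access to hand-assembled record values.
    let row = make_row(vec![
        ("id".to_string(), Field::Int(7)),
        ("name".to_string(), Field::Str("alice".to_string())),
        ("scores".to_string(), Field::ListInternal(make_list(vec![Field::Double(1.5)]))),
    ]);
    assert_eq!(7, row.get_int(0).unwrap());
    assert_eq!("alice", row.get_string(1).unwrap());
    assert_eq!(1.5, row.get_list(2).unwrap().get_double(0).unwrap());

    let map = make_map(vec![(Field::Int(1), Field::Str("a".to_string()))]);
    assert_eq!(1, map.get_keys().get_int(0).unwrap());
    assert_eq!(&"a".to_string(), map.get_values().get_string(0).unwrap());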
+ fn get_type_name(&self) -> &'static str { + match *self { + Field::Null => "Null", + Field::Bool(_) => "Bool", + Field::Byte(_) => "Byte", + Field::Short(_) => "Short", + Field::Int(_) => "Int", + Field::Long(_) => "Long", + Field::UByte(_) => "UByte", + Field::UShort(_) => "UShort", + Field::UInt(_) => "UInt", + Field::ULong(_) => "ULong", + Field::Float(_) => "Float", + Field::Double(_) => "Double", + Field::Decimal(_) => "Decimal", + Field::Date(_) => "Date", + Field::Str(_) => "Str", + Field::Bytes(_) => "Bytes", + Field::Timestamp(_) => "Timestamp", + Field::Group(_) => "Group", + Field::ListInternal(_) => "ListInternal", + Field::MapInternal(_) => "MapInternal", + } + } + + /// Determines if this Row represents a primitive value. + pub fn is_primitive(&self) -> bool { + match *self { + Field::Group(_) => false, + Field::ListInternal(_) => false, + Field::MapInternal(_) => false, + _ => true, + } + } + + /// Converts Parquet BOOLEAN type with logical type into `bool` value. + #[inline] + pub fn convert_bool(_descr: &ColumnDescPtr, value: bool) -> Self { + Field::Bool(value) + } + + /// Converts Parquet INT32 type with logical type into `i32` value. + #[inline] + pub fn convert_int32(descr: &ColumnDescPtr, value: i32) -> Self { + match descr.logical_type() { + LogicalType::INT_8 => Field::Byte(value as i8), + LogicalType::INT_16 => Field::Short(value as i16), + LogicalType::INT_32 | LogicalType::NONE => Field::Int(value), + LogicalType::UINT_8 => Field::UByte(value as u8), + LogicalType::UINT_16 => Field::UShort(value as u16), + LogicalType::UINT_32 => Field::UInt(value as u32), + LogicalType::DATE => Field::Date(value as u32), + LogicalType::DECIMAL => Field::Decimal(Decimal::from_i32( + value, + descr.type_precision(), + descr.type_scale(), + )), + _ => nyi!(descr, value), + } + } + + /// Converts Parquet INT64 type with logical type into `i64` value. + #[inline] + pub fn convert_int64(descr: &ColumnDescPtr, value: i64) -> Self { + match descr.logical_type() { + LogicalType::INT_64 | LogicalType::NONE => Field::Long(value), + LogicalType::UINT_64 => Field::ULong(value as u64), + LogicalType::TIMESTAMP_MILLIS => Field::Timestamp(value as u64), + LogicalType::DECIMAL => Field::Decimal(Decimal::from_i64( + value, + descr.type_precision(), + descr.type_scale(), + )), + _ => nyi!(descr, value), + } + } + + /// Converts Parquet INT96 (nanosecond timestamps) type and logical type into + /// `Timestamp` value. + #[inline] + pub fn convert_int96(_descr: &ColumnDescPtr, value: Int96) -> Self { + const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588; + const SECONDS_PER_DAY: i64 = 86_400; + const MILLIS_PER_SECOND: i64 = 1_000; + + let day = value.data()[2] as i64; + let nanoseconds = ((value.data()[1] as i64) << 32) + value.data()[0] as i64; + let seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY; + let millis = seconds * MILLIS_PER_SECOND + nanoseconds / 1_000_000; + + // TODO: Add support for negative milliseconds. + // Chrono library does not handle negative timestamps, but we could probably write + // something similar to java.util.Date and java.util.Calendar. + if millis < 0 { + panic!( + "Expected non-negative milliseconds when converting Int96, found {}", + millis + ); + } + + Field::Timestamp(millis as u64) + } + + /// Converts Parquet FLOAT type with logical type into `f32` value. + #[inline] + pub fn convert_float(_descr: &ColumnDescPtr, value: f32) -> Self { + Field::Float(value) + } + + /// Converts Parquet DOUBLE type with logical type into `f64` value. 
+ #[inline] + pub fn convert_double(_descr: &ColumnDescPtr, value: f64) -> Self { + Field::Double(value) + } + + /// Converts Parquet BYTE_ARRAY type with logical type into either UTF8 string or + /// array of bytes. + #[inline] + pub fn convert_byte_array(descr: &ColumnDescPtr, value: ByteArray) -> Self { + match descr.physical_type() { + PhysicalType::BYTE_ARRAY => match descr.logical_type() { + LogicalType::UTF8 | LogicalType::ENUM | LogicalType::JSON => { + let value = unsafe { String::from_utf8_unchecked(value.data().to_vec()) }; + Field::Str(value) + } + LogicalType::BSON | LogicalType::NONE => Field::Bytes(value), + LogicalType::DECIMAL => Field::Decimal(Decimal::from_bytes( + value, + descr.type_precision(), + descr.type_scale(), + )), + _ => nyi!(descr, value), + }, + PhysicalType::FIXED_LEN_BYTE_ARRAY => match descr.logical_type() { + LogicalType::DECIMAL => Field::Decimal(Decimal::from_bytes( + value, + descr.type_precision(), + descr.type_scale(), + )), + LogicalType::NONE => Field::Bytes(value), + _ => nyi!(descr, value), + }, + _ => nyi!(descr, value), + } + } +} + +impl fmt::Display for Field { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Field::Null => write!(f, "null"), + Field::Bool(value) => write!(f, "{}", value), + Field::Byte(value) => write!(f, "{}", value), + Field::Short(value) => write!(f, "{}", value), + Field::Int(value) => write!(f, "{}", value), + Field::Long(value) => write!(f, "{}", value), + Field::UByte(value) => write!(f, "{}", value), + Field::UShort(value) => write!(f, "{}", value), + Field::UInt(value) => write!(f, "{}", value), + Field::ULong(value) => write!(f, "{}", value), + Field::Float(value) => { + if value > 1e19 || value < 1e-15 { + write!(f, "{:E}", value) + } else { + write!(f, "{:?}", value) + } + } + Field::Double(value) => { + if value > 1e19 || value < 1e-15 { + write!(f, "{:E}", value) + } else { + write!(f, "{:?}", value) + } + } + Field::Decimal(ref value) => write!(f, "{}", convert_decimal_to_string(value)), + Field::Str(ref value) => write!(f, "\"{}\"", value), + Field::Bytes(ref value) => write!(f, "{:?}", value.data()), + Field::Date(value) => write!(f, "{}", convert_date_to_string(value)), + Field::Timestamp(value) => write!(f, "{}", convert_timestamp_to_string(value)), + Field::Group(ref fields) => write!(f, "{}", fields), + Field::ListInternal(ref list) => { + let elems = &list.elements; + write!(f, "[")?; + for (i, field) in elems.iter().enumerate() { + field.fmt(f)?; + if i < elems.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, "]") + } + Field::MapInternal(ref map) => { + let entries = &map.entries; + write!(f, "{{")?; + for (i, &(ref key, ref value)) in entries.iter().enumerate() { + key.fmt(f)?; + write!(f, " -> ")?; + value.fmt(f)?; + if i < entries.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, "}}") + } + } + } +} + +/// Helper method to convert Parquet date into a string. +/// Input `value` is a number of days since the epoch in UTC. +/// Date is displayed in local timezone. +#[inline] +fn convert_date_to_string(value: u32) -> String { + static NUM_SECONDS_IN_DAY: i64 = 60 * 60 * 24; + let dt = Local.timestamp(value as i64 * NUM_SECONDS_IN_DAY, 0).date(); + format!("{}", dt.format("%Y-%m-%d %:z")) +} + +/// Helper method to convert Parquet timestamp into a string. +/// Input `value` is a number of milliseconds since the epoch in UTC. +/// Datetime is displayed in local timezone. 
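As a worked example of the date helper above: 14611 days after the Unix epoch is 2010-01-02 (40 * 365 days plus 10 leap days reach 2010-01-01 at day 14610), which is the value exercised by `Field::Date(14611)` in the tests below; the rendered string carries the local timezone offset because the helper formats with `%:z`.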
+#[inline] +fn convert_timestamp_to_string(value: u64) -> String { + let dt = Local.timestamp((value / 1000) as i64, 0); + format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")) +} + +/// Helper method to convert Parquet decimal into a string. +/// We assert that `scale >= 0` and `precision > scale`, but this will be enforced +/// when constructing Parquet schema. +#[inline] +fn convert_decimal_to_string(decimal: &Decimal) -> String { + assert!(decimal.scale() >= 0 && decimal.precision() > decimal.scale()); + + // Specify as signed bytes to resolve sign as part of conversion. + let num = BigInt::from_signed_bytes_be(decimal.data()); + + // Offset of the first digit in a string. + let negative = if num.sign() == Sign::Minus { 1 } else { 0 }; + let mut num_str = num.to_string(); + let mut point = num_str.len() as i32 - decimal.scale() - negative; + + // Convert to string form without scientific notation. + if point <= 0 { + // Zeros need to be prepended to the unscaled value. + while point < 0 { + num_str.insert(negative as usize, '0'); + point += 1; + } + num_str.insert_str(negative as usize, "0."); + } else { + // No zeroes need to be prepended to the unscaled value, simply insert decimal point. + num_str.insert((point + negative) as usize, '.'); + } + + num_str +} + +#[cfg(test)] +mod tests { + use super::*; + + use chrono; + use std::rc::Rc; + + use crate::parquet::schema::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; + + /// Creates test column descriptor based on provided type parameters. + macro_rules! make_column_descr { + ($physical_type:expr, $logical_type:expr) => {{ + let tpe = PrimitiveTypeBuilder::new("col", $physical_type) + .with_logical_type($logical_type) + .build() + .unwrap(); + Rc::new(ColumnDescriptor::new( + Rc::new(tpe), + None, + 0, + 0, + ColumnPath::from("col"), + )) + }}; + ($physical_type:expr, $logical_type:expr, $len:expr, $prec:expr, $scale:expr) => {{ + let tpe = PrimitiveTypeBuilder::new("col", $physical_type) + .with_logical_type($logical_type) + .with_length($len) + .with_precision($prec) + .with_scale($scale) + .build() + .unwrap(); + Rc::new(ColumnDescriptor::new( + Rc::new(tpe), + None, + 0, + 0, + ColumnPath::from("col"), + )) + }}; + } + + #[test] + fn test_row_convert_bool() { + // BOOLEAN value does not depend on logical type + let descr = make_column_descr![PhysicalType::BOOLEAN, LogicalType::NONE]; + + let row = Field::convert_bool(&descr, true); + assert_eq!(row, Field::Bool(true)); + + let row = Field::convert_bool(&descr, false); + assert_eq!(row, Field::Bool(false)); + } + + #[test] + fn test_row_convert_int32() { + let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_8]; + let row = Field::convert_int32(&descr, 111); + assert_eq!(row, Field::Byte(111)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_16]; + let row = Field::convert_int32(&descr, 222); + assert_eq!(row, Field::Short(222)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_32]; + let row = Field::convert_int32(&descr, 333); + assert_eq!(row, Field::Int(333)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_8]; + let row = Field::convert_int32(&descr, -1); + assert_eq!(row, Field::UByte(255)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_16]; + let row = Field::convert_int32(&descr, 256); + assert_eq!(row, Field::UShort(256)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_32]; + let row = Field::convert_int32(&descr, 1234); 
+ assert_eq!(row, Field::UInt(1234)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::NONE]; + let row = Field::convert_int32(&descr, 444); + assert_eq!(row, Field::Int(444)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::DATE]; + let row = Field::convert_int32(&descr, 14611); + assert_eq!(row, Field::Date(14611)); + + let descr = make_column_descr![PhysicalType::INT32, LogicalType::DECIMAL, 0, 8, 2]; + let row = Field::convert_int32(&descr, 444); + assert_eq!(row, Field::Decimal(Decimal::from_i32(444, 8, 2))); + } + + #[test] + fn test_row_convert_int64() { + let descr = make_column_descr![PhysicalType::INT64, LogicalType::INT_64]; + let row = Field::convert_int64(&descr, 1111); + assert_eq!(row, Field::Long(1111)); + + let descr = make_column_descr![PhysicalType::INT64, LogicalType::UINT_64]; + let row = Field::convert_int64(&descr, 78239823); + assert_eq!(row, Field::ULong(78239823)); + + let descr = make_column_descr![PhysicalType::INT64, LogicalType::TIMESTAMP_MILLIS]; + let row = Field::convert_int64(&descr, 1541186529153); + assert_eq!(row, Field::Timestamp(1541186529153)); + + let descr = make_column_descr![PhysicalType::INT64, LogicalType::NONE]; + let row = Field::convert_int64(&descr, 2222); + assert_eq!(row, Field::Long(2222)); + + let descr = make_column_descr![PhysicalType::INT64, LogicalType::DECIMAL, 0, 8, 2]; + let row = Field::convert_int64(&descr, 3333); + assert_eq!(row, Field::Decimal(Decimal::from_i64(3333, 8, 2))); + } + + #[test] + fn test_row_convert_int96() { + // INT96 value does not depend on logical type + let descr = make_column_descr![PhysicalType::INT96, LogicalType::NONE]; + + let value = Int96::from(vec![0, 0, 2454923]); + let row = Field::convert_int96(&descr, value); + assert_eq!(row, Field::Timestamp(1238544000000)); + + let value = Int96::from(vec![4165425152, 13, 2454923]); + let row = Field::convert_int96(&descr, value); + assert_eq!(row, Field::Timestamp(1238544060000)); + } + + #[test] + #[should_panic(expected = "Expected non-negative milliseconds when converting Int96")] + fn test_row_convert_int96_invalid() { + // INT96 value does not depend on logical type + let descr = make_column_descr![PhysicalType::INT96, LogicalType::NONE]; + + let value = Int96::from(vec![0, 0, 0]); + Field::convert_int96(&descr, value); + } + + #[test] + fn test_row_convert_float() { + // FLOAT value does not depend on logical type + let descr = make_column_descr![PhysicalType::FLOAT, LogicalType::NONE]; + let row = Field::convert_float(&descr, 2.31); + assert_eq!(row, Field::Float(2.31)); + } + + #[test] + fn test_row_convert_double() { + // DOUBLE value does not depend on logical type + let descr = make_column_descr![PhysicalType::DOUBLE, LogicalType::NONE]; + let row = Field::convert_double(&descr, 1.56); + assert_eq!(row, Field::Double(1.56)); + } + + #[test] + fn test_row_convert_byte_array() { + // UTF8 + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::UTF8]; + let value = ByteArray::from(vec![b'A', b'B', b'C', b'D']); + let row = Field::convert_byte_array(&descr, value); + assert_eq!(row, Field::Str("ABCD".to_string())); + + // ENUM + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::ENUM]; + let value = ByteArray::from(vec![b'1', b'2', b'3']); + let row = Field::convert_byte_array(&descr, value); + assert_eq!(row, Field::Str("123".to_string())); + + // JSON + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::JSON]; + let value = ByteArray::from(vec![b'{', 
b'"', b'a', b'"', b':', b'1', b'}']); + let row = Field::convert_byte_array(&descr, value); + assert_eq!(row, Field::Str("{\"a\":1}".to_string())); + + // NONE + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::NONE]; + let value = ByteArray::from(vec![1, 2, 3, 4, 5]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Bytes(value)); + + // BSON + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::BSON]; + let value = ByteArray::from(vec![1, 2, 3, 4, 5]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Bytes(value)); + + // DECIMAL + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::DECIMAL, 0, 8, 2]; + let value = ByteArray::from(vec![207, 200]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 8, 2))); + + // DECIMAL (FIXED_LEN_BYTE_ARRAY) + let descr = make_column_descr![ + PhysicalType::FIXED_LEN_BYTE_ARRAY, + LogicalType::DECIMAL, + 8, + 17, + 5 + ]; + let value = ByteArray::from(vec![0, 0, 0, 0, 0, 4, 147, 224]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 17, 5))); + + // NONE (FIXED_LEN_BYTE_ARRAY) + let descr = make_column_descr![ + PhysicalType::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, + 6, + 0, + 0 + ]; + let value = ByteArray::from(vec![1, 2, 3, 4, 5, 6]); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row, Field::Bytes(value)); + } + + #[test] + fn test_convert_date_to_string() { + fn check_date_conversion(y: u32, m: u32, d: u32) { + let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(0, 0, 0); + let dt = Local.from_utc_datetime(&datetime); + let res = convert_date_to_string((dt.timestamp() / 60 / 60 / 24) as u32); + let exp = format!("{}", dt.format("%Y-%m-%d %:z")); + assert_eq!(res, exp); + } + + check_date_conversion(2010, 01, 02); + check_date_conversion(2014, 05, 01); + check_date_conversion(2016, 02, 29); + check_date_conversion(2017, 09, 12); + check_date_conversion(2018, 03, 31); + } + + #[test] + fn test_convert_timestamp_to_string() { + fn check_datetime_conversion(y: u32, m: u32, d: u32, h: u32, mi: u32, s: u32) { + let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(h, mi, s); + let dt = Local.from_utc_datetime(&datetime); + let res = convert_timestamp_to_string(dt.timestamp_millis() as u64); + let exp = format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")); + assert_eq!(res, exp); + } + + check_datetime_conversion(2010, 01, 02, 13, 12, 54); + check_datetime_conversion(2011, 01, 03, 08, 23, 01); + check_datetime_conversion(2012, 04, 05, 11, 06, 32); + check_datetime_conversion(2013, 05, 12, 16, 38, 00); + check_datetime_conversion(2014, 11, 28, 21, 15, 12); + } + + #[test] + fn test_convert_float_to_string() { + assert_eq!(format!("{}", Field::Float(1.0)), "1.0"); + assert_eq!(format!("{}", Field::Float(9.63)), "9.63"); + assert_eq!(format!("{}", Field::Float(1e-15)), "0.000000000000001"); + assert_eq!(format!("{}", Field::Float(1e-16)), "1E-16"); + assert_eq!(format!("{}", Field::Float(1e19)), "10000000000000000000.0"); + assert_eq!(format!("{}", Field::Float(1e20)), "1E20"); + assert_eq!(format!("{}", Field::Float(1.7976931E30)), "1.7976931E30"); + assert_eq!(format!("{}", Field::Float(-1.7976931E30)), "-1.7976931E30"); + } + + #[test] + fn test_convert_double_to_string() { + assert_eq!(format!("{}", Field::Double(1.0)), 
"1.0"); + assert_eq!(format!("{}", Field::Double(9.63)), "9.63"); + assert_eq!(format!("{}", Field::Double(1e-15)), "0.000000000000001"); + assert_eq!(format!("{}", Field::Double(1e-16)), "1E-16"); + assert_eq!(format!("{}", Field::Double(1e19)), "10000000000000000000.0"); + assert_eq!(format!("{}", Field::Double(1e20)), "1E20"); + assert_eq!( + format!("{}", Field::Double(1.79769313486E308)), + "1.79769313486E308" + ); + assert_eq!( + format!("{}", Field::Double(-1.79769313486E308)), + "-1.79769313486E308" + ); + } + + #[test] + fn test_convert_decimal_to_string() { + // Helper method to compare decimal + fn check_decimal(bytes: Vec, precision: i32, scale: i32, res: &str) { + let decimal = Decimal::from_bytes(ByteArray::from(bytes), precision, scale); + assert_eq!(convert_decimal_to_string(&decimal), res); + } + + // This example previously used to fail in some engines + check_decimal( + vec![0, 0, 0, 0, 0, 0, 0, 0, 13, 224, 182, 179, 167, 100, 0, 0], + 38, + 18, + "1.000000000000000000", + ); + check_decimal( + vec![ + 249, 233, 247, 16, 185, 192, 202, 223, 215, 165, 192, 166, 67, 72, + ], + 36, + 28, + "-12344.0242342304923409234234293432", + ); + check_decimal(vec![0, 0, 0, 0, 0, 4, 147, 224], 17, 5, "3.00000"); + check_decimal(vec![0, 0, 0, 0, 1, 201, 195, 140], 18, 2, "300000.12"); + check_decimal(vec![207, 200], 10, 2, "-123.44"); + check_decimal(vec![207, 200], 10, 8, "-0.00012344"); + } + + #[test] + fn test_row_display() { + // Primitive types + assert_eq!(format!("{}", Field::Null), "null"); + assert_eq!(format!("{}", Field::Bool(true)), "true"); + assert_eq!(format!("{}", Field::Bool(false)), "false"); + assert_eq!(format!("{}", Field::Byte(1)), "1"); + assert_eq!(format!("{}", Field::Short(2)), "2"); + assert_eq!(format!("{}", Field::Int(3)), "3"); + assert_eq!(format!("{}", Field::Long(4)), "4"); + assert_eq!(format!("{}", Field::UByte(1)), "1"); + assert_eq!(format!("{}", Field::UShort(2)), "2"); + assert_eq!(format!("{}", Field::UInt(3)), "3"); + assert_eq!(format!("{}", Field::ULong(4)), "4"); + assert_eq!(format!("{}", Field::Float(5.0)), "5.0"); + assert_eq!(format!("{}", Field::Float(5.1234)), "5.1234"); + assert_eq!(format!("{}", Field::Double(6.0)), "6.0"); + assert_eq!(format!("{}", Field::Double(6.1234)), "6.1234"); + assert_eq!(format!("{}", Field::Str("abc".to_string())), "\"abc\""); + assert_eq!( + format!("{}", Field::Bytes(ByteArray::from(vec![1, 2, 3]))), + "[1, 2, 3]" + ); + assert_eq!( + format!("{}", Field::Date(14611)), + convert_date_to_string(14611) + ); + assert_eq!( + format!("{}", Field::Timestamp(1262391174000)), + convert_timestamp_to_string(1262391174000) + ); + assert_eq!( + format!("{}", Field::Decimal(Decimal::from_i32(4, 8, 2))), + convert_decimal_to_string(&Decimal::from_i32(4, 8, 2)) + ); + + // Complex types + let fields = vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ("z".to_string(), Field::Float(3.1)), + ("a".to_string(), Field::Str("abc".to_string())), + ]; + let row = Field::Group(make_row(fields)); + assert_eq!(format!("{}", row), "{x: null, Y: 2, z: 3.1, a: \"abc\"}"); + + let row = Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ])); + assert_eq!(format!("{}", row), "[2, 1, null, 12]"); + + let row = Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ])); + assert_eq!(format!("{}", row), "{1 -> 1.2, 2 -> 4.5, 3 -> 2.3}"); + } + + #[test] + fn 
test_is_primitive() { + // primitives + assert!(Field::Null.is_primitive()); + assert!(Field::Bool(true).is_primitive()); + assert!(Field::Bool(false).is_primitive()); + assert!(Field::Byte(1).is_primitive()); + assert!(Field::Short(2).is_primitive()); + assert!(Field::Int(3).is_primitive()); + assert!(Field::Long(4).is_primitive()); + assert!(Field::UByte(1).is_primitive()); + assert!(Field::UShort(2).is_primitive()); + assert!(Field::UInt(3).is_primitive()); + assert!(Field::ULong(4).is_primitive()); + assert!(Field::Float(5.0).is_primitive()); + assert!(Field::Float(5.1234).is_primitive()); + assert!(Field::Double(6.0).is_primitive()); + assert!(Field::Double(6.1234).is_primitive()); + assert!(Field::Str("abc".to_string()).is_primitive()); + assert!(Field::Bytes(ByteArray::from(vec![1, 2, 3])).is_primitive()); + assert!(Field::Timestamp(12345678).is_primitive()); + assert!(Field::Decimal(Decimal::from_i32(4, 8, 2)).is_primitive()); + + // complex types + assert_eq!( + false, + Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ("z".to_string(), Field::Float(3.1)), + ("a".to_string(), Field::Str("abc".to_string())) + ])) + .is_primitive() + ); + + assert_eq!( + false, + Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12) + ])) + .is_primitive() + ); + + assert_eq!( + false, + Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)) + ])) + .is_primitive() + ); + } + + #[test] + fn test_row_primitive_accessors() { + // primitives + let row = make_row(vec![ + ("a".to_string(), Field::Null), + ("b".to_string(), Field::Bool(false)), + ("c".to_string(), Field::Byte(3)), + ("d".to_string(), Field::Short(4)), + ("e".to_string(), Field::Int(5)), + ("f".to_string(), Field::Long(6)), + ("g".to_string(), Field::UByte(3)), + ("h".to_string(), Field::UShort(4)), + ("i".to_string(), Field::UInt(5)), + ("j".to_string(), Field::ULong(6)), + ("k".to_string(), Field::Float(7.1)), + ("l".to_string(), Field::Double(8.1)), + ("m".to_string(), Field::Str("abc".to_string())), + ( + "n".to_string(), + Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5])), + ), + ("o".to_string(), Field::Decimal(Decimal::from_i32(4, 7, 2))), + ]); + + assert_eq!(false, row.get_bool(1).unwrap()); + assert_eq!(3, row.get_byte(2).unwrap()); + assert_eq!(4, row.get_short(3).unwrap()); + assert_eq!(5, row.get_int(4).unwrap()); + assert_eq!(6, row.get_long(5).unwrap()); + assert_eq!(3, row.get_ubyte(6).unwrap()); + assert_eq!(4, row.get_ushort(7).unwrap()); + assert_eq!(5, row.get_uint(8).unwrap()); + assert_eq!(6, row.get_ulong(9).unwrap()); + assert_eq!(7.1, row.get_float(10).unwrap()); + assert_eq!(8.1, row.get_double(11).unwrap()); + assert_eq!("abc", row.get_string(12).unwrap()); + assert_eq!(5, row.get_bytes(13).unwrap().len()); + assert_eq!(7, row.get_decimal(14).unwrap().precision()); + } + + #[test] + fn test_row_primitive_invalid_accessors() { + // primitives + let row = make_row(vec![ + ("a".to_string(), Field::Null), + ("b".to_string(), Field::Bool(false)), + ("c".to_string(), Field::Byte(3)), + ("d".to_string(), Field::Short(4)), + ("e".to_string(), Field::Int(5)), + ("f".to_string(), Field::Long(6)), + ("g".to_string(), Field::UByte(3)), + ("h".to_string(), Field::UShort(4)), + ("i".to_string(), Field::UInt(5)), + ("j".to_string(), Field::ULong(6)), + ("k".to_string(), Field::Float(7.1)), + ("l".to_string(), Field::Double(8.1)), + 
("m".to_string(), Field::Str("abc".to_string())), + ( + "n".to_string(), + Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5])), + ), + ("o".to_string(), Field::Decimal(Decimal::from_i32(4, 7, 2))), + ]); + + for i in 0..row.len() { + assert!(row.get_group(i).is_err()); + } + } + + #[test] + fn test_row_complex_accessors() { + let row = make_row(vec![ + ( + "a".to_string(), + Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ])), + ), + ( + "b".to_string(), + Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ])), + ), + ( + "c".to_string(), + Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ])), + ), + ]); + + assert_eq!(2, row.get_group(0).unwrap().len()); + assert_eq!(4, row.get_list(1).unwrap().len()); + assert_eq!(3, row.get_map(2).unwrap().len()); + } + + #[test] + fn test_row_complex_invalid_accessors() { + let row = make_row(vec![ + ( + "a".to_string(), + Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ])), + ), + ( + "b".to_string(), + Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ])), + ), + ( + "c".to_string(), + Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ])), + ), + ]); + + assert_eq!( + ParquetError::General("Cannot access Group as Float".to_string()), + row.get_float(0).unwrap_err() + ); + assert_eq!( + ParquetError::General("Cannot access ListInternal as Float".to_string()), + row.get_float(1).unwrap_err() + ); + assert_eq!( + ParquetError::General("Cannot access MapInternal as Float".to_string()), + row.get_float(2).unwrap_err() + ); + } + + #[test] + fn test_list_primitive_accessors() { + // primitives + let list = make_list(vec![Field::Bool(false)]); + assert_eq!(false, list.get_bool(0).unwrap()); + + let list = make_list(vec![Field::Byte(3), Field::Byte(4)]); + assert_eq!(4, list.get_byte(1).unwrap()); + + let list = make_list(vec![Field::Short(4), Field::Short(5), Field::Short(6)]); + assert_eq!(6, list.get_short(2).unwrap()); + + let list = make_list(vec![Field::Int(5)]); + assert_eq!(5, list.get_int(0).unwrap()); + + let list = make_list(vec![Field::Long(6), Field::Long(7)]); + assert_eq!(7, list.get_long(1).unwrap()); + + let list = make_list(vec![Field::UByte(3), Field::UByte(4)]); + assert_eq!(4, list.get_ubyte(1).unwrap()); + + let list = make_list(vec![Field::UShort(4), Field::UShort(5), Field::UShort(6)]); + assert_eq!(6, list.get_ushort(2).unwrap()); + + let list = make_list(vec![Field::UInt(5)]); + assert_eq!(5, list.get_uint(0).unwrap()); + + let list = make_list(vec![Field::ULong(6), Field::ULong(7)]); + assert_eq!(7, list.get_ulong(1).unwrap()); + + let list = make_list(vec![ + Field::Float(8.1), + Field::Float(9.2), + Field::Float(10.3), + ]); + assert_eq!(10.3, list.get_float(2).unwrap()); + + let list = make_list(vec![Field::Double(3.1415)]); + assert_eq!(3.1415, list.get_double(0).unwrap()); + + let list = make_list(vec![Field::Str("abc".to_string())]); + assert_eq!(&"abc".to_string(), list.get_string(0).unwrap()); + + let list = make_list(vec![Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5]))]); + assert_eq!(&[1, 2, 3, 4, 5], list.get_bytes(0).unwrap().data()); + + let list = make_list(vec![Field::Decimal(Decimal::from_i32(4, 5, 2))]); + 
assert_eq!(&[0, 0, 0, 4], list.get_decimal(0).unwrap().data()); + } + + #[test] + fn test_list_primitive_invalid_accessors() { + // primitives + let list = make_list(vec![Field::Bool(false)]); + assert!(list.get_byte(0).is_err()); + + let list = make_list(vec![Field::Byte(3), Field::Byte(4)]); + assert!(list.get_short(1).is_err()); + + let list = make_list(vec![Field::Short(4), Field::Short(5), Field::Short(6)]); + assert!(list.get_int(2).is_err()); + + let list = make_list(vec![Field::Int(5)]); + assert!(list.get_long(0).is_err()); + + let list = make_list(vec![Field::Long(6), Field::Long(7)]); + assert!(list.get_float(1).is_err()); + + let list = make_list(vec![Field::UByte(3), Field::UByte(4)]); + assert!(list.get_short(1).is_err()); + + let list = make_list(vec![Field::UShort(4), Field::UShort(5), Field::UShort(6)]); + assert!(list.get_int(2).is_err()); + + let list = make_list(vec![Field::UInt(5)]); + assert!(list.get_long(0).is_err()); + + let list = make_list(vec![Field::ULong(6), Field::ULong(7)]); + assert!(list.get_float(1).is_err()); + + let list = make_list(vec![ + Field::Float(8.1), + Field::Float(9.2), + Field::Float(10.3), + ]); + assert!(list.get_double(2).is_err()); + + let list = make_list(vec![Field::Double(3.1415)]); + assert!(list.get_string(0).is_err()); + + let list = make_list(vec![Field::Str("abc".to_string())]); + assert!(list.get_bytes(0).is_err()); + + let list = make_list(vec![Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5]))]); + assert!(list.get_bool(0).is_err()); + + let list = make_list(vec![Field::Decimal(Decimal::from_i32(4, 5, 2))]); + assert!(list.get_bool(0).is_err()); + } + + #[test] + fn test_list_complex_accessors() { + let list = make_list(vec![Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ]))]); + assert_eq!(2, list.get_group(0).unwrap().len()); + + let list = make_list(vec![Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ]))]); + assert_eq!(4, list.get_list(0).unwrap().len()); + + let list = make_list(vec![Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ]))]); + assert_eq!(3, list.get_map(0).unwrap().len()); + } + + #[test] + fn test_list_complex_invalid_accessors() { + let list = make_list(vec![Field::Group(make_row(vec![ + ("x".to_string(), Field::Null), + ("Y".to_string(), Field::Int(2)), + ]))]); + assert_eq!( + general_err!("Cannot access Group as Float".to_string()), + list.get_float(0).unwrap_err() + ); + + let list = make_list(vec![Field::ListInternal(make_list(vec![ + Field::Int(2), + Field::Int(1), + Field::Null, + Field::Int(12), + ]))]); + assert_eq!( + general_err!("Cannot access ListInternal as Float".to_string()), + list.get_float(0).unwrap_err() + ); + + let list = make_list(vec![Field::MapInternal(make_map(vec![ + (Field::Int(1), Field::Float(1.2)), + (Field::Int(2), Field::Float(4.5)), + (Field::Int(3), Field::Float(2.3)), + ]))]); + assert_eq!( + general_err!("Cannot access MapInternal as Float".to_string()), + list.get_float(0).unwrap_err() + ); + } + + #[test] + fn test_map_accessors() { + // a map from int to string + let map = make_map(vec![ + (Field::Int(1), Field::Str("a".to_string())), + (Field::Int(2), Field::Str("b".to_string())), + (Field::Int(3), Field::Str("c".to_string())), + (Field::Int(4), Field::Str("d".to_string())), + (Field::Int(5), Field::Str("e".to_string())), + ]); + + assert_eq!(5, map.len()); + 
for i in 0..5 { + assert_eq!((i + 1) as i32, map.get_keys().get_int(i).unwrap()); + assert_eq!( + &((i as u8 + 'a' as u8) as char).to_string(), + map.get_values().get_string(i).unwrap() + ); + } + } +} diff --git a/rust/src/parquet/record/mod.rs b/rust/src/parquet/record/mod.rs new file mode 100644 index 0000000000000..0dba8a78bd165 --- /dev/null +++ b/rust/src/parquet/record/mod.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains record-based API for reading Parquet files. + +mod api; +pub mod reader; +mod triplet; + +pub use self::api::{List, ListAccessor, Map, MapAccessor, Row, RowAccessor}; diff --git a/rust/src/parquet/record/reader.rs b/rust/src/parquet/record/reader.rs new file mode 100644 index 0000000000000..d9f3d6fea1978 --- /dev/null +++ b/rust/src/parquet/record/reader.rs @@ -0,0 +1,1464 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains implementation of record assembly and converting Parquet types into +//! [`Row`](`::record::api::Row`)s. + +use std::{collections::HashMap, fmt, rc::Rc}; + +use crate::parquet::basic::{LogicalType, Repetition}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::file::reader::{FileReader, RowGroupReader}; +use crate::parquet::record::{ + api::{make_list, make_map, make_row, Field, Row}, + triplet::TripletIter, +}; +use crate::parquet::schema::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; + +/// Default batch size for a reader +const DEFAULT_BATCH_SIZE: usize = 1024; + +/// Tree builder for `Reader` enum. +/// Serves as a container of options for building a reader tree and a builder, and +/// accessing a records iterator [`RowIter`]. +pub struct TreeBuilder { + // Batch size (>= 1) for triplet iterators + batch_size: usize, +} + +impl TreeBuilder { + /// Creates new tree builder with default parameters. + pub fn new() -> Self { + Self { + batch_size: DEFAULT_BATCH_SIZE, + } + } + + /// Sets batch size for this tree builder. 
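A minimal end-to-end sketch (not part of this change) of the record API exported above, mirroring the writer round-trip test earlier in this patch; the file name is hypothetical and the first leaf column is assumed to be INT32:

    use std::fs::File;
    use crate::parquet::file::reader::{FileReader, SerializedFileReader};
    use crate::parquet::record::RowAccessor;

    // Print the first column of every row, one row group at a time.
    let file = File::open("data.parquet").unwrap();
    let reader = SerializedFileReader::new(file).unwrap();
    for i in 0..reader.num_row_groups() {
        let row_group = reader.get_row_group(i).unwrap();
        for row in row_group.get_row_iter(None).unwrap() {
            println!("col1 = {}", row.get_int(0).unwrap());
        }
    }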
+ pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = batch_size; + self + } + + /// Creates new root reader for provided schema and row group. + pub fn build(&self, descr: SchemaDescPtr, row_group_reader: &RowGroupReader) -> Reader { + // Prepare lookup table of column path -> original column index + // This allows to prune columns and map schema leaf nodes to the column readers + let mut paths: HashMap = HashMap::new(); + let row_group_metadata = row_group_reader.metadata(); + + for col_index in 0..row_group_reader.num_columns() { + let col_meta = row_group_metadata.column(col_index); + let col_path = col_meta.column_path().clone(); + paths.insert(col_path, col_index); + } + + // Build child readers for the message type + let mut readers = Vec::new(); + let mut path = Vec::new(); + + for field in descr.root_schema().get_fields() { + let reader = self.reader_tree(field.clone(), &mut path, 0, 0, &paths, row_group_reader); + readers.push(reader); + } + + // Return group reader for message type, + // it is always required with definition level 0 + Reader::GroupReader(None, 0, readers) + } + + /// Creates iterator of `Row`s directly from schema descriptor and row group. + pub fn as_iter(&self, descr: SchemaDescPtr, row_group_reader: &RowGroupReader) -> ReaderIter { + let num_records = row_group_reader.metadata().num_rows() as usize; + ReaderIter::new(self.build(descr, row_group_reader), num_records) + } + + /// Builds tree of readers for the current schema recursively. + fn reader_tree( + &self, + field: TypePtr, + mut path: &mut Vec, + mut curr_def_level: i16, + mut curr_rep_level: i16, + paths: &HashMap, + row_group_reader: &RowGroupReader, + ) -> Reader { + assert!(field.get_basic_info().has_repetition()); + // Update current definition and repetition levels for this type + let repetition = field.get_basic_info().repetition(); + match repetition { + Repetition::OPTIONAL => { + curr_def_level += 1; + } + Repetition::REPEATED => { + curr_def_level += 1; + curr_rep_level += 1; + } + _ => {} + } + + path.push(String::from(field.name())); + let reader = if field.is_primitive() { + let col_path = ColumnPath::new(path.to_vec()); + let orig_index = *paths.get(&col_path).unwrap(); + let col_descr = row_group_reader + .metadata() + .column(orig_index) + .column_descr_ptr(); + let col_reader = row_group_reader.get_column_reader(orig_index).unwrap(); + let column = TripletIter::new(col_descr, col_reader, self.batch_size); + Reader::PrimitiveReader(field, column) + } else { + match field.get_basic_info().logical_type() { + // List types + LogicalType::LIST => { + assert_eq!(field.get_fields().len(), 1, "Invalid list type {:?}", field); + + let repeated_field = field.get_fields()[0].clone(); + assert_eq!( + repeated_field.get_basic_info().repetition(), + Repetition::REPEATED, + "Invalid list type {:?}", + field + ); + + if Reader::is_element_type(&repeated_field) { + // Support for backward compatible lists + let reader = self.reader_tree( + repeated_field.clone(), + &mut path, + curr_def_level, + curr_rep_level, + paths, + row_group_reader, + ); + + Reader::RepeatedReader( + field, + curr_def_level, + curr_rep_level, + Box::new(reader), + ) + } else { + let child_field = repeated_field.get_fields()[0].clone(); + + path.push(String::from(repeated_field.name())); + + let reader = self.reader_tree( + child_field, + &mut path, + curr_def_level + 1, + curr_rep_level + 1, + paths, + row_group_reader, + ); + + path.pop(); + + Reader::RepeatedReader( + field, + curr_def_level, + 
curr_rep_level, + Box::new(reader), + ) + } + } + // Map types (key-value pairs) + LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + assert_eq!(field.get_fields().len(), 1, "Invalid map type: {:?}", field); + assert!( + !field.get_fields()[0].is_primitive(), + "Invalid map type: {:?}", + field + ); + + let key_value_type = field.get_fields()[0].clone(); + assert_eq!( + key_value_type.get_basic_info().repetition(), + Repetition::REPEATED, + "Invalid map type: {:?}", + field + ); + assert_eq!( + key_value_type.get_fields().len(), + 2, + "Invalid map type: {:?}", + field + ); + + path.push(String::from(key_value_type.name())); + + let key_type = &key_value_type.get_fields()[0]; + assert!( + key_type.is_primitive(), + "Map key type is expected to be a primitive type, but found {:?}", + key_type + ); + let key_reader = self.reader_tree( + key_type.clone(), + &mut path, + curr_def_level + 1, + curr_rep_level + 1, + paths, + row_group_reader, + ); + + let value_type = &key_value_type.get_fields()[1]; + let value_reader = self.reader_tree( + value_type.clone(), + &mut path, + curr_def_level + 1, + curr_rep_level + 1, + paths, + row_group_reader, + ); + + path.pop(); + + Reader::KeyValueReader( + field, + curr_def_level, + curr_rep_level, + Box::new(key_reader), + Box::new(value_reader), + ) + } + // A repeated field that is neither contained by a `LIST`- or `MAP`-annotated + // group nor annotated by `LIST` or `MAP` should be interpreted as a required + // list of required elements where the element type is the type of the field. + _ if repetition == Repetition::REPEATED => { + let required_field = Type::group_type_builder(field.name()) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(field.get_basic_info().logical_type()) + .with_fields(&mut Vec::from(field.get_fields())) + .build() + .unwrap(); + + path.pop(); + + let reader = self.reader_tree( + Rc::new(required_field), + &mut path, + curr_def_level, + curr_rep_level, + paths, + row_group_reader, + ); + + Reader::RepeatedReader( + field, + curr_def_level - 1, + curr_rep_level - 1, + Box::new(reader), + ) + } + // Group types (structs) + _ => { + let mut readers = Vec::new(); + for child in field.get_fields() { + let reader = self.reader_tree( + child.clone(), + &mut path, + curr_def_level, + curr_rep_level, + paths, + row_group_reader, + ); + readers.push(reader); + } + Reader::GroupReader(Some(field), curr_def_level, readers) + } + } + }; + path.pop(); + + Reader::option(repetition, curr_def_level, reader) + } +} + +/// Reader tree for record assembly +pub enum Reader { + // Primitive reader with type information and triplet iterator + PrimitiveReader(TypePtr, TripletIter), + // Optional reader with definition level of a parent and a reader + OptionReader(i16, Box), + // Group (struct) reader with type information, definition level and list of child + // readers. When it represents message type, type information is None + GroupReader(Option, i16, Vec), + // Reader for repeated values, e.g. lists, contains type information, definition + // level, repetition level and a child reader + RepeatedReader(TypePtr, i16, i16, Box), + // Reader of key-value pairs, e.g. maps, contains type information, definition level, + // repetition level, child reader for keys and child reader for values + KeyValueReader(TypePtr, i16, i16, Box, Box), +} + +impl Reader { + /// Wraps reader in option reader based on repetition. 
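To make the recursion above concrete: for the flat schema used in the round-trip test earlier in this patch (a message with a single REQUIRED INT32 column), `build` returns `Reader::GroupReader(None, 0, vec![Reader::PrimitiveReader(..)])`, since a REQUIRED leaf adds no definition level and `Reader::option` leaves it unwrapped; were the column OPTIONAL instead, `reader_tree` would raise the definition level to 1 and the leaf would come back wrapped as `Reader::OptionReader(0, ..)`.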
+ fn option(repetition: Repetition, def_level: i16, reader: Reader) -> Self { + if repetition == Repetition::OPTIONAL { + Reader::OptionReader(def_level - 1, Box::new(reader)) + } else { + reader + } + } + + /// Returns true if repeated type is an element type for the list. + /// Used to determine legacy list types. + /// This method is copied from Spark Parquet reader and is based on the reference: + /// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + /// #backward-compatibility-rules + fn is_element_type(repeated_type: &Type) -> bool { + // For legacy 2-level list types with primitive element type, e.g.: + // + // // ARRAY (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated int32 element; + // } + // + repeated_type.is_primitive() || + // For legacy 2-level list types whose element type is a group type with 2 or more + // fields, e.g.: + // + // // ARRAY> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group element { + // required binary str (UTF8); + // required int32 num; + // }; + // } + // + repeated_type.is_group() && repeated_type.get_fields().len() > 1 || + // For legacy 2-level list types generated by parquet-avro (Parquet version < 1.6.0), + // e.g.: + // + // // ARRAY> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group array { + // required binary str (UTF8); + // }; + // } + // + repeated_type.name() == "array" || + // For Parquet data generated by parquet-thrift, e.g.: + // + // // ARRAY> (nullable list, non-null elements) + // optional group my_list (LIST) { + // repeated group my_list_tuple { + // required binary str (UTF8); + // }; + // } + // + repeated_type.name().ends_with("_tuple") + } + + /// Reads current record as `Row` from the reader tree. + /// Automatically advances all necessary readers. + /// This must be called on the root level reader (i.e., for Message type). + /// Otherwise, it will panic. + fn read(&mut self) -> Row { + match *self { + Reader::GroupReader(_, _, ref mut readers) => { + let mut fields = Vec::new(); + for reader in readers { + fields.push((String::from(reader.field_name()), reader.read_field())); + } + make_row(fields) + } + _ => panic!("Cannot call read() on {}", self), + } + } + + /// Reads current record as `Field` from the reader tree. + /// Automatically advances all necessary readers. 
+ fn read_field(&mut self) -> Field { + match *self { + Reader::PrimitiveReader(_, ref mut column) => { + let value = column.current_value(); + column.read_next().unwrap(); + value + } + Reader::OptionReader(def_level, ref mut reader) => { + if reader.current_def_level() > def_level { + reader.read_field() + } else { + reader.advance_columns(); + Field::Null + } + } + Reader::GroupReader(_, def_level, ref mut readers) => { + let mut fields = Vec::new(); + for reader in readers { + if reader.repetition() != Repetition::OPTIONAL + || reader.current_def_level() > def_level + { + fields.push((String::from(reader.field_name()), reader.read_field())); + } else { + reader.advance_columns(); + fields.push((String::from(reader.field_name()), Field::Null)); + } + } + let row = make_row(fields); + Field::Group(row) + } + Reader::RepeatedReader(_, def_level, rep_level, ref mut reader) => { + let mut elements = Vec::new(); + loop { + if reader.current_def_level() > def_level { + elements.push(reader.read_field()); + } else { + reader.advance_columns(); + // If the current definition level is equal to the definition level of this + // repeated type, then the result is an empty list and the repetition level + // will always be <= rl. + break; + } + + // This covers case when we are out of repetition levels and should close the + // group, or there are no values left to buffer. + if !reader.has_next() || reader.current_rep_level() <= rep_level { + break; + } + } + Field::ListInternal(make_list(elements)) + } + Reader::KeyValueReader(_, def_level, rep_level, ref mut keys, ref mut values) => { + let mut pairs = Vec::new(); + loop { + if keys.current_def_level() > def_level { + pairs.push((keys.read_field(), values.read_field())); + } else { + keys.advance_columns(); + values.advance_columns(); + // If the current definition level is equal to the definition level of this + // repeated type, then the result is an empty list and the repetition level + // will always be <= rl. + break; + } + + // This covers case when we are out of repetition levels and should close the + // group, or there are no values left to buffer. + if !keys.has_next() || keys.current_rep_level() <= rep_level { + break; + } + } + + Field::MapInternal(make_map(pairs)) + } + } + } + + /// Returns field name for the current reader. + fn field_name(&self) -> &str { + match *self { + Reader::PrimitiveReader(ref field, _) => field.name(), + Reader::OptionReader(_, ref reader) => reader.field_name(), + Reader::GroupReader(ref opt, ..) => match opt { + &Some(ref field) => field.name(), + &None => panic!("Field is None for group reader"), + }, + Reader::RepeatedReader(ref field, ..) => field.name(), + Reader::KeyValueReader(ref field, ..) => field.name(), + } + } + + /// Returns repetition for the current reader. + fn repetition(&self) -> Repetition { + match *self { + Reader::PrimitiveReader(ref field, _) => field.get_basic_info().repetition(), + Reader::OptionReader(_, ref reader) => reader.repetition(), + Reader::GroupReader(ref opt, ..) => match opt { + &Some(ref field) => field.get_basic_info().repetition(), + &None => panic!("Field is None for group reader"), + }, + Reader::RepeatedReader(ref field, ..) => field.get_basic_info().repetition(), + Reader::KeyValueReader(ref field, ..) => field.get_basic_info().repetition(), + } + } + + /// Returns true, if current reader has more values, false otherwise. + /// Method does not advance internal iterator. 
+ fn has_next(&self) -> bool { + match *self { + Reader::PrimitiveReader(_, ref column) => column.has_next(), + Reader::OptionReader(_, ref reader) => reader.has_next(), + Reader::GroupReader(_, _, ref readers) => readers.first().unwrap().has_next(), + Reader::RepeatedReader(_, _, _, ref reader) => reader.has_next(), + Reader::KeyValueReader(_, _, _, ref keys, _) => keys.has_next(), + } + } + + /// Returns current definition level, + /// Method does not advance internal iterator. + fn current_def_level(&self) -> i16 { + match *self { + Reader::PrimitiveReader(_, ref column) => column.current_def_level(), + Reader::OptionReader(_, ref reader) => reader.current_def_level(), + Reader::GroupReader(_, _, ref readers) => match readers.first() { + Some(reader) => reader.current_def_level(), + None => panic!("Current definition level: empty group reader"), + }, + Reader::RepeatedReader(_, _, _, ref reader) => reader.current_def_level(), + Reader::KeyValueReader(_, _, _, ref keys, _) => keys.current_def_level(), + } + } + + /// Returns current repetition level. + /// Method does not advance internal iterator. + fn current_rep_level(&self) -> i16 { + match *self { + Reader::PrimitiveReader(_, ref column) => column.current_rep_level(), + Reader::OptionReader(_, ref reader) => reader.current_rep_level(), + Reader::GroupReader(_, _, ref readers) => match readers.first() { + Some(reader) => reader.current_rep_level(), + None => panic!("Current repetition level: empty group reader"), + }, + Reader::RepeatedReader(_, _, _, ref reader) => reader.current_rep_level(), + Reader::KeyValueReader(_, _, _, ref keys, _) => keys.current_rep_level(), + } + } + + /// Advances leaf columns for the current reader. + fn advance_columns(&mut self) { + match *self { + Reader::PrimitiveReader(_, ref mut column) => { + column.read_next().unwrap(); + } + Reader::OptionReader(_, ref mut reader) => { + reader.advance_columns(); + } + Reader::GroupReader(_, _, ref mut readers) => { + for reader in readers { + reader.advance_columns(); + } + } + Reader::RepeatedReader(_, _, _, ref mut reader) => { + reader.advance_columns(); + } + Reader::KeyValueReader(_, _, _, ref mut keys, ref mut values) => { + keys.advance_columns(); + values.advance_columns(); + } + } + } +} + +impl fmt::Display for Reader { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let s = match self { + Reader::PrimitiveReader(..) => "PrimitiveReader", + Reader::OptionReader(..) => "OptionReader", + Reader::GroupReader(..) => "GroupReader", + Reader::RepeatedReader(..) => "RepeatedReader", + Reader::KeyValueReader(..) => "KeyValueReader", + }; + write!(f, "{}", s) + } +} + +// ---------------------------------------------------------------------- +// Row iterators + +/// Iterator of [`Row`](`::record::api::Row`)s. +/// It is used either for a single row group to iterate over data in that row group, or +/// an entire file with auto buffering of all row groups. +pub struct RowIter<'a> { + descr: SchemaDescPtr, + tree_builder: TreeBuilder, + file_reader: Option<&'a FileReader>, + current_row_group: usize, + num_row_groups: usize, + row_iter: Option, +} + +impl<'a> RowIter<'a> { + /// Creates iterator of [`Row`](`::record::api::Row`)s for all row groups in a file. 
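+    // Minimal usage sketch for the iterator below (assumes the test helper
+    // `get_test_file` from `util::test_common`; any other way of opening a Parquet
+    // file works as well):
+    //
+    //     use crate::parquet::file::reader::{FileReader, SerializedFileReader};
+    //
+    //     let file = get_test_file("nulls.snappy.parquet");
+    //     let reader = SerializedFileReader::new(file).unwrap();
+    //     // `None` means "no projection": read the full schema.
+    //     for row in reader.get_row_iter(None).unwrap() {
+    //         println!("{:?}", row);
+    //     }
+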
+ pub fn from_file(proj: Option, reader: &'a FileReader) -> Result { + let descr = + Self::get_proj_descr(proj, reader.metadata().file_metadata().schema_descr_ptr())?; + let num_row_groups = reader.num_row_groups(); + + Ok(Self { + descr, + tree_builder: Self::tree_builder(), + file_reader: Some(reader), + current_row_group: 0, + num_row_groups, + row_iter: None, + }) + } + + /// Creates iterator of [`Row`](`::record::api::Row`)s for a specific row group. + pub fn from_row_group(proj: Option, reader: &'a RowGroupReader) -> Result { + let descr = Self::get_proj_descr(proj, reader.metadata().schema_descr_ptr())?; + let tree_builder = Self::tree_builder(); + let row_iter = tree_builder.as_iter(descr.clone(), reader); + + // For row group we need to set `current_row_group` >= `num_row_groups`, because we + // only have one row group and can't buffer more. + Ok(Self { + descr, + tree_builder, + file_reader: None, + current_row_group: 0, + num_row_groups: 0, + row_iter: Some(row_iter), + }) + } + + /// Returns common tree builder, so the same settings are applied to both iterators + /// from file reader and row group. + #[inline] + fn tree_builder() -> TreeBuilder { + TreeBuilder::new() + } + + /// Helper method to get schema descriptor for projected schema. + /// If projection is None, then full schema is returned. + #[inline] + fn get_proj_descr(proj: Option, root_descr: SchemaDescPtr) -> Result { + match proj { + Some(projection) => { + // check if projection is part of file schema + let root_schema = root_descr.root_schema(); + if !root_schema.check_contains(&projection) { + return Err(general_err!("Root schema does not contain projection")); + } + Ok(Rc::new(SchemaDescriptor::new(Rc::new(projection)))) + } + None => Ok(root_descr), + } + } +} + +impl<'a> Iterator for RowIter<'a> { + type Item = Row; + + fn next(&mut self) -> Option { + let mut row = None; + if let Some(ref mut iter) = self.row_iter { + row = iter.next(); + } + + while row.is_none() && self.current_row_group < self.num_row_groups { + // We do not expect any failures when accessing a row group, and file reader + // must be set for selecting next row group. + let row_group_reader = &*self + .file_reader + .as_ref() + .expect("File reader is required to advance row group") + .get_row_group(self.current_row_group) + .unwrap(); + self.current_row_group += 1; + let mut iter = self + .tree_builder + .as_iter(self.descr.clone(), row_group_reader); + row = iter.next(); + self.row_iter = Some(iter); + } + + row + } +} + +/// Internal iterator of [`Row`](`::record::api::Row`)s for a reader. +pub struct ReaderIter { + root_reader: Reader, + records_left: usize, +} + +impl ReaderIter { + fn new(mut root_reader: Reader, num_records: usize) -> Self { + // Prepare root reader by advancing all column vectors + root_reader.advance_columns(); + Self { + root_reader, + records_left: num_records, + } + } +} + +impl Iterator for ReaderIter { + type Item = Row; + + fn next(&mut self) -> Option { + if self.records_left > 0 { + self.records_left -= 1; + Some(self.root_reader.read()) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::errors::{ParquetError, Result}; + use crate::parquet::file::reader::{FileReader, SerializedFileReader}; + use crate::parquet::record::api::{Field, Row}; + use crate::parquet::schema::parser::parse_message_type; + use crate::parquet::util::test_common::get_test_file; + + // Convenient macros to assemble row, list, map, and group. + + macro_rules! 
row { + () => { + { + let result = Vec::new(); + make_row(result) + } + }; + ( $( $e:expr ), + ) => { + { + let mut result = Vec::new(); + $( + result.push($e); + )* + make_row(result) + } + } + } + + macro_rules! list { + () => { + { + let result = Vec::new(); + Field::ListInternal(make_list(result)) + } + }; + ( $( $e:expr ), + ) => { + { + let mut result = Vec::new(); + $( + result.push($e); + )* + Field::ListInternal(make_list(result)) + } + } + } + + macro_rules! map { + () => { + { + let result = Vec::new(); + Field::MapInternal(make_map(result)) + } + }; + ( $( $e:expr ), + ) => { + { + let mut result = Vec::new(); + $( + result.push($e); + )* + Field::MapInternal(make_map(result)) + } + } + } + + macro_rules! group { + ( $( $e:expr ), * ) => { + { + Field::Group(row!($( $e ), *)) + } + } + } + + #[test] + fn test_file_reader_rows_nulls() { + let rows = test_file_reader_rows("nulls.snappy.parquet", None).unwrap(); + let expected_rows = vec![ + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + row![( + "b_struct".to_string(), + group![("b_c_int".to_string(), Field::Null)] + )], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_nonnullable() { + let rows = test_file_reader_rows("nonnullable.impala.parquet", None).unwrap(); + let expected_rows = vec![row![ + ("ID".to_string(), Field::Long(8)), + ("Int_Array".to_string(), list![Field::Int(-1)]), + ( + "int_array_array".to_string(), + list![list![Field::Int(-1), Field::Int(-2)], list![]] + ), + ( + "Int_Map".to_string(), + map![(Field::Str("k1".to_string()), Field::Int(-1))] + ), + ( + "int_map_array".to_string(), + list![ + map![], + map![(Field::Str("k1".to_string()), Field::Int(1))], + map![], + map![] + ] + ), + ( + "nested_Struct".to_string(), + group![ + ("a".to_string(), Field::Int(-1)), + ("B".to_string(), list![Field::Int(-1)]), + ( + "c".to_string(), + group![( + "D".to_string(), + list![list![group![ + ("e".to_string(), Field::Int(-1)), + ("f".to_string(), Field::Str("nonnullable".to_string())) + ]]] + )] + ), + ("G".to_string(), map![]) + ] + ) + ]]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_nullable() { + let rows = test_file_reader_rows("nullable.impala.parquet", None).unwrap(); + let expected_rows = vec![ + row![ + ("id".to_string(), Field::Long(1)), + ( + "int_array".to_string(), + list![Field::Int(1), Field::Int(2), Field::Int(3)] + ), + ( + "int_array_Array".to_string(), + list![ + list![Field::Int(1), Field::Int(2)], + list![Field::Int(3), Field::Int(4)] + ] + ), + ( + "int_map".to_string(), + map![ + (Field::Str("k1".to_string()), Field::Int(1)), + (Field::Str("k2".to_string()), Field::Int(100)) + ] + ), + ( + "int_Map_Array".to_string(), + list![map![(Field::Str("k1".to_string()), Field::Int(1))]] + ), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Int(1)), + ("b".to_string(), list![Field::Int(1)]), + ( + "C".to_string(), + group![( + "d".to_string(), + list![ + list![ 
+ group![ + ("E".to_string(), Field::Int(10)), + ("F".to_string(), Field::Str("aaa".to_string())) + ], + group![ + ("E".to_string(), Field::Int(-10)), + ("F".to_string(), Field::Str("bbb".to_string())) + ] + ], + list![group![ + ("E".to_string(), Field::Int(11)), + ("F".to_string(), Field::Str("c".to_string())) + ]] + ] + )] + ), + ( + "g".to_string(), + map![( + Field::Str("foo".to_string()), + group![( + "H".to_string(), + group![("i".to_string(), list![Field::Double(1.1)])] + )] + )] + ) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(2)), + ( + "int_array".to_string(), + list![ + Field::Null, + Field::Int(1), + Field::Int(2), + Field::Null, + Field::Int(3), + Field::Null + ] + ), + ( + "int_array_Array".to_string(), + list![ + list![Field::Null, Field::Int(1), Field::Int(2), Field::Null], + list![Field::Int(3), Field::Null, Field::Int(4)], + list![], + Field::Null + ] + ), + ( + "int_map".to_string(), + map![ + (Field::Str("k1".to_string()), Field::Int(2)), + (Field::Str("k2".to_string()), Field::Null) + ] + ), + ( + "int_Map_Array".to_string(), + list![ + map![ + (Field::Str("k3".to_string()), Field::Null), + (Field::Str("k1".to_string()), Field::Int(1)) + ], + Field::Null, + map![] + ] + ), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Null), + ("b".to_string(), list![Field::Null]), + ( + "C".to_string(), + group![( + "d".to_string(), + list![ + list![ + group![ + ("E".to_string(), Field::Null), + ("F".to_string(), Field::Null) + ], + group![ + ("E".to_string(), Field::Int(10)), + ("F".to_string(), Field::Str("aaa".to_string())) + ], + group![ + ("E".to_string(), Field::Null), + ("F".to_string(), Field::Null) + ], + group![ + ("E".to_string(), Field::Int(-10)), + ("F".to_string(), Field::Str("bbb".to_string())) + ], + group![ + ("E".to_string(), Field::Null), + ("F".to_string(), Field::Null) + ] + ], + list![ + group![ + ("E".to_string(), Field::Int(11)), + ("F".to_string(), Field::Str("c".to_string())) + ], + Field::Null + ], + list![], + Field::Null + ] + )] + ), + ( + "g".to_string(), + map![ + ( + Field::Str("g1".to_string()), + group![( + "H".to_string(), + group![( + "i".to_string(), + list![Field::Double(2.2), Field::Null] + )] + )] + ), + ( + Field::Str("g2".to_string()), + group![("H".to_string(), group![("i".to_string(), list![])])] + ), + (Field::Str("g3".to_string()), Field::Null), + ( + Field::Str("g4".to_string()), + group![( + "H".to_string(), + group![("i".to_string(), Field::Null)] + )] + ), + ( + Field::Str("g5".to_string()), + group![("H".to_string(), Field::Null)] + ) + ] + ) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(3)), + ("int_array".to_string(), list![]), + ("int_array_Array".to_string(), list![Field::Null]), + ("int_map".to_string(), map![]), + ("int_Map_Array".to_string(), list![Field::Null, Field::Null]), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Null), + ("b".to_string(), Field::Null), + ("C".to_string(), group![("d".to_string(), list![])]), + ("g".to_string(), map![]) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(4)), + ("int_array".to_string(), Field::Null), + ("int_array_Array".to_string(), list![]), + ("int_map".to_string(), map![]), + ("int_Map_Array".to_string(), list![]), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Null), + ("b".to_string(), Field::Null), + ("C".to_string(), group![("d".to_string(), Field::Null)]), + ("g".to_string(), Field::Null) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(5)), + ("int_array".to_string(), 
Field::Null), + ("int_array_Array".to_string(), Field::Null), + ("int_map".to_string(), map![]), + ("int_Map_Array".to_string(), Field::Null), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Null), + ("b".to_string(), Field::Null), + ("C".to_string(), Field::Null), + ( + "g".to_string(), + map![( + Field::Str("foo".to_string()), + group![( + "H".to_string(), + group![( + "i".to_string(), + list![Field::Double(2.2), Field::Double(3.3)] + )] + )] + )] + ) + ] + ) + ], + row![ + ("id".to_string(), Field::Long(6)), + ("int_array".to_string(), Field::Null), + ("int_array_Array".to_string(), Field::Null), + ("int_map".to_string(), Field::Null), + ("int_Map_Array".to_string(), Field::Null), + ("nested_struct".to_string(), Field::Null) + ], + row![ + ("id".to_string(), Field::Long(7)), + ("int_array".to_string(), Field::Null), + ( + "int_array_Array".to_string(), + list![Field::Null, list![Field::Int(5), Field::Int(6)]] + ), + ( + "int_map".to_string(), + map![ + (Field::Str("k1".to_string()), Field::Null), + (Field::Str("k3".to_string()), Field::Null) + ] + ), + ("int_Map_Array".to_string(), Field::Null), + ( + "nested_struct".to_string(), + group![ + ("A".to_string(), Field::Int(7)), + ( + "b".to_string(), + list![Field::Int(2), Field::Int(3), Field::Null] + ), + ( + "C".to_string(), + group![( + "d".to_string(), + list![list![], list![Field::Null], Field::Null] + )] + ), + ("g".to_string(), Field::Null) + ] + ) + ], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_projection() { + let schema = " + message spark_schema { + REQUIRED DOUBLE c; + REQUIRED INT32 b; + } + "; + let schema = parse_message_type(&schema).unwrap(); + let rows = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + let expected_rows = vec![ + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + row![ + ("c".to_string(), Field::Double(1.0)), + ("b".to_string(), Field::Int(1)) + ], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_projection_map() { + let schema = " + message spark_schema { + OPTIONAL group a (MAP) { + REPEATED group key_value { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value (MAP) { + REPEATED group key_value { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + } + } + } + } + "; + let schema = parse_message_type(&schema).unwrap(); + let rows = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + let expected_rows = vec![ + row![( + "a".to_string(), + map![( + Field::Str("a".to_string()), + map![ + (Field::Int(1), Field::Bool(true)), + (Field::Int(2), Field::Bool(false)) + ] + )] + )], + row![( + "a".to_string(), + map![( + Field::Str("b".to_string()), + map![(Field::Int(1), Field::Bool(true))] + )] + )], + row![( + "a".to_string(), + map![(Field::Str("c".to_string()), Field::Null)] + )], + row![("a".to_string(), map![(Field::Str("d".to_string()), map![])])], + row![( + "a".to_string(), + map![( + Field::Str("e".to_string()), + map![(Field::Int(1), Field::Bool(true))] + )] + )], + row![( + "a".to_string(), + map![( + Field::Str("f".to_string()), + map![ + (Field::Int(3), 
Field::Bool(true)), + (Field::Int(4), Field::Bool(false)), + (Field::Int(5), Field::Bool(true)) + ] + )] + )], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_projection_list() { + let schema = " + message spark_schema { + OPTIONAL group a (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL BYTE_ARRAY element (UTF8); + } + } + } + } + } + } + } + "; + let schema = parse_message_type(&schema).unwrap(); + let rows = test_file_reader_rows("nested_lists.snappy.parquet", Some(schema)).unwrap(); + let expected_rows = vec![ + row![( + "a".to_string(), + list![ + list![ + list![Field::Str("a".to_string()), Field::Str("b".to_string())], + list![Field::Str("c".to_string())] + ], + list![Field::Null, list![Field::Str("d".to_string())]] + ] + )], + row![( + "a".to_string(), + list![ + list![ + list![Field::Str("a".to_string()), Field::Str("b".to_string())], + list![Field::Str("c".to_string()), Field::Str("d".to_string())] + ], + list![Field::Null, list![Field::Str("e".to_string())]] + ] + )], + row![( + "a".to_string(), + list![ + list![ + list![Field::Str("a".to_string()), Field::Str("b".to_string())], + list![Field::Str("c".to_string()), Field::Str("d".to_string())], + list![Field::Str("e".to_string())] + ], + list![Field::Null, list![Field::Str("f".to_string())]] + ] + )], + ]; + assert_eq!(rows, expected_rows); + } + + #[test] + fn test_file_reader_rows_invalid_projection() { + let schema = " + message spark_schema { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + "; + let schema = parse_message_type(&schema).unwrap(); + let res = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)); + assert!(res.is_err()); + assert_eq!( + res.unwrap_err(), + general_err!("Root schema does not contain projection") + ); + } + + #[test] + fn test_row_group_rows_invalid_projection() { + let schema = " + message spark_schema { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + "; + let schema = parse_message_type(&schema).unwrap(); + let res = test_row_group_rows("nested_maps.snappy.parquet", Some(schema)); + assert!(res.is_err()); + assert_eq!( + res.unwrap_err(), + general_err!("Root schema does not contain projection") + ); + } + + #[test] + #[should_panic(expected = "Invalid map type")] + fn test_file_reader_rows_invalid_map_type() { + let schema = " + message spark_schema { + OPTIONAL group a (MAP) { + REPEATED group key_value { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value (MAP) { + REPEATED group key_value { + REQUIRED INT32 key; + } + } + } + } + } + "; + let schema = parse_message_type(&schema).unwrap(); + test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + } + + #[test] + fn test_tree_reader_handle_repeated_fields_with_no_annotation() { + // Array field `phoneNumbers` does not contain LIST annotation. + // We parse it as struct with `phone` repeated field as array. 
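+        // The test file is assumed to carry a schema of roughly this shape (no LIST
+        // annotation on the repeated group), which is why each row below comes back as
+        // a struct with a `phone` array rather than as a list:
+        //
+        //     message user {
+        //         required int32 id;
+        //         optional group phoneNumbers {
+        //             repeated group phone {
+        //                 required int64 number;
+        //                 optional binary kind (UTF8);
+        //             }
+        //         }
+        //     }
+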
+ let rows = test_file_reader_rows("repeated_no_annotation.parquet", None).unwrap(); + let expected_rows = vec![ + row![ + ("id".to_string(), Field::Int(1)), + ("phoneNumbers".to_string(), Field::Null) + ], + row![ + ("id".to_string(), Field::Int(2)), + ("phoneNumbers".to_string(), Field::Null) + ], + row![ + ("id".to_string(), Field::Int(3)), + ( + "phoneNumbers".to_string(), + group![("phone".to_string(), list![])] + ) + ], + row![ + ("id".to_string(), Field::Int(4)), + ( + "phoneNumbers".to_string(), + group![( + "phone".to_string(), + list![group![ + ("number".to_string(), Field::Long(5555555555)), + ("kind".to_string(), Field::Null) + ]] + )] + ) + ], + row![ + ("id".to_string(), Field::Int(5)), + ( + "phoneNumbers".to_string(), + group![( + "phone".to_string(), + list![group![ + ("number".to_string(), Field::Long(1111111111)), + ("kind".to_string(), Field::Str("home".to_string())) + ]] + )] + ) + ], + row![ + ("id".to_string(), Field::Int(6)), + ( + "phoneNumbers".to_string(), + group![( + "phone".to_string(), + list![ + group![ + ("number".to_string(), Field::Long(1111111111)), + ("kind".to_string(), Field::Str("home".to_string())) + ], + group![ + ("number".to_string(), Field::Long(2222222222)), + ("kind".to_string(), Field::Null) + ], + group![ + ("number".to_string(), Field::Long(3333333333)), + ("kind".to_string(), Field::Str("mobile".to_string())) + ] + ] + )] + ) + ], + ]; + + assert_eq!(rows, expected_rows); + } + + fn test_file_reader_rows(file_name: &str, schema: Option) -> Result> { + let file = get_test_file(file_name); + let file_reader: Box = Box::new(SerializedFileReader::new(file)?); + let iter = file_reader.get_row_iter(schema)?; + Ok(iter.collect()) + } + + fn test_row_group_rows(file_name: &str, schema: Option) -> Result> { + let file = get_test_file(file_name); + let file_reader: Box = Box::new(SerializedFileReader::new(file)?); + // Check the first row group only, because files will contain only single row group + let row_group_reader = file_reader.get_row_group(0).unwrap(); + let iter = row_group_reader.get_row_iter(schema)?; + Ok(iter.collect()) + } +} diff --git a/rust/src/parquet/record/triplet.rs b/rust/src/parquet/record/triplet.rs new file mode 100644 index 0000000000000..fadcbbce9ba5b --- /dev/null +++ b/rust/src/parquet/record/triplet.rs @@ -0,0 +1,561 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::parquet::basic::Type as PhysicalType; +use crate::parquet::column::reader::{get_typed_column_reader, ColumnReader, ColumnReaderImpl}; +use crate::parquet::data_type::*; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::record::api::Field; +use crate::parquet::schema::types::ColumnDescPtr; + +/// Macro to generate simple functions that cover all types of triplet iterator. 
+/// $func is a function of a typed triplet iterator and $token is either {`ref`} or
+/// {`ref`, `mut`}.
+macro_rules! triplet_enum_func {
+    ($self:ident, $func:ident, $( $token:tt ),*) => ({
+        match *$self {
+            TripletIter::BoolTripletIter($($token)* typed) => typed.$func(),
+            TripletIter::Int32TripletIter($($token)* typed) => typed.$func(),
+            TripletIter::Int64TripletIter($($token)* typed) => typed.$func(),
+            TripletIter::Int96TripletIter($($token)* typed) => typed.$func(),
+            TripletIter::FloatTripletIter($($token)* typed) => typed.$func(),
+            TripletIter::DoubleTripletIter($($token)* typed) => typed.$func(),
+            TripletIter::ByteArrayTripletIter($($token)* typed) => typed.$func(),
+            TripletIter::FixedLenByteArrayTripletIter($($token)* typed) => typed.$func()
+        }
+    });
+}
+
+/// High-level API wrapper on column reader.
+/// Provides per-element access for each primitive column.
+pub enum TripletIter {
+    BoolTripletIter(TypedTripletIter<BoolType>),
+    Int32TripletIter(TypedTripletIter<Int32Type>),
+    Int64TripletIter(TypedTripletIter<Int64Type>),
+    Int96TripletIter(TypedTripletIter<Int96Type>),
+    FloatTripletIter(TypedTripletIter<FloatType>),
+    DoubleTripletIter(TypedTripletIter<DoubleType>),
+    ByteArrayTripletIter(TypedTripletIter<ByteArrayType>),
+    FixedLenByteArrayTripletIter(TypedTripletIter<FixedLenByteArrayType>),
+}
+
+impl TripletIter {
+    /// Creates a new triplet for the column reader.
+    pub fn new(descr: ColumnDescPtr, reader: ColumnReader, batch_size: usize) -> Self {
+        match descr.physical_type() {
+            PhysicalType::BOOLEAN => {
+                TripletIter::BoolTripletIter(TypedTripletIter::new(descr, batch_size, reader))
+            }
+            PhysicalType::INT32 => {
+                TripletIter::Int32TripletIter(TypedTripletIter::new(descr, batch_size, reader))
+            }
+            PhysicalType::INT64 => {
+                TripletIter::Int64TripletIter(TypedTripletIter::new(descr, batch_size, reader))
+            }
+            PhysicalType::INT96 => {
+                TripletIter::Int96TripletIter(TypedTripletIter::new(descr, batch_size, reader))
+            }
+            PhysicalType::FLOAT => {
+                TripletIter::FloatTripletIter(TypedTripletIter::new(descr, batch_size, reader))
+            }
+            PhysicalType::DOUBLE => {
+                TripletIter::DoubleTripletIter(TypedTripletIter::new(descr, batch_size, reader))
+            }
+            PhysicalType::BYTE_ARRAY => {
+                TripletIter::ByteArrayTripletIter(TypedTripletIter::new(descr, batch_size, reader))
+            }
+            PhysicalType::FIXED_LEN_BYTE_ARRAY => TripletIter::FixedLenByteArrayTripletIter(
+                TypedTripletIter::new(descr, batch_size, reader),
+            ),
+        }
+    }
+
+    /// Invokes the underlying typed triplet iterator to buffer the current value.
+    /// Should be called once - either before `is_null` or `current_value`.
+    #[inline]
+    pub fn read_next(&mut self) -> Result<bool> {
+        triplet_enum_func!(self, read_next, ref, mut)
+    }
+
+    /// Provides a check on values/levels left without invoking the underlying typed
+    /// triplet iterator.
+    /// Returns true if more values/levels exist, false otherwise.
+    /// It is always in sync with the `read_next` method.
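+    // For reference, `triplet_enum_func!(self, has_next, ref)` expands to a match of
+    // roughly this shape (sketch of the generated code, one arm per variant):
+    //
+    //     match *self {
+    //         TripletIter::BoolTripletIter(ref typed) => typed.has_next(),
+    //         TripletIter::Int32TripletIter(ref typed) => typed.has_next(),
+    //         // ... and so on for the remaining physical types
+    //     }
+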
+    #[inline]
+    pub fn has_next(&self) -> bool {
+        triplet_enum_func!(self, has_next, ref)
+    }
+
+    /// Returns the current definition level for a leaf triplet iterator.
+    #[inline]
+    pub fn current_def_level(&self) -> i16 {
+        triplet_enum_func!(self, current_def_level, ref)
+    }
+
+    /// Returns the max definition level for a leaf triplet iterator.
+    #[inline]
+    pub fn max_def_level(&self) -> i16 {
+        triplet_enum_func!(self, max_def_level, ref)
+    }
+
+    /// Returns the current repetition level for a leaf triplet iterator.
+    #[inline]
+    pub fn current_rep_level(&self) -> i16 {
+        triplet_enum_func!(self, current_rep_level, ref)
+    }
+
+    /// Returns the max repetition level for a leaf triplet iterator.
+    #[inline]
+    pub fn max_rep_level(&self) -> i16 {
+        triplet_enum_func!(self, max_rep_level, ref)
+    }
+
+    /// Returns true if the current value is null.
+    /// Based on the fact that for a non-null value the current definition level
+    /// equals the max definition level.
+    #[inline]
+    pub fn is_null(&self) -> bool {
+        self.current_def_level() < self.max_def_level()
+    }
+
+    /// Returns the non-null value for the current row.
+    pub fn current_value(&self) -> Field {
+        assert!(!self.is_null(), "Value is null");
+        match *self {
+            TripletIter::BoolTripletIter(ref typed) => {
+                Field::convert_bool(typed.column_descr(), *typed.current_value())
+            }
+            TripletIter::Int32TripletIter(ref typed) => {
+                Field::convert_int32(typed.column_descr(), *typed.current_value())
+            }
+            TripletIter::Int64TripletIter(ref typed) => {
+                Field::convert_int64(typed.column_descr(), *typed.current_value())
+            }
+            TripletIter::Int96TripletIter(ref typed) => {
+                Field::convert_int96(typed.column_descr(), typed.current_value().clone())
+            }
+            TripletIter::FloatTripletIter(ref typed) => {
+                Field::convert_float(typed.column_descr(), *typed.current_value())
+            }
+            TripletIter::DoubleTripletIter(ref typed) => {
+                Field::convert_double(typed.column_descr(), *typed.current_value())
+            }
+            TripletIter::ByteArrayTripletIter(ref typed) => {
+                Field::convert_byte_array(typed.column_descr(), typed.current_value().clone())
+            }
+            TripletIter::FixedLenByteArrayTripletIter(ref typed) => {
+                Field::convert_byte_array(typed.column_descr(), typed.current_value().clone())
+            }
+        }
+    }
+}
+
+/// Internal typed triplet iterator as a wrapper for a column reader
+/// (primitive leaf column), provides per-element access.
+pub struct TypedTripletIter<T: DataType> {
+    reader: ColumnReaderImpl<T>,
+    column_descr: ColumnDescPtr,
+    batch_size: usize,
+    // type properties
+    max_def_level: i16,
+    max_rep_level: i16,
+    // values and levels
+    values: Vec<T::T>,
+    def_levels: Option<Vec<i16>>,
+    rep_levels: Option<Vec<i16>>,
+    // current index for the triplet (value, def, rep)
+    curr_triplet_index: usize,
+    // how many triplets are left before we need to buffer
+    triplets_left: usize,
+    // helper flag to quickly check if we have more values/levels to read
+    has_next: bool,
+}
+
+impl<T: DataType> TypedTripletIter<T> {
+    /// Creates a new typed triplet iterator based on the provided column reader.
+    /// Use the batch size to specify the amount of values to buffer from the column reader.
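+    // Sketch of driving a `TripletIter` by hand, mirroring what the reader tree does
+    // (assumes `descr` and `reader` were obtained from a row group, as in the tests
+    // at the bottom of this file):
+    //
+    //     let mut iter = TripletIter::new(descr, reader, 128);
+    //     while iter.read_next().unwrap() {
+    //         if iter.is_null() {
+    //             println!("null (def {}, rep {})", iter.current_def_level(), iter.current_rep_level());
+    //         } else {
+    //             println!("{:?}", iter.current_value());
+    //         }
+    //     }
+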
+ fn new(descr: ColumnDescPtr, batch_size: usize, column_reader: ColumnReader) -> Self { + assert!( + batch_size > 0, + "Expected positive batch size, found: {}", + batch_size + ); + + let max_def_level = descr.max_def_level(); + let max_rep_level = descr.max_rep_level(); + + let def_levels = if max_def_level == 0 { + None + } else { + Some(vec![0; batch_size]) + }; + let rep_levels = if max_rep_level == 0 { + None + } else { + Some(vec![0; batch_size]) + }; + + Self { + reader: get_typed_column_reader(column_reader), + column_descr: descr, + batch_size, + max_def_level, + max_rep_level, + values: vec![T::T::default(); batch_size], + def_levels, + rep_levels, + curr_triplet_index: 0, + triplets_left: 0, + has_next: false, + } + } + + /// Returns column descriptor reference for the current typed triplet iterator. + #[inline] + pub fn column_descr(&self) -> &ColumnDescPtr { + &self.column_descr + } + + /// Returns maximum definition level for the triplet iterator (leaf column). + #[inline] + fn max_def_level(&self) -> i16 { + self.max_def_level + } + + /// Returns maximum repetition level for the triplet iterator (leaf column). + #[inline] + fn max_rep_level(&self) -> i16 { + self.max_rep_level + } + + /// Returns current value. + /// Method does not advance the iterator, therefore can be called multiple times. + #[inline] + fn current_value(&self) -> &T::T { + assert!( + self.current_def_level() == self.max_def_level(), + "Cannot extract value, max definition level: {}, current level: {}", + self.max_def_level(), + self.current_def_level() + ); + &self.values[self.curr_triplet_index] + } + + /// Returns current definition level. + /// If field is required, then maximum definition level is returned. + #[inline] + fn current_def_level(&self) -> i16 { + match self.def_levels { + Some(ref vec) => vec[self.curr_triplet_index], + None => self.max_def_level, + } + } + + /// Returns current repetition level. + /// If field is required, then maximum repetition level is returned. + #[inline] + fn current_rep_level(&self) -> i16 { + match self.rep_levels { + Some(ref vec) => vec[self.curr_triplet_index], + None => self.max_rep_level, + } + } + + /// Quick check if iterator has more values/levels to read. + /// It is updated as a result of `read_next` method, so they are synchronized. + #[inline] + fn has_next(&self) -> bool { + self.has_next + } + + /// Advances to the next triplet. + /// Returns true, if there are more records to read, false there are no records left. + fn read_next(&mut self) -> Result { + self.curr_triplet_index += 1; + + if self.curr_triplet_index >= self.triplets_left { + let (values_read, levels_read) = { + // Get slice of definition levels, if available + let def_levels = match self.def_levels { + Some(ref mut vec) => Some(&mut vec[..]), + None => None, + }; + + // Get slice of repetition levels, if available + let rep_levels = match self.rep_levels { + Some(ref mut vec) => Some(&mut vec[..]), + None => None, + }; + + // Buffer triplets + self.reader + .read_batch(self.batch_size, def_levels, rep_levels, &mut self.values)? 
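+                // `read_batch` fills `values` (and the level slices, when present) and
+                // reports how many values and levels were read. Worked example with
+                // illustrative numbers, for max_def_level = 1: a batch may come back as
+                //     values     = [7, 8]        (values_read = 2)
+                //     def_levels = [1, 0, 1]     (levels_read = 3)
+                // in which case the spacing branch further below rearranges `values`
+                // into [7, _, 8], so values and levels line up position by position.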
+ }; + + // No more values or levels to read + if values_read == 0 && levels_read == 0 { + self.has_next = false; + return Ok(false); + } + + // We never read values more than levels + if levels_read == 0 || values_read == levels_read { + // There are no definition levels to read, column is required + // or definition levels match values, so it does not require spacing + self.curr_triplet_index = 0; + self.triplets_left = values_read; + } else if values_read < levels_read { + // Add spacing for triplets. + // The idea is setting values for positions in def_levels when current definition + // level equals to maximum definition level. Values and levels are guaranteed to + // line up, because of the column reader method. + + // Note: if values_read == 0, then spacing will not be triggered + let mut idx = values_read; + let def_levels = self.def_levels.as_ref().unwrap(); + for i in 0..levels_read { + if def_levels[levels_read - i - 1] == self.max_def_level { + idx -= 1; // This is done to avoid usize becoming a negative value + self.values.swap(levels_read - i - 1, idx); + } + } + self.curr_triplet_index = 0; + self.triplets_left = levels_read; + } else { + return Err(general_err!( + "Spacing of values/levels is wrong, values_read: {}, levels_read: {}", + values_read, + levels_read + )); + } + } + + self.has_next = true; + Ok(true) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::file::reader::{FileReader, SerializedFileReader}; + use crate::parquet::schema::types::ColumnPath; + use crate::parquet::util::test_common::get_test_file; + + #[test] + #[should_panic(expected = "Expected positive batch size, found: 0")] + fn test_triplet_zero_batch_size() { + let column_path = ColumnPath::from(vec!["b_struct".to_string(), "b_c_int".to_string()]); + test_column_in_file( + "nulls.snappy.parquet", + 0, + &column_path, + &vec![], + &vec![], + &vec![], + ); + } + + #[test] + fn test_triplet_null_column() { + let path = vec!["b_struct", "b_c_int"]; + let values = vec![]; + let def_levels = vec![1, 1, 1, 1, 1, 1, 1, 1]; + let rep_levels = vec![0, 0, 0, 0, 0, 0, 0, 0]; + test_triplet_iter( + "nulls.snappy.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + #[test] + fn test_triplet_required_column() { + let path = vec!["ID"]; + let values = vec![Field::Long(8)]; + let def_levels = vec![0]; + let rep_levels = vec![0]; + test_triplet_iter( + "nonnullable.impala.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + #[test] + fn test_triplet_optional_column() { + let path = vec!["nested_struct", "A"]; + let values = vec![Field::Int(1), Field::Int(7)]; + let def_levels = vec![2, 1, 1, 1, 1, 0, 2]; + let rep_levels = vec![0, 0, 0, 0, 0, 0, 0]; + test_triplet_iter( + "nullable.impala.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + #[test] + fn test_triplet_optional_list_column() { + let path = vec!["a", "list", "element", "list", "element", "list", "element"]; + let values = vec![ + Field::Str("a".to_string()), + Field::Str("b".to_string()), + Field::Str("c".to_string()), + Field::Str("d".to_string()), + Field::Str("a".to_string()), + Field::Str("b".to_string()), + Field::Str("c".to_string()), + Field::Str("d".to_string()), + Field::Str("e".to_string()), + Field::Str("a".to_string()), + Field::Str("b".to_string()), + Field::Str("c".to_string()), + Field::Str("d".to_string()), + Field::Str("e".to_string()), + Field::Str("f".to_string()), + ]; + let def_levels = vec![7, 7, 7, 4, 7, 7, 7, 7, 7, 4, 7, 7, 7, 7, 7, 7, 4, 7]; + let 
rep_levels = vec![0, 3, 2, 1, 2, 0, 3, 2, 3, 1, 2, 0, 3, 2, 3, 2, 1, 2]; + test_triplet_iter( + "nested_lists.snappy.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + #[test] + fn test_triplet_optional_map_column() { + let path = vec!["a", "key_value", "value", "key_value", "key"]; + let values = vec![ + Field::Int(1), + Field::Int(2), + Field::Int(1), + Field::Int(1), + Field::Int(3), + Field::Int(4), + Field::Int(5), + ]; + let def_levels = vec![4, 4, 4, 2, 3, 4, 4, 4, 4]; + let rep_levels = vec![0, 2, 0, 0, 0, 0, 0, 2, 2]; + test_triplet_iter( + "nested_maps.snappy.parquet", + path, + &values, + &def_levels, + &rep_levels, + ); + } + + // Check triplet iterator across different batch sizes + fn test_triplet_iter( + file_name: &str, + column_path: Vec<&str>, + expected_values: &[Field], + expected_def_levels: &[i16], + expected_rep_levels: &[i16], + ) { + // Convert path into column path + let path: Vec = column_path.iter().map(|x| x.to_string()).collect(); + let column_path = ColumnPath::from(path); + + let batch_sizes = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 128, 256]; + for batch_size in batch_sizes { + test_column_in_file( + file_name, + batch_size, + &column_path, + expected_values, + expected_def_levels, + expected_rep_levels, + ); + } + } + + // Check values of a selectd column in a file + fn test_column_in_file( + file_name: &str, + batch_size: usize, + column_path: &ColumnPath, + expected_values: &[Field], + expected_def_levels: &[i16], + expected_rep_levels: &[i16], + ) { + let file = get_test_file(file_name); + let file_reader = SerializedFileReader::new(file).unwrap(); + // Get schema descriptor + let file_metadata = file_reader.metadata().file_metadata(); + let schema = file_metadata.schema_descr(); + // Get first row group + let row_group_reader = file_reader.get_row_group(0).unwrap(); + + for i in 0..schema.num_columns() { + let descr = schema.column(i); + if descr.path() == column_path { + let reader = row_group_reader.get_column_reader(i).unwrap(); + test_triplet_column( + descr, + reader, + batch_size, + expected_values, + expected_def_levels, + expected_rep_levels, + ); + } + } + } + + // Check values for individual triplet iterator + fn test_triplet_column( + descr: ColumnDescPtr, + reader: ColumnReader, + batch_size: usize, + expected_values: &[Field], + expected_def_levels: &[i16], + expected_rep_levels: &[i16], + ) { + let mut iter = TripletIter::new(descr.clone(), reader, batch_size); + let mut values: Vec = Vec::new(); + let mut def_levels: Vec = Vec::new(); + let mut rep_levels: Vec = Vec::new(); + + assert_eq!(iter.max_def_level(), descr.max_def_level()); + assert_eq!(iter.max_rep_level(), descr.max_rep_level()); + + while let Ok(true) = iter.read_next() { + assert!(iter.has_next()); + if !iter.is_null() { + values.push(iter.current_value()); + } + def_levels.push(iter.current_def_level()); + rep_levels.push(iter.current_rep_level()); + } + + assert_eq!(values, expected_values); + assert_eq!(def_levels, expected_def_levels); + assert_eq!(rep_levels, expected_rep_levels); + } +} diff --git a/rust/src/parquet/schema/mod.rs b/rust/src/parquet/schema/mod.rs new file mode 100644 index 0000000000000..5319504964627 --- /dev/null +++ b/rust/src/parquet/schema/mod.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet schema definitions and methods to print and parse schema. +//! +//! # Example +//! +//! ```rust +//! use arrow::parquet::{ +//! basic::{LogicalType, Repetition, Type as PhysicalType}, +//! schema::{parser, printer, types::Type}, +//! }; +//! use std::rc::Rc; +//! +//! // Create the following schema: +//! // +//! // message schema { +//! // OPTIONAL BYTE_ARRAY a (UTF8); +//! // REQUIRED INT32 b; +//! // } +//! +//! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) +//! .with_logical_type(LogicalType::UTF8) +//! .with_repetition(Repetition::OPTIONAL) +//! .build() +//! .unwrap(); +//! +//! let field_b = Type::primitive_type_builder("b", PhysicalType::INT32) +//! .with_repetition(Repetition::REQUIRED) +//! .build() +//! .unwrap(); +//! +//! let schema = Type::group_type_builder("schema") +//! .with_fields(&mut vec![Rc::new(field_a), Rc::new(field_b)]) +//! .build() +//! .unwrap(); +//! +//! let mut buf = Vec::new(); +//! +//! // Print schema into buffer +//! printer::print_schema(&mut buf, &schema); +//! +//! // Parse schema from the string +//! let string_schema = String::from_utf8(buf).unwrap(); +//! let parsed_schema = parser::parse_message_type(&string_schema).unwrap(); +//! +//! assert_eq!(schema, parsed_schema); +//! ``` + +pub mod parser; +pub mod printer; +pub mod types; diff --git a/rust/src/parquet/schema/parser.rs b/rust/src/parquet/schema/parser.rs new file mode 100644 index 0000000000000..2890c84a755ba --- /dev/null +++ b/rust/src/parquet/schema/parser.rs @@ -0,0 +1,764 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet schema parser. +//! Provides methods to parse and validate string message type into Parquet +//! [`Type`](`::schema::types::Type`). +//! +//! # Example +//! +//! ```rust +//! use arrow::parquet::schema::parser::parse_message_type; +//! +//! let message_type = " +//! message spark_schema { +//! OPTIONAL BYTE_ARRAY a (UTF8); +//! REQUIRED INT32 b; +//! REQUIRED DOUBLE c; +//! REQUIRED BOOLEAN d; +//! OPTIONAL group e (LIST) { +//! REPEATED group list { +//! REQUIRED INT32 element; +//! } +//! } +//! } +//! "; +//! +//! 
let schema = parse_message_type(message_type).expect("Expected valid schema"); +//! println!("{:?}", schema); +//! ``` + +use std::rc::Rc; + +use crate::parquet::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::parquet::errors::{ParquetError, Result}; +use crate::parquet::schema::types::{Type, TypePtr}; + +/// Parses message type as string into a Parquet [`Type`](`::schema::types::Type`) which, +/// for example, could be used to extract individual columns. Returns Parquet general +/// error when parsing or validation fails. +pub fn parse_message_type<'a>(message_type: &'a str) -> Result { + let mut parser = Parser { + tokenizer: &mut Tokenizer::from_str(message_type), + }; + parser.parse_message_type() +} + +/// Tokenizer to split message type string into tokens that are separated using characters +/// defined in `is_schema_delim` method. Tokenizer also preserves delimiters as tokens. +/// Tokenizer provides Iterator interface to process tokens; it also allows to step back +/// to reprocess previous tokens. +struct Tokenizer<'a> { + // List of all tokens for a string + tokens: Vec<&'a str>, + // Current index of vector + index: usize, +} + +impl<'a> Tokenizer<'a> { + // Create tokenizer from message type string + pub fn from_str(string: &'a str) -> Self { + let vec = string + .split_whitespace() + .flat_map(|t| Self::split_token(t)) + .collect(); + Tokenizer { + tokens: vec, + index: 0, + } + } + + // List of all special characters in schema + fn is_schema_delim(c: char) -> bool { + c == ';' || c == '{' || c == '}' || c == '(' || c == ')' || c == '=' || c == ',' + } + + /// Splits string into tokens; input string can already be token or can contain + /// delimiters, e.g. required" -> Vec("required") and + /// "(UTF8);" -> Vec("(", "UTF8", ")", ";") + fn split_token(string: &str) -> Vec<&str> { + let mut buffer: Vec<&str> = Vec::new(); + let mut tail = string; + while let Some(index) = tail.find(Self::is_schema_delim) { + let (h, t) = tail.split_at(index); + if !h.is_empty() { + buffer.push(h); + } + buffer.push(&t[0..1]); + tail = &t[1..]; + } + if !tail.is_empty() { + buffer.push(tail); + } + buffer + } + + // Move pointer to a previous element + fn backtrack(&mut self) { + self.index -= 1; + } +} + +impl<'a> Iterator for Tokenizer<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if self.index < self.tokens.len() { + self.index += 1; + Some(self.tokens[self.index - 1]) + } else { + None + } + } +} + +/// Internal Schema parser. +/// Traverses message type using tokenizer and parses each group/primitive type +/// recursively. +struct Parser<'a> { + tokenizer: &'a mut Tokenizer<'a>, +} + +// Utility function to assert token on validity. +fn assert_token(token: Option<&str>, expected: &str) -> Result<()> { + match token { + Some(value) if value == expected => Ok(()), + Some(other) => Err(general_err!( + "Expected '{}', found token '{}'", + expected, + other + )), + None => Err(general_err!( + "Expected '{}', but no token found (None)", + expected + )), + } +} + +// Utility function to parse i32 or return general error. +fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result { + value + .ok_or(general_err!(not_found_msg)) + .and_then(|v| v.parse::().map_err(|_| general_err!(parse_fail_msg))) +} + +impl<'a> Parser<'a> { + // Entry function to parse message type, uses internal tokenizer. + fn parse_message_type(&mut self) -> Result { + // Check that message type starts with "message". 
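+        // For reference, the tokenizer feeding this parser splits on whitespace and on
+        // the schema delimiters while keeping the delimiters as tokens, e.g. (sketch):
+        //
+        //     let mut t = Tokenizer::from_str("required int32 a (INT_8);");
+        //     // yields: "required", "int32", "a", "(", "INT_8", ")", ";"
+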
+ match self.tokenizer.next() { + Some("message") => { + let name = self + .tokenizer + .next() + .ok_or(general_err!("Expected name, found None"))?; + let mut fields = self.parse_child_types()?; + Type::group_type_builder(name) + .with_fields(&mut fields) + .build() + } + _ => Err(general_err!("Message type does not start with 'message'")), + } + } + + // Parses child types for a current group type. + // This is only invoked on root and group types. + fn parse_child_types(&mut self) -> Result> { + assert_token(self.tokenizer.next(), "{")?; + let mut vec = Vec::new(); + while let Some(value) = self.tokenizer.next() { + if value == "}" { + break; + } else { + self.tokenizer.backtrack(); + vec.push(Rc::new(self.add_type()?)); + } + } + Ok(vec) + } + + fn add_type(&mut self) -> Result { + // Parse repetition + let repetition = self + .tokenizer + .next() + .ok_or(general_err!("Expected repetition, found None")) + .and_then(|v| v.to_uppercase().parse::())?; + + match self.tokenizer.next() { + Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(Some(repetition)), + Some(type_string) => { + let physical_type = type_string.to_uppercase().parse::()?; + self.add_primitive_type(repetition, physical_type) + } + None => Err(general_err!("Invalid type, could not extract next token")), + } + } + + fn add_group_type(&mut self, repetition: Option) -> Result { + // Parse name of the group type + let name = self + .tokenizer + .next() + .ok_or(general_err!("Expected name, found None"))?; + + // Parse logical type if exists + let logical_type = if let Some("(") = self.tokenizer.next() { + let tpe = self + .tokenizer + .next() + .ok_or(general_err!("Expected logical type, found None")) + .and_then(|v| v.to_uppercase().parse::())?; + assert_token(self.tokenizer.next(), ")")?; + tpe + } else { + self.tokenizer.backtrack(); + LogicalType::NONE + }; + + // Parse optional id + let id = if let Some("=") = self.tokenizer.next() { + self.tokenizer.next().and_then(|v| v.parse::().ok()) + } else { + self.tokenizer.backtrack(); + None + }; + + let mut fields = self.parse_child_types()?; + let mut builder = Type::group_type_builder(name) + .with_logical_type(logical_type) + .with_fields(&mut fields); + if let Some(rep) = repetition { + builder = builder.with_repetition(rep); + } + if let Some(id) = id { + builder = builder.with_id(id); + } + builder.build() + } + + fn add_primitive_type( + &mut self, + repetition: Repetition, + physical_type: PhysicalType, + ) -> Result { + // Read type length if the type is FIXED_LEN_BYTE_ARRAY. 
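+        // For example, a declaration such as
+        //     required fixed_len_byte_array(16) f;
+        // carries its length in parentheses right after the physical type and before
+        // the field name, so the length is consumed here before the name is read.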
+ let mut length: i32 = -1; + if physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY { + assert_token(self.tokenizer.next(), "(")?; + length = parse_i32( + self.tokenizer.next(), + "Expected length for FIXED_LEN_BYTE_ARRAY, found None", + "Failed to parse length for FIXED_LEN_BYTE_ARRAY", + )?; + assert_token(self.tokenizer.next(), ")")?; + } + + // Parse name of the primitive type + let name = self + .tokenizer + .next() + .ok_or(general_err!("Expected name, found None"))?; + + // Parse logical type + let (logical_type, precision, scale) = if let Some("(") = self.tokenizer.next() { + let tpe = self + .tokenizer + .next() + .ok_or(general_err!("Expected logical type, found None")) + .and_then(|v| v.to_uppercase().parse::())?; + + // Parse precision and scale for decimals + let mut precision: i32 = -1; + let mut scale: i32 = -1; + + if tpe == LogicalType::DECIMAL { + if let Some("(") = self.tokenizer.next() { + // Parse precision + precision = parse_i32( + self.tokenizer.next(), + "Expected precision, found None", + "Failed to parse precision for DECIMAL type", + )?; + + // Parse scale + scale = if let Some(",") = self.tokenizer.next() { + parse_i32( + self.tokenizer.next(), + "Expected scale, found None", + "Failed to parse scale for DECIMAL type", + )? + } else { + // Scale is not provided, set it to 0. + self.tokenizer.backtrack(); + 0 + }; + + assert_token(self.tokenizer.next(), ")")?; + } else { + self.tokenizer.backtrack(); + } + } + + assert_token(self.tokenizer.next(), ")")?; + (tpe, precision, scale) + } else { + self.tokenizer.backtrack(); + (LogicalType::NONE, -1, -1) + }; + + // Parse optional id + let id = if let Some("=") = self.tokenizer.next() { + self.tokenizer.next().and_then(|v| v.parse::().ok()) + } else { + self.tokenizer.backtrack(); + None + }; + assert_token(self.tokenizer.next(), ";")?; + + let mut builder = Type::primitive_type_builder(name, physical_type) + .with_repetition(repetition) + .with_logical_type(logical_type) + .with_length(length) + .with_precision(precision) + .with_scale(scale); + if let Some(id) = id { + builder = builder.with_id(id); + } + Ok(builder.build()?) 
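+        // Examples of primitive declarations accepted by this method (sketch):
+        //     required int32 a;
+        //     optional binary b (UTF8);
+        //     optional int32 c (DECIMAL(8, 3));                           // scale defaults to 0 if omitted
+        //     required fixed_len_byte_array(16) d (DECIMAL(38, 18)) = 7;  // trailing field id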
+ } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tokenize_empty_string() { + assert_eq!(Tokenizer::from_str("").next(), None); + } + + #[test] + fn test_tokenize_delimiters() { + let mut iter = Tokenizer::from_str(",;{}()="); + assert_eq!(iter.next(), Some(",")); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), Some("{")); + assert_eq!(iter.next(), Some("}")); + assert_eq!(iter.next(), Some("(")); + assert_eq!(iter.next(), Some(")")); + assert_eq!(iter.next(), Some("=")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_delimiters_with_whitespaces() { + let mut iter = Tokenizer::from_str(" , ; { } ( ) = "); + assert_eq!(iter.next(), Some(",")); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), Some("{")); + assert_eq!(iter.next(), Some("}")); + assert_eq!(iter.next(), Some("(")); + assert_eq!(iter.next(), Some(")")); + assert_eq!(iter.next(), Some("=")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_words() { + let mut iter = Tokenizer::from_str("abc def ghi jkl mno"); + assert_eq!(iter.next(), Some("abc")); + assert_eq!(iter.next(), Some("def")); + assert_eq!(iter.next(), Some("ghi")); + assert_eq!(iter.next(), Some("jkl")); + assert_eq!(iter.next(), Some("mno")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_backtrack() { + let mut iter = Tokenizer::from_str("abc;"); + assert_eq!(iter.next(), Some("abc")); + assert_eq!(iter.next(), Some(";")); + iter.backtrack(); + assert_eq!(iter.next(), Some(";")); + assert_eq!(iter.next(), None); + } + + #[test] + fn test_tokenize_message_type() { + let schema = " + message schema { + required int32 a; + optional binary c (UTF8); + required group d { + required int32 a; + optional binary c (UTF8); + } + required group e (LIST) { + repeated group list { + required int32 element; + } + } + } + "; + let mut iter = Tokenizer::from_str(schema); + let mut res = Vec::new(); + while let Some(token) = iter.next() { + res.push(token); + } + assert_eq!( + res, + vec![ + "message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c", + "(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a", + ";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group", + "e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32", + "element", ";", "}", "}", "}" + ] + ); + } + + #[test] + fn test_assert_token() { + assert!(assert_token(Some("a"), "a").is_ok()); + assert!(assert_token(Some("a"), "b").is_err()); + assert!(assert_token(None, "b").is_err()); + } + + #[test] + fn test_parse_message_type_invalid() { + let mut iter = Tokenizer::from_str("test"); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Message type does not start with 'message'" + ); + } + + #[test] + fn test_parse_message_type_no_name() { + let mut iter = Tokenizer::from_str("message"); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Parquet error: Expected name, found None" + ); + } + + #[test] + fn test_parse_message_type_fixed_byte_array() { + let schema = " + message schema { + REQUIRED FIXED_LEN_BYTE_ARRAY col; + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + let 
schema = " + message schema { + REQUIRED FIXED_LEN_BYTE_ARRAY(16) col; + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_ok()); + } + + #[test] + fn test_parse_message_type_decimal() { + // It is okay for decimal to omit precision and scale with right syntax. + // Here we test wrong syntax of decimal type + + // Invalid decimal syntax + let schema = " + message root { + optional int32 f1 (DECIMAL(); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Invalid decimal, need precision and scale + let schema = " + message root { + optional int32 f1 (DECIMAL()); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Invalid decimal because of `,` - has precision, needs scale + let schema = " + message root { + optional int32 f1 (DECIMAL(8,)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Invalid decimal because, we always require either precision or scale to be + // specified as part of logical type + let schema = " + message root { + optional int32 f3 (DECIMAL); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_err()); + + // Valid decimal (precision, scale) + let schema = " + message root { + optional int32 f1 (DECIMAL(8, 3)); + optional int32 f2 (DECIMAL(8)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let result = Parser { + tokenizer: &mut iter, + } + .parse_message_type(); + assert!(result.is_ok()); + } + + #[test] + fn test_parse_message_type_compare_1() { + let schema = " + message root { + optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); + optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18)); + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let expected = Type::group_type_builder("root") + .with_fields(&mut vec![ + Rc::new( + Type::primitive_type_builder("f1", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(LogicalType::DECIMAL) + .with_length(5) + .with_precision(9) + .with_scale(3) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("f2", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(LogicalType::DECIMAL) + .with_length(16) + .with_precision(38) + .with_scale(18) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(); + + assert_eq!(message, expected); + } + + #[test] + fn test_parse_message_type_compare_2() { + let schema = " + message root { + required group a0 { + optional group a1 (LIST) { + repeated binary a2 (UTF8); + } + + optional group b1 (LIST) { + repeated group b2 { + optional int32 b3; + optional double b4; + } + } + } + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let expected = Type::group_type_builder("root") + .with_fields(&mut vec![Rc::new( + Type::group_type_builder("a0") + .with_repetition(Repetition::REQUIRED) + .with_fields(&mut vec![ + Rc::new( + Type::group_type_builder("a1") + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::LIST) + 
.with_fields(&mut vec![Rc::new( + Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ), + Rc::new( + Type::group_type_builder("b1") + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::LIST) + .with_fields(&mut vec![Rc::new( + Type::group_type_builder("b2") + .with_repetition(Repetition::REPEATED) + .with_fields(&mut vec![ + Rc::new( + Type::primitive_type_builder( + "b3", + PhysicalType::INT32, + ) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder( + "b4", + PhysicalType::DOUBLE, + ) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + )]) + .build() + .unwrap(); + + assert_eq!(message, expected); + } + + #[test] + fn test_parse_message_type_compare_3() { + let schema = " + message root { + required int32 _1 (INT_8); + required int32 _2 (INT_16); + required float _3; + required double _4; + optional int32 _5 (DATE); + optional binary _6 (UTF8); + } + "; + let mut iter = Tokenizer::from_str(schema); + let message = Parser { + tokenizer: &mut iter, + } + .parse_message_type() + .unwrap(); + + let mut fields = vec![ + Rc::new( + Type::primitive_type_builder("_1", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_8) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_2", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_16) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_3", PhysicalType::FLOAT) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_4", PhysicalType::DOUBLE) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_5", PhysicalType::INT32) + .with_logical_type(LogicalType::DATE) + .build() + .unwrap(), + ), + Rc::new( + Type::primitive_type_builder("_6", PhysicalType::BYTE_ARRAY) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(), + ), + ]; + + let expected = Type::group_type_builder("root") + .with_fields(&mut fields) + .build() + .unwrap(); + assert_eq!(message, expected); + } +} diff --git a/rust/src/parquet/schema/printer.rs b/rust/src/parquet/schema/printer.rs new file mode 100644 index 0000000000000..d61f116eb9e70 --- /dev/null +++ b/rust/src/parquet/schema/printer.rs @@ -0,0 +1,467 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet schema printer. +//! Provides methods to print Parquet file schema and list file metadata. +//! +//! # Example +//! +//! ```rust +//! 
use arrow::parquet::{ +//! file::reader::{FileReader, SerializedFileReader}, +//! schema::printer::{print_file_metadata, print_parquet_metadata, print_schema}, +//! }; +//! use std::{fs::File, path::Path}; +//! +//! // Open a file +//! let path = Path::new("test.parquet"); +//! if let Ok(file) = File::open(&path) { +//! let reader = SerializedFileReader::new(file).unwrap(); +//! let parquet_metadata = reader.metadata(); +//! +//! print_parquet_metadata(&mut std::io::stdout(), &parquet_metadata); +//! print_file_metadata(&mut std::io::stdout(), &parquet_metadata.file_metadata()); +//! +//! print_schema( +//! &mut std::io::stdout(), +//! &parquet_metadata.file_metadata().schema(), +//! ); +//! } +//! ``` + +use std::{fmt, io}; + +use crate::parquet::basic::{LogicalType, Type as PhysicalType}; +use crate::parquet::file::metadata::{ + ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, +}; +use crate::parquet::schema::types::Type; + +/// Prints Parquet metadata [`ParquetMetaData`](`::file::metadata::ParquetMetaData`) +/// information. +#[allow(unused_must_use)] +pub fn print_parquet_metadata(out: &mut io::Write, metadata: &ParquetMetaData) { + print_file_metadata(out, &metadata.file_metadata()); + writeln!(out, ""); + writeln!(out, ""); + writeln!(out, "num of row groups: {}", metadata.num_row_groups()); + writeln!(out, "row groups:"); + writeln!(out, ""); + for (i, rg) in metadata.row_groups().iter().enumerate() { + writeln!(out, "row group {}:", i); + print_dashes(out, 80); + print_row_group_metadata(out, rg); + } +} + +/// Prints file metadata [`FileMetaData`](`::file::metadata::FileMetaData`) information. +#[allow(unused_must_use)] +pub fn print_file_metadata(out: &mut io::Write, file_metadata: &FileMetaData) { + writeln!(out, "version: {}", file_metadata.version()); + writeln!(out, "num of rows: {}", file_metadata.num_rows()); + if let Some(created_by) = file_metadata.created_by().as_ref() { + writeln!(out, "created by: {}", created_by); + } + let schema = file_metadata.schema(); + print_schema(out, schema); +} + +/// Prints Parquet [`Type`](`::schema::types::Type`) information. +#[allow(unused_must_use)] +pub fn print_schema(out: &mut io::Write, tp: &Type) { + // TODO: better if we can pass fmt::Write to Printer. + // But how can we make it to accept both io::Write & fmt::Write? 
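+    // Buffer the schema into an in-memory String (which implements fmt::Write) first,
+    // then push that String out through the io::Write handle in a single writeln! below.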
+ let mut s = String::new(); + { + let mut printer = Printer::new(&mut s); + printer.print(tp); + } + writeln!(out, "{}", s); +} + +#[allow(unused_must_use)] +fn print_row_group_metadata(out: &mut io::Write, rg_metadata: &RowGroupMetaData) { + writeln!(out, "total byte size: {}", rg_metadata.total_byte_size()); + writeln!(out, "num of rows: {}", rg_metadata.num_rows()); + writeln!(out, ""); + writeln!(out, "num of columns: {}", rg_metadata.num_columns()); + writeln!(out, "columns: "); + for (i, cc) in rg_metadata.columns().iter().enumerate() { + writeln!(out, ""); + writeln!(out, "column {}:", i); + print_dashes(out, 80); + print_column_chunk_metadata(out, cc); + } +} + +#[allow(unused_must_use)] +fn print_column_chunk_metadata(out: &mut io::Write, cc_metadata: &ColumnChunkMetaData) { + writeln!(out, "column type: {}", cc_metadata.column_type()); + writeln!(out, "column path: {}", cc_metadata.column_path()); + let encoding_strs: Vec<_> = cc_metadata + .encodings() + .iter() + .map(|e| format!("{}", e)) + .collect(); + writeln!(out, "encodings: {}", encoding_strs.join(" ")); + let file_path_str = match cc_metadata.file_path() { + None => "N/A", + Some(ref fp) => *fp, + }; + writeln!(out, "file path: {}", file_path_str); + writeln!(out, "file offset: {}", cc_metadata.file_offset()); + writeln!(out, "num of values: {}", cc_metadata.num_values()); + writeln!( + out, + "total compressed size (in bytes): {}", + cc_metadata.compressed_size() + ); + writeln!( + out, + "total uncompressed size (in bytes): {}", + cc_metadata.uncompressed_size() + ); + writeln!(out, "data page offset: {}", cc_metadata.data_page_offset()); + let index_page_offset_str = match cc_metadata.index_page_offset() { + None => "N/A".to_owned(), + Some(ipo) => ipo.to_string(), + }; + writeln!(out, "index page offset: {}", index_page_offset_str); + let dict_page_offset_str = match cc_metadata.dictionary_page_offset() { + None => "N/A".to_owned(), + Some(dpo) => dpo.to_string(), + }; + writeln!(out, "dictionary page offset: {}", dict_page_offset_str); + let statistics_str = match cc_metadata.statistics() { + None => "N/A".to_owned(), + Some(stats) => stats.to_string(), + }; + writeln!(out, "statistics: {}", statistics_str); + writeln!(out, ""); +} + +#[allow(unused_must_use)] +fn print_dashes(out: &mut io::Write, num: i32) { + for _ in 0..num { + write!(out, "-"); + } + writeln!(out, ""); +} + +const INDENT_WIDTH: i32 = 2; + +/// Struct for printing Parquet message type. +struct Printer<'a> { + output: &'a mut fmt::Write, + indent: i32, +} + +#[allow(unused_must_use)] +impl<'a> Printer<'a> { + fn new(output: &'a mut fmt::Write) -> Self { + Printer { output, indent: 0 } + } + + fn print_indent(&mut self) { + for _ in 0..self.indent { + write!(self.output, " "); + } + } +} + +#[allow(unused_must_use)] +impl<'a> Printer<'a> { + pub fn print(&mut self, tp: &Type) { + self.print_indent(); + match tp { + &Type::PrimitiveType { + ref basic_info, + physical_type, + type_length, + scale, + precision, + } => { + let phys_type_str = match physical_type { + PhysicalType::FIXED_LEN_BYTE_ARRAY => { + // We need to include length for fixed byte array + format!("{} ({})", physical_type, type_length) + } + _ => format!("{}", physical_type), + }; + // Also print logical type if it is available + let logical_type_str = match basic_info.logical_type() { + LogicalType::NONE => format!(""), + decimal @ LogicalType::DECIMAL => { + // For decimal type we should print precision and scale if they are > 0, e.g. 
+ // DECIMAL(9, 2) - DECIMAL(9) - DECIMAL + let precision_scale = match (precision, scale) { + (p, s) if p > 0 && s > 0 => format!(" ({}, {})", p, s), + (p, 0) if p > 0 => format!(" ({})", p), + _ => format!(""), + }; + format!(" ({}{})", decimal, precision_scale) + } + other_logical_type => format!(" ({})", other_logical_type), + }; + write!( + self.output, + "{} {} {}{};", + basic_info.repetition(), + phys_type_str, + basic_info.name(), + logical_type_str + ); + } + &Type::GroupType { + ref basic_info, + ref fields, + } => { + if basic_info.has_repetition() { + let r = basic_info.repetition(); + write!(self.output, "{} group {} ", r, basic_info.name()); + if basic_info.logical_type() != LogicalType::NONE { + write!(self.output, "({}) ", basic_info.logical_type()); + } + writeln!(self.output, "{{"); + } else { + writeln!(self.output, "message {} {{", basic_info.name()); + } + + self.indent += INDENT_WIDTH; + for c in fields { + self.print(&c); + writeln!(self.output, ""); + } + self.indent -= INDENT_WIDTH; + self.print_indent(); + write!(self.output, "}}"); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::rc::Rc; + + use crate::parquet::basic::{Repetition, Type as PhysicalType}; + use crate::parquet::schema::{parser::parse_message_type, types::Type}; + + fn assert_print_parse_message(message: Type) { + let mut s = String::new(); + { + let mut p = Printer::new(&mut s); + p.print(&message); + } + let parsed = parse_message_type(&s).unwrap(); + assert_eq!(message, parsed); + } + + #[test] + fn test_print_primitive_type() { + let mut s = String::new(); + { + let mut p = Printer::new(&mut s); + let foo = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .build() + .unwrap(); + p.print(&foo); + } + assert_eq!(&mut s, "REQUIRED INT32 foo (INT_32);"); + } + + #[test] + fn test_print_primitive_type_without_logical() { + let mut s = String::new(); + { + let mut p = Printer::new(&mut s); + let foo = Type::primitive_type_builder("foo", PhysicalType::DOUBLE) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(); + p.print(&foo); + } + assert_eq!(&mut s, "REQUIRED DOUBLE foo;"); + } + + #[test] + fn test_print_group_type() { + let mut s = String::new(); + { + let mut p = Printer::new(&mut s); + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .with_id(0) + .build(); + let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) + .with_logical_type(LogicalType::UTF8) + .with_id(1) + .build(); + let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::INTERVAL) + .with_length(12) + .with_id(2) + .build(); + let mut struct_fields = Vec::new(); + struct_fields.push(Rc::new(f1.unwrap())); + struct_fields.push(Rc::new(f2.unwrap())); + let foo = Type::group_type_builder("foo") + .with_repetition(Repetition::OPTIONAL) + .with_fields(&mut struct_fields) + .with_id(1) + .build() + .unwrap(); + let mut fields = Vec::new(); + fields.push(Rc::new(foo)); + fields.push(Rc::new(f3.unwrap())); + let message = Type::group_type_builder("schema") + .with_fields(&mut fields) + .with_id(2) + .build() + .unwrap(); + p.print(&message); + } + let expected = "message schema { + OPTIONAL group foo { + REQUIRED INT32 f1 (INT_32); + OPTIONAL BYTE_ARRAY f2 (UTF8); + } + REPEATED FIXED_LEN_BYTE_ARRAY 
(12) f3 (INTERVAL); +}"; + assert_eq!(&mut s, expected); + } + + #[test] + fn test_print_and_parse_primitive() { + let a2 = Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(); + + let a1 = Type::group_type_builder("a1") + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::LIST) + .with_fields(&mut vec![Rc::new(a2)]) + .build() + .unwrap(); + + let b3 = Type::primitive_type_builder("b3", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(); + + let b4 = Type::primitive_type_builder("b4", PhysicalType::DOUBLE) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(); + + let b2 = Type::group_type_builder("b2") + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::NONE) + .with_fields(&mut vec![Rc::new(b3), Rc::new(b4)]) + .build() + .unwrap(); + + let b1 = Type::group_type_builder("b1") + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::LIST) + .with_fields(&mut vec![Rc::new(b2)]) + .build() + .unwrap(); + + let a0 = Type::group_type_builder("a0") + .with_repetition(Repetition::REQUIRED) + .with_fields(&mut vec![Rc::new(a1), Rc::new(b1)]) + .build() + .unwrap(); + + let message = Type::group_type_builder("root") + .with_fields(&mut vec![Rc::new(a0)]) + .build() + .unwrap(); + + assert_print_parse_message(message); + } + + #[test] + fn test_print_and_parse_nested() { + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .build() + .unwrap(); + + let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::UTF8) + .build() + .unwrap(); + + let foo = Type::group_type_builder("foo") + .with_repetition(Repetition::OPTIONAL) + .with_fields(&mut vec![Rc::new(f1), Rc::new(f2)]) + .build() + .unwrap(); + + let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::INTERVAL) + .with_length(12) + .build() + .unwrap(); + + let message = Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new(foo), Rc::new(f3)]) + .build() + .unwrap(); + + assert_print_parse_message(message); + } + + #[test] + fn test_print_and_parse_decimal() { + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(9) + .with_scale(2) + .build() + .unwrap(); + + let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(9) + .with_scale(0) + .build() + .unwrap(); + + let message = Type::group_type_builder("schema") + .with_fields(&mut vec![Rc::new(f1), Rc::new(f2)]) + .build() + .unwrap(); + + assert_print_parse_message(message); + } +} diff --git a/rust/src/parquet/schema/types.rs b/rust/src/parquet/schema/types.rs new file mode 100644 index 0000000000000..90c767c093055 --- /dev/null +++ b/rust/src/parquet/schema/types.rs @@ -0,0 +1,1830 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Contains structs and methods to build Parquet schema and schema descriptors.
+
+use std::{collections::HashMap, convert::From, fmt, rc::Rc};
+
+use parquet_format::SchemaElement;
+
+use crate::parquet::basic::{LogicalType, Repetition, Type as PhysicalType};
+use crate::parquet::errors::{ParquetError, Result};
+
+// ----------------------------------------------------------------------
+// Parquet Type definitions
+
+/// Type alias for `Rc<Type>`.
+pub type TypePtr = Rc<Type>;
+/// Type alias for `Rc<SchemaDescriptor>`.
+pub type SchemaDescPtr = Rc<SchemaDescriptor>;
+/// Type alias for `Rc<ColumnDescriptor>`.
+pub type ColumnDescPtr = Rc<ColumnDescriptor>;
+
+/// Representation of a Parquet type.
+/// Used to describe primitive leaf fields and structs, including top-level schema.
+/// Note that the top-level schema type is represented using `GroupType` whose
+/// repetition is `None`.
+#[derive(Debug, PartialEq)]
+pub enum Type {
+    PrimitiveType {
+        basic_info: BasicTypeInfo,
+        physical_type: PhysicalType,
+        type_length: i32,
+        scale: i32,
+        precision: i32,
+    },
+    GroupType {
+        basic_info: BasicTypeInfo,
+        fields: Vec<TypePtr>,
+    },
+}
+
+impl Type {
+    /// Creates primitive type builder with provided field name and physical type.
+    pub fn primitive_type_builder(name: &str, physical_type: PhysicalType) -> PrimitiveTypeBuilder {
+        PrimitiveTypeBuilder::new(name, physical_type)
+    }
+
+    /// Creates group type builder with provided column name.
+    pub fn group_type_builder(name: &str) -> GroupTypeBuilder {
+        GroupTypeBuilder::new(name)
+    }
+
+    /// Returns [`BasicTypeInfo`] information about the type.
+    pub fn get_basic_info(&self) -> &BasicTypeInfo {
+        match *self {
+            Type::PrimitiveType { ref basic_info, .. } => &basic_info,
+            Type::GroupType { ref basic_info, .. } => &basic_info,
+        }
+    }
+
+    /// Returns this type's field name.
+    pub fn name(&self) -> &str {
+        self.get_basic_info().name()
+    }
+
+    /// Gets the fields from this group type.
+    /// Note that this will panic if called on a non-group type.
+    // TODO: should we return `&[&Type]` here?
+    pub fn get_fields(&self) -> &[TypePtr] {
+        match *self {
+            Type::GroupType { ref fields, .. } => &fields[..],
+            _ => panic!("Cannot call get_fields() on a non-group type"),
+        }
+    }
+
+    /// Gets physical type of this primitive type.
+    /// Note that this will panic if called on a non-primitive type.
+    pub fn get_physical_type(&self) -> PhysicalType {
+        match *self {
+            Type::PrimitiveType {
+                basic_info: _,
+                physical_type,
+                ..
+            } => physical_type,
+            _ => panic!("Cannot call get_physical_type() on a non-primitive type"),
+        }
+    }
+
+    /// Checks if `sub_type` schema is part of current schema.
+    /// This method can be used to check if projected columns are part of the root schema.
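+    /// Matching is structural: names, repetition and physical types must match
+    /// (recursively for nested groups, where the sub-group may carry only a subset of
+    /// the fields), while logical types, precision and scale are not compared.
+    ///
+    /// A minimal sketch of checking a projected column against a schema, assuming the
+    /// `arrow::parquet` paths used elsewhere in this module's examples:
+    ///
+    /// ```rust
+    /// use arrow::parquet::basic::{Repetition, Type as PhysicalType};
+    /// use arrow::parquet::schema::types::Type;
+    ///
+    /// let full = Type::primitive_type_builder("id", PhysicalType::INT32)
+    ///     .with_repetition(Repetition::REQUIRED)
+    ///     .build()
+    ///     .unwrap();
+    /// let projected = Type::primitive_type_builder("id", PhysicalType::INT32)
+    ///     .with_repetition(Repetition::REQUIRED)
+    ///     .build()
+    ///     .unwrap();
+    /// assert!(full.check_contains(&projected));
+    /// ```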
+ pub fn check_contains(&self, sub_type: &Type) -> bool { + // Names match, and repetitions match or not set for both + let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name() + && (self.is_schema() && sub_type.is_schema() + || !self.is_schema() + && !sub_type.is_schema() + && self.get_basic_info().repetition() + == sub_type.get_basic_info().repetition()); + + match *self { + Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => { + self.get_physical_type() == sub_type.get_physical_type() + } + Type::GroupType { .. } if basic_match && sub_type.is_group() => { + // build hashmap of name -> TypePtr + let mut field_map = HashMap::new(); + for field in self.get_fields() { + field_map.insert(field.name(), field); + } + + for field in sub_type.get_fields() { + if !field_map + .get(field.name()) + .map(|tpe| tpe.check_contains(field)) + .unwrap_or(false) + { + return false; + } + } + true + } + _ => false, + } + } + + /// Returns `true` if this type is a primitive type, `false` otherwise. + pub fn is_primitive(&self) -> bool { + match *self { + Type::PrimitiveType { .. } => true, + _ => false, + } + } + + /// Returns `true` if this type is a group type, `false` otherwise. + pub fn is_group(&self) -> bool { + match *self { + Type::GroupType { .. } => true, + _ => false, + } + } + + /// Returns `true` if this type is the top-level schema type (message type). + pub fn is_schema(&self) -> bool { + match *self { + Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(), + _ => false, + } + } +} + +/// A builder for primitive types. All attributes are optional +/// except the name and physical type. +/// Note that if not specified explicitly, `Repetition::OPTIONAL` is used. +pub struct PrimitiveTypeBuilder<'a> { + name: &'a str, + repetition: Repetition, + physical_type: PhysicalType, + logical_type: LogicalType, + length: i32, + precision: i32, + scale: i32, + id: Option, +} + +impl<'a> PrimitiveTypeBuilder<'a> { + /// Creates new primitive type builder with provided field name and physical type. + pub fn new(name: &'a str, physical_type: PhysicalType) -> Self { + Self { + name, + repetition: Repetition::OPTIONAL, + physical_type, + logical_type: LogicalType::NONE, + length: -1, + precision: -1, + scale: -1, + id: None, + } + } + + /// Sets [`Repetition`](`::basic::Repetition`) for this field and returns itself. + pub fn with_repetition(mut self, repetition: Repetition) -> Self { + self.repetition = repetition; + self + } + + /// Sets [`LogicalType`](`::basic::LogicalType`) for this field and returns itself. + pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { + self.logical_type = logical_type; + self + } + + /// Sets type length and returns itself. + /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because + /// they maintain fixed size underlying byte array. + /// By default, value is `0`. + pub fn with_length(mut self, length: i32) -> Self { + self.length = length; + self + } + + /// Sets precision for Parquet DECIMAL physical type and returns itself. + /// By default, it equals to `0` and used only for decimal context. + pub fn with_precision(mut self, precision: i32) -> Self { + self.precision = precision; + self + } + + /// Sets scale for Parquet DECIMAL physical type and returns itself. + /// By default, it equals to `0` and used only for decimal context. 
+ pub fn with_scale(mut self, scale: i32) -> Self { + self.scale = scale; + self + } + + /// Sets optional field id and returns itself. + pub fn with_id(mut self, id: i32) -> Self { + self.id = Some(id); + self + } + + /// Creates a new `PrimitiveType` instance from the collected attributes. + /// Returns `Err` in case of any building conditions are not met. + pub fn build(self) -> Result { + let basic_info = BasicTypeInfo { + name: String::from(self.name), + repetition: Some(self.repetition), + logical_type: self.logical_type, + id: self.id, + }; + + // Check length before logical type, since it is used for logical type validation. + if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 { + return Err(general_err!( + "Invalid FIXED_LEN_BYTE_ARRAY length: {}", + self.length + )); + } + + match self.logical_type { + LogicalType::NONE => {} + LogicalType::UTF8 | LogicalType::BSON | LogicalType::JSON => { + if self.physical_type != PhysicalType::BYTE_ARRAY { + return Err(general_err!( + "{} can only annotate BYTE_ARRAY fields", + self.logical_type + )); + } + } + LogicalType::DECIMAL => { + match self.physical_type { + PhysicalType::INT32 + | PhysicalType::INT64 + | PhysicalType::BYTE_ARRAY + | PhysicalType::FIXED_LEN_BYTE_ARRAY => (), + _ => { + return Err(general_err!( + "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED" + )); + } + } + + // Precision is required and must be a non-zero positive integer. + if self.precision < 1 { + return Err(general_err!( + "Invalid DECIMAL precision: {}", + self.precision + )); + } + + // Scale must be zero or a positive integer less than the precision. + if self.scale < 0 { + return Err(general_err!("Invalid DECIMAL scale: {}", self.scale)); + } + + if self.scale >= self.precision { + return Err(general_err!( + "Invalid DECIMAL: scale ({}) cannot be greater than or equal to precision \ + ({})", + self.scale, + self.precision + )); + } + + // Check precision and scale based on physical type limitations. 
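+                // INT32 can hold at most 9 decimal digits, INT64 at most 18, and
+                // FIXED_LEN_BYTE_ARRAY(n) at most floor(log10(2^(8n - 1) - 1)) digits;
+                // plain BYTE_ARRAY is not limited.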
+ match self.physical_type { + PhysicalType::INT32 => { + if self.precision > 9 { + return Err(general_err!( + "Cannot represent INT32 as DECIMAL with precision {}", + self.precision + )); + } + } + PhysicalType::INT64 => { + if self.precision > 18 { + return Err(general_err!( + "Cannot represent INT64 as DECIMAL with precision {}", + self.precision + )); + } + } + PhysicalType::FIXED_LEN_BYTE_ARRAY => { + let max_precision = + (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32; + + if self.precision > max_precision { + return Err(general_err!( + "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \ + precision {}", + self.length, + self.precision + )); + } + } + _ => (), // For BYTE_ARRAY precision is not limited + } + } + LogicalType::DATE + | LogicalType::TIME_MILLIS + | LogicalType::UINT_8 + | LogicalType::UINT_16 + | LogicalType::UINT_32 + | LogicalType::INT_8 + | LogicalType::INT_16 + | LogicalType::INT_32 => { + if self.physical_type != PhysicalType::INT32 { + return Err(general_err!( + "{} can only annotate INT32", + self.logical_type + )); + } + } + LogicalType::TIME_MICROS + | LogicalType::TIMESTAMP_MILLIS + | LogicalType::TIMESTAMP_MICROS + | LogicalType::UINT_64 + | LogicalType::INT_64 => { + if self.physical_type != PhysicalType::INT64 { + return Err(general_err!( + "{} can only annotate INT64", + self.logical_type + )); + } + } + LogicalType::INTERVAL => { + if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 { + return Err(general_err!( + "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + )); + } + } + LogicalType::ENUM => { + if self.physical_type != PhysicalType::BYTE_ARRAY { + return Err(general_err!("ENUM can only annotate BYTE_ARRAY fields")); + } + } + _ => { + return Err(general_err!( + "{} cannot be applied to a primitive type", + self.logical_type + )); + } + } + + Ok(Type::PrimitiveType { + basic_info, + physical_type: self.physical_type, + type_length: self.length, + scale: self.scale, + precision: self.precision, + }) + } +} + +/// A builder for group types. All attributes are optional except the name. +/// Note that if not specified explicitly, `None` is used as the repetition of the group, +/// which means it is a root (message) type. +pub struct GroupTypeBuilder<'a> { + name: &'a str, + repetition: Option, + logical_type: LogicalType, + fields: Vec, + id: Option, +} + +impl<'a> GroupTypeBuilder<'a> { + /// Creates new group type builder with provided field name. + pub fn new(name: &'a str) -> Self { + Self { + name, + repetition: None, + logical_type: LogicalType::NONE, + fields: Vec::new(), + id: None, + } + } + + /// Sets [`Repetition`](`::basic::Repetition`) for this field and returns itself. + pub fn with_repetition(mut self, repetition: Repetition) -> Self { + self.repetition = Some(repetition); + self + } + + /// Sets [`LogicalType`](`::basic::LogicalType`) for this field and returns itself. + pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { + self.logical_type = logical_type; + self + } + + /// Sets a list of fields that should be child nodes of this field. + /// Returns updated self. + pub fn with_fields(mut self, fields: &mut Vec) -> Self { + self.fields.append(fields); + self + } + + /// Sets optional field id and returns itself. + pub fn with_id(mut self, id: i32) -> Self { + self.id = Some(id); + self + } + + /// Creates a new `GroupType` instance from the gathered attributes. 
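+    /// There is currently nothing to validate for a group, so this always returns `Ok`;
+    /// the `Result` signature matches `PrimitiveTypeBuilder::build`.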
+ pub fn build(self) -> Result { + let basic_info = BasicTypeInfo { + name: String::from(self.name), + repetition: self.repetition, + logical_type: self.logical_type, + id: self.id, + }; + Ok(Type::GroupType { + basic_info, + fields: self.fields, + }) + } +} + +/// Basic type info. This contains information such as the name of the type, +/// the repetition level, the logical type and the kind of the type (group, primitive). +#[derive(Debug, PartialEq)] +pub struct BasicTypeInfo { + name: String, + repetition: Option, + logical_type: LogicalType, + id: Option, +} + +impl BasicTypeInfo { + /// Returns field name. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns `true` if type has repetition field set, `false` otherwise. + /// This is mostly applied to group type, because primitive type always has + /// repetition set. + pub fn has_repetition(&self) -> bool { + self.repetition.is_some() + } + + /// Returns [`Repetition`](`::basic::Repetition`) value for the type. + pub fn repetition(&self) -> Repetition { + assert!(self.repetition.is_some()); + self.repetition.unwrap() + } + + /// Returns [`LogicalType`](`::basic::LogicalType`) value for the type. + pub fn logical_type(&self) -> LogicalType { + self.logical_type + } + + /// Returns `true` if id is set, `false` otherwise. + pub fn has_id(&self) -> bool { + self.id.is_some() + } + + /// Returns id value for the type. + pub fn id(&self) -> i32 { + assert!(self.id.is_some()); + self.id.unwrap() + } +} + +// ---------------------------------------------------------------------- +// Parquet descriptor definitions + +/// Represents a path in a nested schema +#[derive(Clone, PartialEq, Debug, Eq, Hash)] +pub struct ColumnPath { + parts: Vec, +} + +impl ColumnPath { + /// Creates new column path from vector of field names. + pub fn new(parts: Vec) -> Self { + ColumnPath { parts } + } + + /// Returns string representation of this column path. + /// ```rust + /// use arrow::parquet::schema::types::ColumnPath; + /// + /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]); + /// assert_eq!(&path.string(), "a.b.c"); + /// ``` + pub fn string(&self) -> String { + self.parts.join(".") + } +} + +impl fmt::Display for ColumnPath { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self.string()) + } +} + +impl From> for ColumnPath { + fn from(parts: Vec) -> Self { + ColumnPath { parts } + } +} + +impl<'a> From<&'a str> for ColumnPath { + fn from(single_path: &str) -> Self { + let s = String::from(single_path); + ColumnPath::from(s) + } +} + +impl From for ColumnPath { + fn from(single_path: String) -> Self { + let mut v = vec![]; + v.push(single_path); + ColumnPath { parts: v } + } +} + +impl AsRef<[String]> for ColumnPath { + fn as_ref(&self) -> &[String] { + &self.parts + } +} + +/// A descriptor for leaf-level primitive columns. +/// This encapsulates information such as definition and repetition levels and is used to +/// re-assemble nested data. +pub struct ColumnDescriptor { + // The "leaf" primitive type of this column + primitive_type: TypePtr, + + // The root type of this column. For instance, if the column is "a.b.c.d", then the + // primitive type is 'd' while the root_type is 'a'. + // + // NOTE: this is sometimes `None` for the convenience of testing. It should NEVER be + // `None` when running in production. 
+ root_type: Option, + + // The maximum definition level for this column + max_def_level: i16, + + // The maximum repetition level for this column + max_rep_level: i16, + + // The path of this column. For instance, "a.b.c.d". + path: ColumnPath, +} + +impl ColumnDescriptor { + /// Creates new descriptor for leaf-level column. + pub fn new( + primitive_type: TypePtr, + root_type: Option, + max_def_level: i16, + max_rep_level: i16, + path: ColumnPath, + ) -> Self { + Self { + primitive_type, + root_type, + max_def_level, + max_rep_level, + path, + } + } + + /// Returns maximum definition level for this column. + pub fn max_def_level(&self) -> i16 { + self.max_def_level + } + + /// Returns maximum repetition level for this column. + pub fn max_rep_level(&self) -> i16 { + self.max_rep_level + } + + /// Returns [`ColumnPath`] for this column. + pub fn path(&self) -> &ColumnPath { + &self.path + } + + /// Returns self type [`Type`](`::schema::types::Type`) for this leaf column. + pub fn self_type(&self) -> &Type { + self.primitive_type.as_ref() + } + + /// Returns root [`Type`](`::schema::types::Type`) (most top-level parent field) for + /// this leaf column. + pub fn root_type(&self) -> &Type { + assert!(self.root_type.is_some()); + self.root_type.as_ref().unwrap() + } + + /// Returns column name. + pub fn name(&self) -> &str { + self.primitive_type.name() + } + + /// Returns [`LogicalType`](`::basic::LogicalType`) for this column. + pub fn logical_type(&self) -> LogicalType { + self.primitive_type.get_basic_info().logical_type() + } + + /// Returns physical type for this column. + /// Note that it will panic if called on a non-primitive type. + pub fn physical_type(&self) -> PhysicalType { + match self.primitive_type.as_ref() { + &Type::PrimitiveType { physical_type, .. } => physical_type, + _ => panic!("Expected primitive type!"), + } + } + + /// Returns type length for this column. + /// Note that it will panic if called on a non-primitive type. + pub fn type_length(&self) -> i32 { + match self.primitive_type.as_ref() { + &Type::PrimitiveType { type_length, .. } => type_length, + _ => panic!("Expected primitive type!"), + } + } + + /// Returns type precision for this column. + /// Note that it will panic if called on a non-primitive type. + pub fn type_precision(&self) -> i32 { + match self.primitive_type.as_ref() { + &Type::PrimitiveType { precision, .. } => precision, + _ => panic!("Expected primitive type!"), + } + } + + /// Returns type scale for this column. + /// Note that it will panic if called on a non-primitive type. + pub fn type_scale(&self) -> i32 { + match self.primitive_type.as_ref() { + &Type::PrimitiveType { scale, .. } => scale, + _ => panic!("Expected primitive type!"), + } + } +} + +/// A schema descriptor. This encapsulates the top-level schemas for all the columns, +/// as well as all descriptors for all the primitive columns. +pub struct SchemaDescriptor { + // The top-level schema (the "message" type). + // This must be a `GroupType` where each field is a root column type in the schema. + schema: TypePtr, + + // All the descriptors for primitive columns in this schema, constructed from + // `schema` in DFS order. + leaves: Vec, + + // Mapping from a leaf column's index to the root column type that it + // comes from. For instance: the leaf `a.b.c.d` would have a link back to `a`: + // -- a <-----+ + // -- -- b | + // -- -- -- c | + // -- -- -- -- d + leaf_to_base: HashMap, +} + +impl SchemaDescriptor { + /// Creates new schema descriptor from Parquet schema. 
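+    /// The schema is walked depth-first; every primitive leaf becomes a
+    /// [`ColumnDescriptor`] with its dotted path and maximum definition/repetition
+    /// levels already computed.
+    ///
+    /// A small sketch, assuming the `arrow::parquet` paths used in this module's other
+    /// examples:
+    ///
+    /// ```rust
+    /// use std::rc::Rc;
+    /// use arrow::parquet::schema::parser::parse_message_type;
+    /// use arrow::parquet::schema::types::SchemaDescriptor;
+    ///
+    /// let message = "
+    ///     message schema {
+    ///         REQUIRED INT32 a;
+    ///         OPTIONAL group b {
+    ///             OPTIONAL BYTE_ARRAY c (UTF8);
+    ///         }
+    ///     }
+    /// ";
+    /// let descr = SchemaDescriptor::new(Rc::new(parse_message_type(message).unwrap()));
+    ///
+    /// assert_eq!(descr.num_columns(), 2);
+    /// assert_eq!(descr.column(1).path().string(), "b.c");
+    /// assert_eq!(descr.column(1).max_def_level(), 2);
+    /// assert_eq!(descr.column(1).max_rep_level(), 0);
+    /// ```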
+ pub fn new(tp: TypePtr) -> Self { + assert!(tp.is_group(), "SchemaDescriptor should take a GroupType"); + let mut leaves = vec![]; + let mut leaf_to_base = HashMap::new(); + for f in tp.get_fields() { + let mut path = vec![]; + build_tree( + f.clone(), + tp.clone(), + f.clone(), + 0, + 0, + &mut leaves, + &mut leaf_to_base, + &mut path, + ); + } + + Self { + schema: tp, + leaves, + leaf_to_base, + } + } + + /// Returns [`ColumnDescriptor`] for a field position. + pub fn column(&self, i: usize) -> ColumnDescPtr { + assert!( + i < self.leaves.len(), + "Index out of bound: {} not in [0, {})", + i, + self.leaves.len() + ); + self.leaves[i].clone() + } + + /// Returns slice of [`ColumnDescriptor`]. + pub fn columns(&self) -> &[ColumnDescPtr] { + &self.leaves + } + + /// Returns number of leaf-level columns. + pub fn num_columns(&self) -> usize { + self.leaves.len() + } + + /// Returns column root [`Type`](`::schema::types::Type`) for a field position. + pub fn get_column_root(&self, i: usize) -> &Type { + assert!( + i < self.leaves.len(), + "Index out of bound: {} not in [0, {})", + i, + self.leaves.len() + ); + let result = self.leaf_to_base.get(&i); + assert!( + result.is_some(), + "Expected a value for index {} but found None", + i + ); + result.unwrap().as_ref() + } + + /// Returns schema as [`Type`](`::schema::types::Type`). + pub fn root_schema(&self) -> &Type { + self.schema.as_ref() + } + + /// Returns schema name. + pub fn name(&self) -> &str { + self.schema.name() + } +} + +fn build_tree( + tp: TypePtr, + root_tp: TypePtr, + base_tp: TypePtr, + mut max_rep_level: i16, + mut max_def_level: i16, + leaves: &mut Vec, + leaf_to_base: &mut HashMap, + path_so_far: &mut Vec, +) { + assert!(tp.get_basic_info().has_repetition()); + + path_so_far.push(String::from(tp.name())); + match tp.get_basic_info().repetition() { + Repetition::OPTIONAL => { + max_def_level += 1; + } + Repetition::REPEATED => { + max_def_level += 1; + max_rep_level += 1; + } + _ => {} + } + + match tp.as_ref() { + &Type::PrimitiveType { .. } => { + let mut path: Vec = vec![]; + path.extend_from_slice(&path_so_far[..]); + leaves.push(Rc::new(ColumnDescriptor::new( + tp.clone(), + Some(root_tp), + max_def_level, + max_rep_level, + ColumnPath::new(path), + ))); + leaf_to_base.insert(leaves.len() - 1, base_tp); + } + &Type::GroupType { ref fields, .. } => { + for f in fields { + build_tree( + f.clone(), + root_tp.clone(), + base_tp.clone(), + max_rep_level, + max_def_level, + leaves, + leaf_to_base, + path_so_far, + ); + let idx = path_so_far.len() - 1; + path_so_far.remove(idx); + } + } + } +} + +/// Method to convert from Thrift. +pub fn from_thrift(elements: &[SchemaElement]) -> Result { + let mut index = 0; + let mut schema_nodes = Vec::new(); + while index < elements.len() { + let t = from_thrift_helper(elements, index)?; + index = t.0; + schema_nodes.push(t.1); + } + if schema_nodes.len() != 1 { + return Err(general_err!( + "Expected exactly one root node, but found {}", + schema_nodes.len() + )); + } + + Ok(schema_nodes.remove(0)) +} + +/// Constructs a new Type from the `elements`, starting at index `index`. +/// The first result is the starting index for the next Type after this one. If it is +/// equal to `elements.len()`, then this Type is the last one. +/// The second result is the result Type. +fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, TypePtr)> { + // Whether or not the current node is root (message type). + // There is only one message type node in the schema tree. 
+ let is_root_node = index == 0; + + if index > elements.len() { + return Err(general_err!( + "Index out of bound, index = {}, len = {}", + index, + elements.len() + )); + } + let logical_type = LogicalType::from(elements[index].converted_type); + let field_id = elements[index].field_id; + match elements[index].num_children { + // From parquet-format: + // The children count is used to construct the nested relationship. + // This field is not set when the element is a primitive type + // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we + // have to handle this case too. + None | Some(0) => { + // primitive type + if elements[index].repetition_type.is_none() { + return Err(general_err!( + "Repetition level must be defined for a primitive type" + )); + } + let repetition = Repetition::from(elements[index].repetition_type.unwrap()); + let physical_type = PhysicalType::from(elements[index].type_.unwrap()); + let length = elements[index].type_length.unwrap_or(-1); + let scale = elements[index].scale.unwrap_or(-1); + let precision = elements[index].precision.unwrap_or(-1); + let name = &elements[index].name; + let mut builder = Type::primitive_type_builder(name, physical_type) + .with_repetition(repetition) + .with_logical_type(logical_type) + .with_length(length) + .with_precision(precision) + .with_scale(scale); + if let Some(id) = field_id { + builder = builder.with_id(id); + } + Ok((index + 1, Rc::new(builder.build()?))) + } + Some(n) => { + let repetition = elements[index].repetition_type.map(|r| Repetition::from(r)); + let mut fields = vec![]; + let mut next_index = index + 1; + for _ in 0..n { + let child_result = from_thrift_helper(elements, next_index as usize)?; + next_index = child_result.0; + fields.push(child_result.1); + } + + let mut builder = Type::group_type_builder(&elements[index].name) + .with_logical_type(logical_type) + .with_fields(&mut fields); + if let Some(rep) = repetition { + // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or REPEATED + // for root node. + // + // We only set repetition for group types that are not top-level message type. + // According to parquet-format: + // Root of the schema does not have a repetition_type. + // All other types must have one. + if !is_root_node { + builder = builder.with_repetition(rep); + } + } + if let Some(id) = field_id { + builder = builder.with_id(id); + } + Ok((next_index, Rc::new(builder.build().unwrap()))) + } + } +} + +/// Method to convert to Thrift. +pub fn to_thrift(schema: &Type) -> Result> { + if !schema.is_group() { + return Err(general_err!("Root schema must be Group type")); + } + let mut elements: Vec = Vec::new(); + to_thrift_helper(schema, &mut elements); + Ok(elements) +} + +/// Constructs list of `SchemaElement` from the schema using depth-first traversal. +/// Here we assume that schema is always valid and starts with group type. 
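+/// Each group element stores its child count in `num_children` and is followed
+/// immediately by the elements of its children, which is what allows `from_thrift` to
+/// rebuild the tree from the flat list.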
+fn to_thrift_helper(schema: &Type, elements: &mut Vec) { + match *schema { + Type::PrimitiveType { + ref basic_info, + physical_type, + type_length, + scale, + precision, + } => { + let element = SchemaElement { + type_: Some(physical_type.into()), + type_length: if type_length >= 0 { + Some(type_length) + } else { + None + }, + repetition_type: Some(basic_info.repetition().into()), + name: basic_info.name().to_owned(), + num_children: None, + converted_type: basic_info.logical_type().into(), + scale: if scale >= 0 { Some(scale) } else { None }, + precision: if precision >= 0 { + Some(precision) + } else { + None + }, + field_id: if basic_info.has_id() { + Some(basic_info.id()) + } else { + None + }, + logical_type: None, + }; + + elements.push(element); + } + Type::GroupType { + ref basic_info, + ref fields, + } => { + let repetition = if basic_info.has_repetition() { + Some(basic_info.repetition().into()) + } else { + None + }; + + let element = SchemaElement { + type_: None, + type_length: None, + repetition_type: repetition, + name: basic_info.name().to_owned(), + num_children: Some(fields.len() as i32), + converted_type: basic_info.logical_type().into(), + scale: None, + precision: None, + field_id: if basic_info.has_id() { + Some(basic_info.id()) + } else { + None + }, + logical_type: None, + }; + + elements.push(element); + + // Add child elements for a group + for field in fields { + to_thrift_helper(field, elements); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::error::Error; + + use crate::parquet::schema::parser::parse_message_type; + + #[test] + fn test_primitive_type() { + let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_logical_type(LogicalType::INT_32) + .with_id(0) + .build(); + assert!(result.is_ok()); + + if let Ok(tp) = result { + assert!(tp.is_primitive()); + assert!(!tp.is_group()); + let basic_info = tp.get_basic_info(); + assert_eq!(basic_info.repetition(), Repetition::OPTIONAL); + assert_eq!(basic_info.logical_type(), LogicalType::INT_32); + assert_eq!(basic_info.id(), 0); + match tp { + Type::PrimitiveType { physical_type, .. 
} => { + assert_eq!(physical_type, PhysicalType::INT32); + } + _ => assert!(false), + } + } + + // Test illegal inputs + result = Type::primitive_type_builder("foo", PhysicalType::INT64) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::BSON) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "BSON can only annotate BYTE_ARRAY fields"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT96) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(-1) + .with_scale(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(-1) + .with_scale(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "Invalid DECIMAL precision: -1"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(0) + .with_scale(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "Invalid DECIMAL precision: 0"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(1) + .with_scale(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "Invalid DECIMAL scale: -1"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(1) + .with_scale(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "Invalid DECIMAL: scale (2) cannot be greater than or equal to precision (1)" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(18) + .with_scale(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "Cannot represent INT32 as DECIMAL with precision 18" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT64) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_precision(32) + .with_scale(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "Cannot represent INT64 as DECIMAL with precision 32" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_length(5) + .with_precision(12) + .with_scale(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT64) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::UINT_8) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), 
"UINT_8 can only annotate INT32"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::TIME_MICROS) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "TIME_MICROS can only annotate INT64"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INTERVAL) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INTERVAL) + .with_length(1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + e.description(), + "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + ); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::ENUM) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "ENUM can only annotate BYTE_ARRAY fields"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::MAP) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "MAP cannot be applied to a primitive type"); + } + + result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::DECIMAL) + .with_length(-1) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!(e.description(), "Invalid FIXED_LEN_BYTE_ARRAY length: -1"); + } + } + + #[test] + fn test_group_type() { + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .with_logical_type(LogicalType::INT_32) + .with_id(0) + .build(); + assert!(f1.is_ok()); + let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) + .with_logical_type(LogicalType::UTF8) + .with_id(1) + .build(); + assert!(f2.is_ok()); + + let mut fields = vec![]; + fields.push(Rc::new(f1.unwrap())); + fields.push(Rc::new(f2.unwrap())); + + let result = Type::group_type_builder("foo") + .with_repetition(Repetition::REPEATED) + .with_fields(&mut fields) + .with_id(1) + .build(); + assert!(result.is_ok()); + + let tp = result.unwrap(); + let basic_info = tp.get_basic_info(); + assert!(tp.is_group()); + assert!(!tp.is_primitive()); + assert_eq!(basic_info.repetition(), Repetition::REPEATED); + assert_eq!(basic_info.logical_type(), LogicalType::NONE); + assert_eq!(basic_info.id(), 1); + assert_eq!(tp.get_fields().len(), 2); + assert_eq!(tp.get_fields()[0].name(), "f1"); + assert_eq!(tp.get_fields()[1].name(), "f2"); + } + + #[test] + fn test_column_descriptor() { + let result = test_column_descriptor_helper(); + assert!( + result.is_ok(), + "Expected result to be OK but got err:\n {}", + result.unwrap_err() + ); + } + + fn test_column_descriptor_helper() -> Result<()> { + let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY) + .with_logical_type(LogicalType::UTF8) + .build()?; + + let root_tp = Type::group_type_builder("root") + .with_logical_type(LogicalType::LIST) + .build() + .unwrap(); + let root_tp_rc = Rc::new(root_tp); + + let descr = 
ColumnDescriptor::new( + Rc::new(tp), + Some(root_tp_rc.clone()), + 4, + 1, + ColumnPath::from("name"), + ); + + assert_eq!(descr.path(), &ColumnPath::from("name")); + assert_eq!(descr.logical_type(), LogicalType::UTF8); + assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY); + assert_eq!(descr.max_def_level(), 4); + assert_eq!(descr.max_rep_level(), 1); + assert_eq!(descr.name(), "name"); + assert_eq!(descr.type_length(), -1); + assert_eq!(descr.type_precision(), -1); + assert_eq!(descr.type_scale(), -1); + assert_eq!(descr.root_type(), root_tp_rc.as_ref()); + + Ok(()) + } + + #[test] + fn test_schema_descriptor() { + let result = test_schema_descriptor_helper(); + assert!( + result.is_ok(), + "Expected result to be OK but got err:\n {}", + result.unwrap_err() + ); + } + + // A helper fn to avoid handling the results from type creation + fn test_schema_descriptor_helper() -> Result<()> { + let mut fields = vec![]; + + let inta = Type::primitive_type_builder("a", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_32) + .build()?; + fields.push(Rc::new(inta)); + let intb = Type::primitive_type_builder("b", PhysicalType::INT64) + .with_logical_type(LogicalType::INT_64) + .build()?; + fields.push(Rc::new(intb)); + let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::UTF8) + .build()?; + fields.push(Rc::new(intc)); + + // 3-level list encoding + let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(LogicalType::INT_64) + .build()?; + let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?; + let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32) + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::INT_32) + .build()?; + let list = Type::group_type_builder("records") + .with_repetition(Repetition::REPEATED) + .with_logical_type(LogicalType::LIST) + .with_fields(&mut vec![Rc::new(item1), Rc::new(item2), Rc::new(item3)]) + .build()?; + let bag = Type::group_type_builder("bag") + .with_repetition(Repetition::OPTIONAL) + .with_fields(&mut vec![Rc::new(list)]) + .build()?; + fields.push(Rc::new(bag)); + + let schema = Type::group_type_builder("schema") + .with_repetition(Repetition::REPEATED) + .with_fields(&mut fields) + .build()?; + let descr = SchemaDescriptor::new(Rc::new(schema)); + + let nleaves = 6; + assert_eq!(descr.num_columns(), nleaves); + + // mdef mrep + // required int32 a 0 0 + // optional int64 b 1 0 + // repeated byte_array c 1 1 + // optional group bag 1 0 + // repeated group records 2 1 + // required int64 item1 2 1 + // optional boolean item2 3 1 + // repeated int32 item3 3 2 + let ex_max_def_levels = vec![0, 1, 1, 2, 3, 3]; + let ex_max_rep_levels = vec![0, 0, 1, 1, 1, 2]; + + for i in 0..nleaves { + let col = descr.column(i); + assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{}", i); + assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{}", i); + } + + assert_eq!(descr.column(0).path().string(), "a"); + assert_eq!(descr.column(1).path().string(), "b"); + assert_eq!(descr.column(2).path().string(), "c"); + assert_eq!(descr.column(3).path().string(), "bag.records.item1"); + assert_eq!(descr.column(4).path().string(), "bag.records.item2"); + assert_eq!(descr.column(5).path().string(), "bag.records.item3"); + + assert_eq!(descr.get_column_root(0).name(), "a"); + 
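+        // Leaves 3-5 (bag.records.item1..item3) all descend from the nested `bag`
+        // field, so their root column is `bag` rather than the leaf itself.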
assert_eq!(descr.get_column_root(3).name(), "bag"); + assert_eq!(descr.get_column_root(4).name(), "bag"); + assert_eq!(descr.get_column_root(5).name(), "bag"); + + Ok(()) + } + + #[test] + fn test_schema_build_tree_def_rep_levels() { + let message_type = " + message spark_schema { + REQUIRED INT32 a; + OPTIONAL group b { + OPTIONAL INT32 _1; + OPTIONAL INT32 _2; + } + OPTIONAL group c (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + "; + let schema = parse_message_type(message_type).expect("should parse schema"); + let descr = SchemaDescriptor::new(Rc::new(schema)); + // required int32 a + assert_eq!(descr.column(0).max_def_level(), 0); + assert_eq!(descr.column(0).max_rep_level(), 0); + // optional int32 b._1 + assert_eq!(descr.column(1).max_def_level(), 2); + assert_eq!(descr.column(1).max_rep_level(), 0); + // optional int32 b._2 + assert_eq!(descr.column(2).max_def_level(), 2); + assert_eq!(descr.column(2).max_rep_level(), 0); + // repeated optional int32 c.list.element + assert_eq!(descr.column(3).max_def_level(), 3); + assert_eq!(descr.column(3).max_rep_level(), 1); + } + + #[test] + #[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")] + fn test_get_physical_type_panic() { + let list = Type::group_type_builder("records") + .with_repetition(Repetition::REPEATED) + .build() + .unwrap(); + list.get_physical_type(); + } + + #[test] + fn test_get_physical_type_primitive() { + let f = Type::primitive_type_builder("f", PhysicalType::INT64) + .build() + .unwrap(); + assert_eq!(f.get_physical_type(), PhysicalType::INT64); + + let f = Type::primitive_type_builder("f", PhysicalType::BYTE_ARRAY) + .build() + .unwrap(); + assert_eq!(f.get_physical_type(), PhysicalType::BYTE_ARRAY); + } + + #[test] + fn test_check_contains_primitive_primitive() { + // OK + let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT32) + .build() + .unwrap(); + assert!(f1.check_contains(&f2)); + + // OK: different logical type does not affect check_contains + let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) + .with_logical_type(LogicalType::UINT_8) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT32) + .with_logical_type(LogicalType::UINT_16) + .build() + .unwrap(); + assert!(f1.check_contains(&f2)); + + // KO: different name + let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + + // KO: different type + let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT64) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + + // KO: different repetition + let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + } + + // function to create a new group type for testing + fn test_new_group_type(name: &str, repetition: Repetition, types: Vec) -> Type { + let mut fields = Vec::new(); + for tpe in types { + fields.push(Rc::new(tpe)) + } + Type::group_type_builder(name) + .with_repetition(repetition) + .with_fields(&mut fields) + 
.build() + .unwrap() + } + + #[test] + fn test_check_contains_group_group() { + // OK: should match okay with empty fields + let f1 = Type::group_type_builder("f").build().unwrap(); + let f2 = Type::group_type_builder("f").build().unwrap(); + assert!(f1.check_contains(&f2)); + + // OK: fields match + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + assert!(f1.check_contains(&f2)); + + // OK: subset of fields + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap()], + ); + assert!(f1.check_contains(&f2)); + + // KO: different name + let f1 = Type::group_type_builder("f1").build().unwrap(); + let f2 = Type::group_type_builder("f2").build().unwrap(); + assert!(!f1.check_contains(&f2)); + + // KO: different repetition + let f1 = Type::group_type_builder("f") + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(); + let f2 = Type::group_type_builder("f") + .with_repetition(Repetition::REPEATED) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + + // KO: different fields + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::BOOLEAN) + .build() + .unwrap(), + ], + ); + assert!(!f1.check_contains(&f2)); + + // KO: different fields + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![ + Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::INT64) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![Type::primitive_type_builder("f3", PhysicalType::INT32) + .build() + .unwrap()], + ); + assert!(!f1.check_contains(&f2)); + } + + #[test] + fn test_check_contains_group_primitive() { + // KO: should not match + let f1 = Type::group_type_builder("f").build().unwrap(); + let f2 = Type::primitive_type_builder("f", PhysicalType::INT64) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + assert!(!f2.check_contains(&f1)); + + // KO: should not match when primitive field is part of group type + let f1 = test_new_group_type( + "f", + Repetition::REPEATED, + vec![Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap()], + ); + let f2 = Type::primitive_type_builder("f1", PhysicalType::INT32) + .build() + .unwrap(); + assert!(!f1.check_contains(&f2)); + assert!(!f2.check_contains(&f1)); + + // OK: match nested types + let f1 = test_new_group_type( + "a", + 
Repetition::REPEATED, + vec![ + test_new_group_type( + "b", + Repetition::REPEATED, + vec![Type::primitive_type_builder("c", PhysicalType::INT32) + .build() + .unwrap()], + ), + Type::primitive_type_builder("d", PhysicalType::INT64) + .build() + .unwrap(), + Type::primitive_type_builder("e", PhysicalType::BOOLEAN) + .build() + .unwrap(), + ], + ); + let f2 = test_new_group_type( + "a", + Repetition::REPEATED, + vec![test_new_group_type( + "b", + Repetition::REPEATED, + vec![Type::primitive_type_builder("c", PhysicalType::INT32) + .build() + .unwrap()], + )], + ); + assert!(f1.check_contains(&f2)); // should match + assert!(!f2.check_contains(&f1)); // should fail + } + + #[test] + fn test_schema_type_thrift_conversion_err() { + let schema = Type::primitive_type_builder("col", PhysicalType::INT32) + .build() + .unwrap(); + let thrift_schema = to_thrift(&schema); + assert!(thrift_schema.is_err()); + if let Err(e) = thrift_schema { + assert_eq!(e.description(), "Root schema must be Group type"); + } + } + + #[test] + fn test_schema_type_thrift_conversion() { + let message_type = " + message conversions { + REQUIRED INT64 id; + OPTIONAL group int_array_Array (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + } + OPTIONAL group int_map (MAP) { + REPEATED group map (MAP_KEY_VALUE) { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL INT32 value; + } + } + OPTIONAL group int_Map_Array (LIST) { + REPEATED group list { + OPTIONAL group g (MAP) { + REPEATED group map (MAP_KEY_VALUE) { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value { + OPTIONAL group H { + OPTIONAL group i (LIST) { + REPEATED group list { + OPTIONAL DOUBLE element; + } + } + } + } + } + } + } + } + OPTIONAL group nested_struct { + OPTIONAL INT32 A; + OPTIONAL group b (LIST) { + REPEATED group list { + REQUIRED FIXED_LEN_BYTE_ARRAY (16) element; + } + } + } + } + "; + let expected_schema = parse_message_type(message_type).unwrap(); + let thrift_schema = to_thrift(&expected_schema).unwrap(); + let result_schema = from_thrift(&thrift_schema).unwrap(); + assert_eq!(result_schema, Rc::new(expected_schema)); + } + + #[test] + fn test_schema_type_thrift_conversion_decimal() { + let message_type = " + message decimals { + OPTIONAL INT32 field0; + OPTIONAL INT64 field1 (DECIMAL (18, 2)); + OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18)); + OPTIONAL BYTE_ARRAY field3 (DECIMAL (9)); + } + "; + let expected_schema = parse_message_type(message_type).unwrap(); + let thrift_schema = to_thrift(&expected_schema).unwrap(); + let result_schema = from_thrift(&thrift_schema).unwrap(); + assert_eq!(result_schema, Rc::new(expected_schema)); + } + + // Tests schema conversion from thrift, when num_children is set to Some(0) for a + // primitive type. + #[test] + fn test_schema_from_thrift_with_num_children_set() { + // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT + let message_type = " + message schema { + OPTIONAL BYTE_ARRAY id (UTF8); + OPTIONAL BYTE_ARRAY name (UTF8); + OPTIONAL BYTE_ARRAY message (UTF8); + OPTIONAL INT32 type (UINT_8); + OPTIONAL INT64 author_time (TIMESTAMP_MILLIS); + OPTIONAL INT64 __index_level_0__; + } + "; + + let expected_schema = parse_message_type(message_type).unwrap(); + let mut thrift_schema = to_thrift(&expected_schema).unwrap(); + // Change all of None to Some(0) + for mut elem in &mut thrift_schema[..] 
{ + if elem.num_children == None { + elem.num_children = Some(0); + } + } + + let result_schema = from_thrift(&thrift_schema).unwrap(); + assert_eq!(result_schema, Rc::new(expected_schema)); + } + + // Sometimes parquet-cpp sets repetition level for the root node, which is against + // the format definition, but we need to handle it by setting it back to None. + #[test] + fn test_schema_from_thrift_root_has_repetition() { + // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT + let message_type = " + message schema { + OPTIONAL BYTE_ARRAY a (UTF8); + OPTIONAL INT32 b (UINT_8); + } + "; + + let expected_schema = parse_message_type(message_type).unwrap(); + let mut thrift_schema = to_thrift(&expected_schema).unwrap(); + thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into()); + + let result_schema = from_thrift(&thrift_schema).unwrap(); + assert_eq!(result_schema, Rc::new(expected_schema)); + } +} diff --git a/rust/src/parquet/util/bit_packing.rs b/rust/src/parquet/util/bit_packing.rs new file mode 100644 index 0000000000000..851fb36ea5c98 --- /dev/null +++ b/rust/src/parquet/util/bit_packing.rs @@ -0,0 +1,3658 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Unpack 32 values with bit width `num_bits` from `in_ptr`, and write to `out_ptr`. +/// Return the `in_ptr` where the starting offset points to the first byte after all the +/// bytes that were consumed. +// TODO: may be better to make these more compact using if-else conditions. +// However, this may require const generics: +// https://github.com/rust-lang/rust/issues/44580 +// to eliminate the branching cost. +// TODO: we should use SIMD instructions to further optimize this. I have explored +// https://github.com/tantivy-search/bitpacking +// but the layout it uses for SIMD is different from Parquet. +// TODO: support packing as well, which is used for encoding. 
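+// NOTE (illustrative sketch, not part of this patch): one way a caller might
+// drive `unpack32` over a packed buffer, decoding 32 values per call. The names
+// `packed`, `decoded`, `total_values` and `num_bits` are hypothetical; this
+// assumes `total_values` is a multiple of 32 and that `packed` holds at least
+// `num_bits` 32-bit words for every group of 32 values.
+//
+//     let mut in_ptr = packed.as_ptr() as *const u32;
+//     let mut decoded = vec![0u32; total_values];
+//     for chunk in decoded.chunks_mut(32) {
+//         // Each call writes 32 unpacked values into `chunk`, consumes
+//         // `num_bits` 32-bit words of input, and returns the advanced pointer.
+//         in_ptr = unsafe { unpack32(in_ptr, chunk.as_mut_ptr(), num_bits) };
+//     }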
+pub unsafe fn unpack32(mut in_ptr: *const u32, out_ptr: *mut u32, num_bits: usize) -> *const u32 { + in_ptr = match num_bits { + 0 => nullunpacker32(in_ptr, out_ptr), + 1 => unpack1_32(in_ptr, out_ptr), + 2 => unpack2_32(in_ptr, out_ptr), + 3 => unpack3_32(in_ptr, out_ptr), + 4 => unpack4_32(in_ptr, out_ptr), + 5 => unpack5_32(in_ptr, out_ptr), + 6 => unpack6_32(in_ptr, out_ptr), + 7 => unpack7_32(in_ptr, out_ptr), + 8 => unpack8_32(in_ptr, out_ptr), + 9 => unpack9_32(in_ptr, out_ptr), + 10 => unpack10_32(in_ptr, out_ptr), + 11 => unpack11_32(in_ptr, out_ptr), + 12 => unpack12_32(in_ptr, out_ptr), + 13 => unpack13_32(in_ptr, out_ptr), + 14 => unpack14_32(in_ptr, out_ptr), + 15 => unpack15_32(in_ptr, out_ptr), + 16 => unpack16_32(in_ptr, out_ptr), + 17 => unpack17_32(in_ptr, out_ptr), + 18 => unpack18_32(in_ptr, out_ptr), + 19 => unpack19_32(in_ptr, out_ptr), + 20 => unpack20_32(in_ptr, out_ptr), + 21 => unpack21_32(in_ptr, out_ptr), + 22 => unpack22_32(in_ptr, out_ptr), + 23 => unpack23_32(in_ptr, out_ptr), + 24 => unpack24_32(in_ptr, out_ptr), + 25 => unpack25_32(in_ptr, out_ptr), + 26 => unpack26_32(in_ptr, out_ptr), + 27 => unpack27_32(in_ptr, out_ptr), + 28 => unpack28_32(in_ptr, out_ptr), + 29 => unpack29_32(in_ptr, out_ptr), + 30 => unpack30_32(in_ptr, out_ptr), + 31 => unpack31_32(in_ptr, out_ptr), + 32 => unpack32_32(in_ptr, out_ptr), + _ => unimplemented!(), + }; + in_ptr +} + +unsafe fn nullunpacker32(in_buf: *const u32, mut out: *mut u32) -> *const u32 { + for _ in 0..32 { + *out = 0; + out = out.offset(1); + } + in_buf +} + +unsafe fn unpack1_32(in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 1) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 2) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 3) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 4) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 5) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 6) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 7) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 8) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 9) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 10) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 11) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 12) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 13) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 14) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 15) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 16) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 17) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 18) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 19) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 20) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 21) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 22) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 23) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 24) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 25) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 26) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 27) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 28) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 29) & 1; + out = out.offset(1); + *out = ((*in_buf) >> 30) & 1; + out = out.offset(1); + *out = (*in_buf) >> 31; + + in_buf.offset(1) +} + +unsafe fn unpack2_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 
2) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 26) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 2); + out = out.offset(1); + *out = (*in_buf) >> 30; + out = out.offset(1); + in_buf = in_buf.offset(1); + *out = ((*in_buf) >> 0) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 2) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 26) % (1u32 << 2); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 2); + out = out.offset(1); + *out = (*in_buf) >> 30; + + in_buf.offset(1) +} + +unsafe fn unpack3_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 3) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 9) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 21) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 27) % (1u32 << 3); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (3 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 7) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 25) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 3); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= 
((*in_buf) % (1u32 << 2)) << (3 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 5) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 23) % (1u32 << 3); + out = out.offset(1); + *out = ((*in_buf) >> 26) % (1u32 << 3); + out = out.offset(1); + *out = (*in_buf) >> 29; + + in_buf.offset(1) +} + +unsafe fn unpack4_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 4); + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 4); + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 4); + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 4) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 4); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 4); + + in_buf.offset(1) +} + +unsafe fn unpack5_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 5) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 25) % (1u32 << 5); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (5 - 3); + out = out.offset(1); + + *out = 
((*in_buf) >> 3) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 23) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 28) % (1u32 << 5); + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (5 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 21) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 26) % (1u32 << 5); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (5 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 9) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 5); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (5 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 7) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 5); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 5); + out = out.offset(1); + *out = (*in_buf) >> 27; + + in_buf.offset(1) +} + +unsafe fn unpack6_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (6 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (6 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 6) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (6 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 6); + out 
= out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (6 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 6); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 6); + out = out.offset(1); + *out = (*in_buf) >> 26; + + in_buf.offset(1) +} + +unsafe fn unpack7_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 7) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 21) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (7 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 24) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (7 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (7 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 9) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 23) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (7 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (7 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (7 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 7); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 7); + out = out.offset(1); + *out = (*in_buf) >> 25; + + in_buf.offset(1) +} + +unsafe fn unpack8_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 
<< 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 8) % (1u32 << 8); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 8); + out = out.offset(1); + *out = (*in_buf) >> 24; + + in_buf.offset(1) +} + +unsafe fn unpack9_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 9) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (9 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 22) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (9 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (9 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 21) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (9 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (9 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 9); + out = out.offset(1); + *out = 
((*in_buf) >> 20) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (9 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (9 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (9 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 9); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 9); + out = out.offset(1); + *out = (*in_buf) >> 23; + + in_buf.offset(1) +} + +unsafe fn unpack10_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (10 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (10 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (10 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (10 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 10) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (10 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (10 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (10 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 10); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (10 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 10); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 
10); + out = out.offset(1); + *out = (*in_buf) >> 22; + + in_buf.offset(1) +} + +unsafe fn unpack11_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 11) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (11 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (11 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (11 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (11 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (11 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (11 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (11 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (11 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 19) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (11 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 11); + out = out.offset(1); + *out = ((*in_buf) >> 20) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (11 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 11); + out = out.offset(1); + *out = (*in_buf) >> 21; + + in_buf.offset(1) +} + +unsafe fn unpack12_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (12 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (12 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 12); + out = out.offset(1); + 
*out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (12 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (12 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (12 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (12 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 12) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (12 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 12); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (12 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 12); + out = out.offset(1); + *out = (*in_buf) >> 20; + + in_buf.offset(1) +} + +unsafe fn unpack13_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 13) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (13 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (13 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (13 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (13 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (13 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (13 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 13); + out = out.offset(1); 
+ *out = ((*in_buf) >> 16) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (13 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (13 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 17) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (13 - 11); + out = out.offset(1); + + *out = ((*in_buf) >> 11) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (13 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 13); + out = out.offset(1); + *out = ((*in_buf) >> 18) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (13 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (13 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 13); + out = out.offset(1); + *out = (*in_buf) >> 19; + + in_buf.offset(1) +} + +unsafe fn unpack14_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 14); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (14 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (14 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (14 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 14); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (14 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (14 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (14 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 14); + out = out.offset(1); + *out = ((*in_buf) >> 14) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (14 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (14 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % 
(1u32 << 2)) << (14 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 14); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (14 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (14 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (14 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 14); + out = out.offset(1); + *out = (*in_buf) >> 18; + + in_buf.offset(1) +} + +unsafe fn unpack15_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 15); + out = out.offset(1); + *out = ((*in_buf) >> 15) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (15 - 13); + out = out.offset(1); + + *out = ((*in_buf) >> 13) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (15 - 11); + out = out.offset(1); + + *out = ((*in_buf) >> 11) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (15 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (15 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (15 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (15 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (15 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 15); + out = out.offset(1); + *out = ((*in_buf) >> 16) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (15 - 14); + out = out.offset(1); + + *out = ((*in_buf) >> 14) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (15 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (15 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (15 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (15 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (15 - 4); + out = 
out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (15 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 15); + out = out.offset(1); + *out = (*in_buf) >> 17; + + in_buf.offset(1) +} + +unsafe fn unpack16_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + out = out.offset(1); + in_buf = in_buf.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 16); + out = out.offset(1); + *out = (*in_buf) >> 16; + + in_buf.offset(1) +} + +unsafe fn unpack17_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (17 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (17 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (17 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (17 - 8); + out = 
out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (17 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (17 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (17 - 14); + out = out.offset(1); + + *out = ((*in_buf) >> 14) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (17 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (17 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (17 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (17 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (17 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (17 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (17 - 11); + out = out.offset(1); + + *out = ((*in_buf) >> 11) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (17 - 13); + out = out.offset(1); + + *out = ((*in_buf) >> 13) % (1u32 << 17); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (17 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + + in_buf.offset(1) +} + +unsafe fn unpack18_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (18 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (18 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (18 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (18 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (18 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (18 - 6); + out = out.offset(1); + + *out = 
((*in_buf) >> 6) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (18 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (18 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (18 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (18 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (18 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (18 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (18 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (18 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (18 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 18); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (18 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + + in_buf.offset(1) +} + +unsafe fn unpack19_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (19 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (19 - 12); + out = out.offset(1); + + *out = ((*in_buf) >> 12) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (19 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (19 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (19 - 11); + out = out.offset(1); + + *out = ((*in_buf) >> 11) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (19 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (19 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (19 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 
10) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (19 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (19 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (19 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (19 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (19 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (19 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (19 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (19 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (19 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 19); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (19 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + + in_buf.offset(1) +} + +unsafe fn unpack20_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (20 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (20 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (20 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (20 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (20 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (20 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (20 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (20 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf 
= in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (20 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (20 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (20 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (20 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (20 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (20 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (20 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 20); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (20 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + + in_buf.offset(1) +} + +unsafe fn unpack21_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (21 - 10); + out = out.offset(1); + + *out = ((*in_buf) >> 10) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (21 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (21 - 9); + out = out.offset(1); + + *out = ((*in_buf) >> 9) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (21 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (21 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (21 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (21 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (21 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (21 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (21 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (21 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (21 - 15); + out = 
out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (21 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (21 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (21 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (21 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (21 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (21 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (21 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 21); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (21 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + + in_buf.offset(1) +} + +unsafe fn unpack22_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (22 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (22 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (22 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (22 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (22 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (22 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (22 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (22 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (22 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (22 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (22 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (22 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 22); + out = out.offset(1); + *out 
= (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (22 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (22 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (22 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (22 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (22 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (22 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 22); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (22 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (22 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + + in_buf.offset(1) +} + +unsafe fn unpack23_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (23 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (23 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (23 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (23 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (23 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (23 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (23 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (23 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (23 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (23 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (23 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (23 - 7); + out = out.offset(1); + + *out = ((*in_buf) >> 7) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (23 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (23 - 12); + 
out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (23 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (23 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (23 - 8); + out = out.offset(1); + + *out = ((*in_buf) >> 8) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (23 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (23 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (23 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 23); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (23 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (23 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + + in_buf.offset(1) +} + +unsafe fn unpack24_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % 
(1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 24); + out = out.offset(1); + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (24 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (24 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + + in_buf.offset(1) +} + +unsafe fn unpack25_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (25 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (25 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (25 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (25 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (25 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (25 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (25 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (25 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (25 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (25 - 5); + out = out.offset(1); + + *out = ((*in_buf) >> 5) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 23)) << (25 - 23); + out = out.offset(1); + + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (25 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (25 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (25 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (25 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (25 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 
13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (25 - 6); + out = out.offset(1); + + *out = ((*in_buf) >> 6) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (25 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (25 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (25 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (25 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 25); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (25 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (25 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (25 - 7); + out = out.offset(1); + + *out = (*in_buf) >> 7; + + in_buf.offset(1) +} + +unsafe fn unpack26_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (26 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (26 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (26 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (26 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (26 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (26 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (26 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (26 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (26 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (26 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (26 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (26 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (26 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (26 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (26 - 8); + 
out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (26 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (26 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (26 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (26 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (26 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 26); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (26 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (26 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (26 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (26 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + + in_buf.offset(1) +} + +unsafe fn unpack27_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (27 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (27 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (27 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (27 - 7); + out = out.offset(1); + + *out = (*in_buf) >> 7; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (27 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (27 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (27 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (27 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (27 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (27 - 4); + out = out.offset(1); + + *out = ((*in_buf) >> 4) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (27 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (27 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (27 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (27 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % 
(1u32 << 6)) << (27 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (27 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 23)) << (27 - 23); + out = out.offset(1); + + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (27 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (27 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (27 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (27 - 3); + out = out.offset(1); + + *out = ((*in_buf) >> 3) % (1u32 << 27); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 25)) << (27 - 25); + out = out.offset(1); + + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (27 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (27 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (27 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (27 - 5); + out = out.offset(1); + + *out = (*in_buf) >> 5; + + in_buf.offset(1) +} + +unsafe fn unpack28_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 28); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (28 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (28 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (28 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (28 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (28 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (28 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 28); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (28 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (28 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (28 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (28 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (28 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (28 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = 
in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 28); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (28 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (28 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (28 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (28 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (28 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (28 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 28); + out = out.offset(1); + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (28 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (28 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (28 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (28 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (28 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (28 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + + in_buf.offset(1) +} + +unsafe fn unpack29_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 29); + out = out.offset(1); + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (29 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 23)) << (29 - 23); + out = out.offset(1); + + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (29 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (29 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (29 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (29 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (29 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (29 - 5); + out = out.offset(1); + + *out = (*in_buf) >> 5; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (29 - 2); + out = out.offset(1); + + *out = ((*in_buf) >> 2) % (1u32 << 29); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 28)) << (29 - 28); + out = out.offset(1); + + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 25)) << (29 - 25); + out = out.offset(1); + + *out = (*in_buf) >> 25; + in_buf = 
in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (29 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (29 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (29 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (29 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (29 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (29 - 7); + out = out.offset(1); + + *out = (*in_buf) >> 7; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (29 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (29 - 1); + out = out.offset(1); + + *out = ((*in_buf) >> 1) % (1u32 << 29); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 27)) << (29 - 27); + out = out.offset(1); + + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (29 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (29 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (29 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (29 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (29 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (29 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (29 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (29 - 3); + out = out.offset(1); + + *out = (*in_buf) >> 3; + + in_buf.offset(1) +} + +unsafe fn unpack30_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 30); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 28)) << (30 - 28); + out = out.offset(1); + + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (30 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (30 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (30 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (30 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (30 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (30 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (30 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= 
((*in_buf) % (1u32 << 12)) << (30 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (30 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (30 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (30 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (30 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (30 - 2); + out = out.offset(1); + + *out = (*in_buf) >> 2; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = ((*in_buf) >> 0) % (1u32 << 30); + out = out.offset(1); + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 28)) << (30 - 28); + out = out.offset(1); + + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (30 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (30 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (30 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (30 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (30 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (30 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (30 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (30 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (30 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (30 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (30 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (30 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (30 - 2); + out = out.offset(1); + + *out = (*in_buf) >> 2; + + in_buf.offset(1) +} + +unsafe fn unpack31_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = ((*in_buf) >> 0) % (1u32 << 31); + out = out.offset(1); + *out = (*in_buf) >> 31; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 30)) << (31 - 30); + out = out.offset(1); + + *out = (*in_buf) >> 30; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 29)) << (31 - 29); + out = out.offset(1); + + *out = (*in_buf) >> 29; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 28)) << (31 - 28); + out = out.offset(1); + + *out = (*in_buf) >> 28; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 27)) << (31 - 27); + out = out.offset(1); + + *out = (*in_buf) >> 27; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 26)) << (31 - 26); + out = out.offset(1); + + *out = (*in_buf) >> 
26; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 25)) << (31 - 25); + out = out.offset(1); + + *out = (*in_buf) >> 25; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 24)) << (31 - 24); + out = out.offset(1); + + *out = (*in_buf) >> 24; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 23)) << (31 - 23); + out = out.offset(1); + + *out = (*in_buf) >> 23; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 22)) << (31 - 22); + out = out.offset(1); + + *out = (*in_buf) >> 22; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 21)) << (31 - 21); + out = out.offset(1); + + *out = (*in_buf) >> 21; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 20)) << (31 - 20); + out = out.offset(1); + + *out = (*in_buf) >> 20; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 19)) << (31 - 19); + out = out.offset(1); + + *out = (*in_buf) >> 19; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 18)) << (31 - 18); + out = out.offset(1); + + *out = (*in_buf) >> 18; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 17)) << (31 - 17); + out = out.offset(1); + + *out = (*in_buf) >> 17; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 16)) << (31 - 16); + out = out.offset(1); + + *out = (*in_buf) >> 16; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 15)) << (31 - 15); + out = out.offset(1); + + *out = (*in_buf) >> 15; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 14)) << (31 - 14); + out = out.offset(1); + + *out = (*in_buf) >> 14; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 13)) << (31 - 13); + out = out.offset(1); + + *out = (*in_buf) >> 13; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 12)) << (31 - 12); + out = out.offset(1); + + *out = (*in_buf) >> 12; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 11)) << (31 - 11); + out = out.offset(1); + + *out = (*in_buf) >> 11; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 10)) << (31 - 10); + out = out.offset(1); + + *out = (*in_buf) >> 10; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 9)) << (31 - 9); + out = out.offset(1); + + *out = (*in_buf) >> 9; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 8)) << (31 - 8); + out = out.offset(1); + + *out = (*in_buf) >> 8; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 7)) << (31 - 7); + out = out.offset(1); + + *out = (*in_buf) >> 7; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 6)) << (31 - 6); + out = out.offset(1); + + *out = (*in_buf) >> 6; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 5)) << (31 - 5); + out = out.offset(1); + + *out = (*in_buf) >> 5; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 4)) << (31 - 4); + out = out.offset(1); + + *out = (*in_buf) >> 4; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 3)) << (31 - 3); + out = out.offset(1); + + *out = (*in_buf) >> 3; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 2)) << (31 - 2); + out = out.offset(1); + + *out = (*in_buf) >> 2; + in_buf = in_buf.offset(1); + *out |= ((*in_buf) % (1u32 << 1)) << (31 - 1); + out = out.offset(1); + + *out = (*in_buf) >> 1; + + in_buf.offset(1) +} + +unsafe fn unpack32_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = 
in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + in_buf = in_buf.offset(1); + out = out.offset(1); + + *out = (*in_buf) >> 0; + + in_buf.offset(1) +} diff --git a/rust/src/parquet/util/bit_util.rs b/rust/src/parquet/util/bit_util.rs new file mode 100644 index 0000000000000..9dbb9a32333d2 --- /dev/null +++ b/rust/src/parquet/util/bit_util.rs @@ -0,0 +1,1058 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
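Each of the unpackN_32 kernels in bit_packing.rs above decodes 32 N-bit values from N consecutive little-endian u32 words: `% (1u32 << N)` masks off the low N bits, a plain `>> k` picks up a value starting at bit k, and the `|=` lines stitch together values that straddle a 32-bit word boundary. As a rough orientation only, the hand-unrolled kernels are equivalent to the following generic loop (hypothetical `unpack_generic`, written in safe Rust and not part of this patch; it assumes `input` holds at least `num_bits` words):

    fn unpack_generic(input: &[u32], num_bits: usize, out: &mut [u32; 32]) {
        let mask = if num_bits == 32 { u32::MAX } else { (1u32 << num_bits) - 1 };
        let mut word = 0; // index of the u32 currently being consumed
        let mut bit = 0;  // bit position inside input[word]
        for v in out.iter_mut() {
            let mut value = input[word] >> bit;
            if bit + num_bits > 32 {
                // The value straddles a word boundary; splice in the low bits
                // of the next word, exactly like the `|=` lines above.
                value |= input[word + 1] << (32 - bit);
            }
            *v = value & mask;
            bit += num_bits;
            word += bit / 32;
            bit %= 32;
        }
    }

The unrolled versions exist purely to avoid this per-value bookkeeping in the hot path.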
+
+use std::{
+    cmp,
+    mem::{size_of, transmute_copy},
+};
+
+use crate::parquet::errors::{ParquetError, Result};
+use crate::parquet::util::{bit_packing::unpack32, memory::ByteBufferPtr};
+
+/// Reads `$size` of bytes from `$src`, and reinterprets them as type `$ty`, in
+/// little-endian order. `$ty` must implement the `Default` trait. Otherwise this won't
+/// compile.
+/// This is copied and modified from byteorder crate.
+macro_rules! read_num_bytes {
+    ($ty:ty, $size:expr, $src:expr) => {{
+        assert!($size <= $src.len());
+        let mut data: $ty = Default::default();
+        unsafe {
+            ::std::ptr::copy_nonoverlapping($src.as_ptr(), &mut data as *mut $ty as *mut u8, $size);
+        }
+        data
+    }};
+}
+
+/// Converts value `val` of type `T` to a byte vector, by reading `num_bytes` from `val`.
+/// NOTE: if `val` is less than the size of `T` then it can be truncated.
+#[inline]
+pub fn convert_to_bytes<T>(val: &T, num_bytes: usize) -> Vec<u8> {
+    let mut bytes: Vec<u8> = vec![0; num_bytes];
+    memcpy_value(val, num_bytes, &mut bytes);
+    bytes
+}
+
+#[inline]
+pub fn memcpy(source: &[u8], target: &mut [u8]) {
+    assert!(target.len() >= source.len());
+    unsafe { ::std::ptr::copy_nonoverlapping(source.as_ptr(), target.as_mut_ptr(), source.len()) }
+}
+
+#[inline]
+pub fn memcpy_value<T>(source: &T, num_bytes: usize, target: &mut [u8]) {
+    assert!(
+        target.len() >= num_bytes,
+        "Not enough space. Only had {} bytes but need to put {} bytes",
+        target.len(),
+        num_bytes
+    );
+    unsafe {
+        ::std::ptr::copy_nonoverlapping(
+            source as *const T as *const u8,
+            target.as_mut_ptr(),
+            num_bytes,
+        )
+    }
+}
+
+/// Returns the ceil of value/divisor
+#[inline]
+pub fn ceil(value: i64, divisor: i64) -> i64 {
+    let mut result = value / divisor;
+    if value % divisor != 0 {
+        result += 1
+    };
+    result
+}
+
+/// Returns ceil(log2(x))
+#[inline]
+pub fn log2(mut x: u64) -> i32 {
+    if x == 1 {
+        return 0;
+    }
+    x -= 1;
+    let mut result = 0;
+    while x > 0 {
+        x >>= 1;
+        result += 1;
+    }
+    result
+}
+
+/// Returns the `num_bits` least-significant bits of `v`
+#[inline]
+pub fn trailing_bits(v: u64, num_bits: usize) -> u64 {
+    if num_bits == 0 {
+        return 0;
+    }
+    if num_bits >= 64 {
+        return v;
+    }
+    let n = 64 - num_bits;
+    (v << n) >> n
+}
+
+#[inline]
+pub fn set_array_bit(bits: &mut [u8], i: usize) {
+    bits[i / 8] |= 1 << (i % 8);
+}
+
+#[inline]
+pub fn unset_array_bit(bits: &mut [u8], i: usize) {
+    bits[i / 8] &= !(1 << (i % 8));
+}
+
+/// Returns the minimum number of bits needed to represent the value 'x'
+#[inline]
+pub fn num_required_bits(x: u64) -> usize {
+    for i in (0..64).rev() {
+        if x & (1u64 << i) != 0 {
+            return i + 1;
+        }
+    }
+    0
+}
+
+/// Utility class for writing bit/byte streams. This class can write data in either
+/// bit packed or byte aligned fashion.
+pub struct BitWriter {
+    buffer: Vec<u8>,
+    max_bytes: usize,
+    buffered_values: u64,
+    byte_offset: usize,
+    bit_offset: usize,
+    start: usize,
+}
+
+impl BitWriter {
+    pub fn new(max_bytes: usize) -> Self {
+        Self {
+            buffer: vec![0; max_bytes],
+            max_bytes,
+            buffered_values: 0,
+            byte_offset: 0,
+            bit_offset: 0,
+            start: 0,
+        }
+    }
+
+    /// Initializes the writer from the existing buffer `buffer` and starting
+    /// offset `start`.
+    pub fn new_from_buf(buffer: Vec<u8>, start: usize) -> Self {
+        assert!(start < buffer.len());
+        let len = buffer.len();
+        Self {
+            buffer,
+            max_bytes: len,
+            buffered_values: 0,
+            byte_offset: start,
+            bit_offset: 0,
+            start,
+        }
+    }
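The free functions earlier in this file (`ceil`, `log2`, `trailing_bits`, `num_required_bits`) carry most of the bit arithmetic used by `BitWriter` and `BitReader`. A few illustrative checks of what they compute (not part of the patch):

    assert_eq!(ceil(10, 8), 2);                        // 10 bits need 2 whole bytes
    assert_eq!(log2(8), 3);                            // ceil(log2(8)) == 3
    assert_eq!(log2(9), 4);                            // ceil(log2(9)) == 4
    assert_eq!(trailing_bits(0b1011_0110, 4), 0b0110); // keep the 4 low bits
    assert_eq!(num_required_bits(0), 0);
    assert_eq!(num_required_bits(8), 4);               // 8 == 0b1000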
+
+    /// Consumes and returns the current buffer.
+    #[inline]
+    pub fn consume(mut self) -> Vec<u8> {
+        self.flush();
+        self.buffer.truncate(self.byte_offset);
+        self.buffer
+    }
+
+    /// Flushes the internal buffered bits and returns the buffer's content.
+    /// This is a borrow equivalent of the `consume` method.
+    #[inline]
+    pub fn flush_buffer(&mut self) -> &[u8] {
+        self.flush();
+        &self.buffer()[0..self.byte_offset]
+    }
+
+    /// Clears the internal state so the buffer can be reused.
+    #[inline]
+    pub fn clear(&mut self) {
+        self.buffered_values = 0;
+        self.byte_offset = self.start;
+        self.bit_offset = 0;
+    }
+
+    /// Flushes the internal buffered bits and aligns the buffer to the next byte.
+    #[inline]
+    pub fn flush(&mut self) {
+        let num_bytes = ceil(self.bit_offset as i64, 8) as usize;
+        assert!(self.byte_offset + num_bytes <= self.max_bytes);
+        memcpy_value(
+            &self.buffered_values,
+            num_bytes,
+            &mut self.buffer[self.byte_offset..],
+        );
+        self.buffered_values = 0;
+        self.bit_offset = 0;
+        self.byte_offset += num_bytes;
+    }
+
+    /// Advances the current offset by skipping `num_bytes`, flushing the internal bit
+    /// buffer first.
+    /// This is useful when you want to jump over `num_bytes` bytes and come back later
+    /// to fill these bytes.
+    ///
+    /// Returns error if `num_bytes` is beyond the boundary of the internal buffer.
+    /// Otherwise, returns the old offset.
+    #[inline]
+    pub fn skip(&mut self, num_bytes: usize) -> Result<usize> {
+        self.flush();
+        assert!(self.byte_offset <= self.max_bytes);
+        if self.byte_offset + num_bytes > self.max_bytes {
+            return Err(general_err!(
+                "Not enough bytes left in BitWriter. Need {} but only have {}",
+                self.byte_offset + num_bytes,
+                self.max_bytes
+            ));
+        }
+        let result = self.byte_offset;
+        self.byte_offset += num_bytes;
+        Ok(result)
+    }
+
+    /// Returns a slice containing the next `num_bytes` bytes starting from the current
+    /// offset, and advances the underlying buffer by `num_bytes`.
+    /// This is useful when you want to jump over `num_bytes` bytes and come back later
+    /// to fill these bytes.
+    #[inline]
+    pub fn get_next_byte_ptr(&mut self, num_bytes: usize) -> Result<&mut [u8]> {
+        let offset = self.skip(num_bytes)?;
+        Ok(&mut self.buffer[offset..offset + num_bytes])
+    }
+
+    #[inline]
+    pub fn bytes_written(&self) -> usize {
+        self.byte_offset - self.start + ceil(self.bit_offset as i64, 8) as usize
+    }
+
+    #[inline]
+    pub fn buffer(&self) -> &[u8] {
+        &self.buffer[self.start..]
+    }
+
+    #[inline]
+    pub fn byte_offset(&self) -> usize {
+        self.byte_offset
+    }
+
+    /// Returns the internal buffer length. This is the maximum number of bytes that this
+    /// writer can write. User needs to call `consume` to consume the current buffer before
+    /// more data can be written.
+    #[inline]
+    pub fn buffer_len(&self) -> usize {
+        self.max_bytes
+    }
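A common use of `skip` together with `put_aligned_offset` (defined just below) is to reserve room for a length prefix and backfill it once the payload size is known. A hedged sketch, with made-up sizes and values:

    let mut writer = BitWriter::new(64);
    let len_pos = writer.skip(4).unwrap();              // reserve 4 bytes for a length prefix
    let before = writer.bytes_written();
    writer.put_vlq_int(12345);                          // write some payload
    let payload_len = (writer.bytes_written() - before) as u32;
    writer.put_aligned_offset(payload_len, 4, len_pos); // backfill the reserved bytes
    let encoded: Vec<u8> = writer.consume();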
+
+    /// Writes the `num_bits` LSB of value `v` to the internal buffer of this writer.
+    /// The `num_bits` must not be greater than 64. This is bit packed.
+    ///
+    /// Returns false if there's not enough room left. True otherwise.
+    #[inline]
+    pub fn put_value(&mut self, v: u64, num_bits: usize) -> bool {
+        assert!(num_bits <= 64);
+        assert_eq!(v.checked_shr(num_bits as u32).unwrap_or(0), 0); // covers case v >> 64
+
+        if self.byte_offset * 8 + self.bit_offset + num_bits > self.max_bytes as usize * 8 {
+            return false;
+        }
+
+        self.buffered_values |= v << self.bit_offset;
+        self.bit_offset += num_bits;
+        if self.bit_offset >= 64 {
+            memcpy_value(
+                &self.buffered_values,
+                8,
+                &mut self.buffer[self.byte_offset..],
+            );
+            self.byte_offset += 8;
+            self.bit_offset -= 64;
+            self.buffered_values = 0;
+            // Perform checked right shift: v >> offset, where offset < 64, otherwise we shift
+            // all bits
+            self.buffered_values = v
+                .checked_shr((num_bits - self.bit_offset) as u32)
+                .unwrap_or(0);
+        }
+        assert!(self.bit_offset < 64);
+        true
+    }
+
+    /// Writes `val` of `num_bytes` bytes to the next aligned byte. If size of `T` is
+    /// larger than `num_bytes`, extra higher ordered bytes will be ignored.
+    ///
+    /// Returns false if there's not enough room left. True otherwise.
+    #[inline]
+    pub fn put_aligned<T>(&mut self, val: T, num_bytes: usize) -> bool {
+        let result = self.get_next_byte_ptr(num_bytes);
+        if result.is_err() {
+            // TODO: should we return `Result` for this func?
+            return false;
+        }
+        let mut ptr = result.unwrap();
+        memcpy_value(&val, num_bytes, &mut ptr);
+        true
+    }
+
+    /// Writes `val` of `num_bytes` bytes at the designated `offset`. The `offset` is the
+    /// offset starting from the beginning of the internal buffer that this writer
+    /// maintains. Note that this will overwrite any existing data between `offset` and
+    /// `offset + num_bytes`. Also note that if the size of `T` is larger than `num_bytes`,
+    /// extra higher ordered bytes will be ignored.
+    ///
+    /// Returns false if there's not enough room left, or the `pos` is not valid.
+    /// True otherwise.
+    #[inline]
+    pub fn put_aligned_offset<T>(&mut self, val: T, num_bytes: usize, offset: usize) -> bool {
+        if num_bytes + offset > self.max_bytes {
+            return false;
+        }
+        memcpy_value(
+            &val,
+            num_bytes,
+            &mut self.buffer[offset..offset + num_bytes],
+        );
+        true
+    }
+
+    /// Writes a VLQ encoded integer `v` to this buffer. The value is byte aligned.
+    ///
+    /// Returns false if there's not enough room left. True otherwise.
+    #[inline]
+    pub fn put_vlq_int(&mut self, mut v: u64) -> bool {
+        let mut result = true;
+        while v & 0xFFFFFFFFFFFFFF80 != 0 {
+            result &= self.put_aligned::<u8>(((v & 0x7F) | 0x80) as u8, 1);
+            v >>= 7;
+        }
+        result &= self.put_aligned::<u8>((v & 0x7F) as u8, 1);
+        result
+    }
+
+    /// Writes a zigzag-VLQ encoded (in little endian order) int `v` to this buffer.
+    /// Zigzag-VLQ is a variant of VLQ encoding where negative and positive
+    /// numbers are encoded in a zigzag fashion.
+    /// See: https://developers.google.com/protocol-buffers/docs/encoding
+    ///
+    /// Returns false if there's not enough room left. True otherwise.
+    #[inline]
+    pub fn put_zigzag_vlq_int(&mut self, v: i64) -> bool {
+        let u: u64 = ((v << 1) ^ (v >> 63)) as u64;
+        self.put_vlq_int(u)
+    }
+}
+
+/// Maximum byte length for a VLQ encoded integer
+/// MAX_VLQ_BYTE_LEN = 5 for i32, and MAX_VLQ_BYTE_LEN = 10 for i64
+pub const MAX_VLQ_BYTE_LEN: usize = 10;
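For orientation, the zigzag mapping `(v << 1) ^ (v >> 63)` in `put_zigzag_vlq_int` sends 0, -1, 1, -2, 2 to 0, 1, 2, 3, 4, so small negative numbers stay small before VLQ encoding. And a short, illustrative bit-packing round with `put_value` (not part of the patch):

    let mut writer = BitWriter::new(8);
    for v in &[1u64, 5, 7, 2] {
        writer.put_value(*v, 3);   // pack four 3-bit values, 12 bits total
    }
    let bytes = writer.consume();  // the 12 bits are flushed into 2 bytes
    assert_eq!(bytes.len(), 2);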
+ // This is faster than reading values byte by byte directly from `buffer` + buffered_values: u64, + + // + // End Start + // |............|B|B|B|B|B|B|B|B|..............| + // ^ ^ + // bit_offset byte_offset + // + // Current byte offset in `buffer` + byte_offset: usize, + + // Current bit offset in `buffered_values` + bit_offset: usize, + + // Total number of bytes in `buffer` + total_bytes: usize, +} + +/// Utility class to read bit/byte stream. This class can read bits or bytes that are +/// either byte aligned or not. +impl BitReader { + pub fn new(buffer: ByteBufferPtr) -> Self { + let total_bytes = buffer.len(); + let num_bytes = cmp::min(8, total_bytes); + let buffered_values = read_num_bytes!(u64, num_bytes, buffer.as_ref()); + BitReader { + buffer, + buffered_values, + byte_offset: 0, + bit_offset: 0, + total_bytes, + } + } + + #[inline] + pub fn reset(&mut self, buffer: ByteBufferPtr) { + self.buffer = buffer; + self.total_bytes = self.buffer.len(); + let num_bytes = cmp::min(8, self.total_bytes); + self.buffered_values = read_num_bytes!(u64, num_bytes, self.buffer.as_ref()); + self.byte_offset = 0; + self.bit_offset = 0; + } + + /// Gets the current byte offset + #[inline] + pub fn get_byte_offset(&self) -> usize { + self.byte_offset + ceil(self.bit_offset as i64, 8) as usize + } + + /// Reads a value of type `T` and of size `num_bits`. + /// + /// Returns `None` if there's not enough data available. `Some` otherwise. + #[inline] + pub fn get_value(&mut self, num_bits: usize) -> Option { + assert!(num_bits <= 64); + assert!(num_bits <= size_of::() * 8); + + if self.byte_offset * 8 + self.bit_offset + num_bits > self.total_bytes * 8 { + return None; + } + + let mut v = + trailing_bits(self.buffered_values, self.bit_offset + num_bits) >> self.bit_offset; + self.bit_offset += num_bits; + + if self.bit_offset >= 64 { + self.byte_offset += 8; + self.bit_offset -= 64; + + self.reload_buffer_values(); + v |= trailing_bits(self.buffered_values, self.bit_offset) + .wrapping_shl((num_bits - self.bit_offset) as u32); + } + + // TODO: better to avoid copying here + let result: T = unsafe { transmute_copy::(&v) }; + Some(result) + } + + #[inline] + pub fn get_batch(&mut self, batch: &mut [T], num_bits: usize) -> usize { + assert!(num_bits <= 32); + assert!(num_bits <= size_of::() * 8); + + let mut values_to_read = batch.len(); + let needed_bits = num_bits * values_to_read; + let remaining_bits = (self.total_bytes - self.byte_offset) * 8 - self.bit_offset; + if remaining_bits < needed_bits { + values_to_read = remaining_bits / num_bits; + } + + let mut i = 0; + + // First align bit offset to byte offset + if self.bit_offset != 0 { + while i < values_to_read && self.bit_offset != 0 { + batch[i] = self + .get_value(num_bits) + .expect("expected to have more data"); + i += 1; + } + } + + unsafe { + let in_buf = &self.buffer.data()[self.byte_offset..]; + let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32; + if size_of::() == 4 { + while values_to_read - i >= 32 { + let out_ptr = &mut batch[i..] as *mut [T] as *mut T as *mut u32; + in_ptr = unpack32(in_ptr, out_ptr, num_bits); + self.byte_offset += 4 * num_bits; + i += 32; + } + } else { + let mut out_buf = [0u32; 32]; + let out_ptr = &mut out_buf as &mut [u32] as *mut [u32] as *mut u32; + while values_to_read - i >= 32 { + in_ptr = unpack32(in_ptr, out_ptr, num_bits); + self.byte_offset += 4 * num_bits; + for n in 0..32 { + // We need to copy from smaller size to bigger size to avoid overwritting + // other memory regions. 
+ if size_of::() > size_of::() { + ::std::ptr::copy_nonoverlapping( + out_buf[n..].as_ptr() as *const u32, + &mut batch[i] as *mut T as *mut u32, + 1, + ); + } else { + ::std::ptr::copy_nonoverlapping( + out_buf[n..].as_ptr() as *const T, + &mut batch[i] as *mut T, + 1, + ); + } + i += 1; + } + } + } + } + + assert!(values_to_read - i < 32); + + self.reload_buffer_values(); + while i < values_to_read { + batch[i] = self + .get_value(num_bits) + .expect("expected to have more data"); + i += 1; + } + + values_to_read + } + + /// Reads a `num_bytes`-sized value from this buffer and return it. + /// `T` needs to be a little-endian native type. The value is assumed to be byte + /// aligned so the bit reader will be advanced to the start of the next byte before + /// reading the value. + + /// Returns `Some` if there's enough bytes left to form a value of `T`. + /// Otherwise `None`. + #[inline] + pub fn get_aligned(&mut self, num_bytes: usize) -> Option { + let bytes_read = ceil(self.bit_offset as i64, 8) as usize; + if self.byte_offset + bytes_read + num_bytes > self.total_bytes { + return None; + } + + // Advance byte_offset to next unread byte and read num_bytes + self.byte_offset += bytes_read; + let v = read_num_bytes!( + T, + num_bytes, + self.buffer.start_from(self.byte_offset).as_ref() + ); + self.byte_offset += num_bytes; + + // Reset buffered_values + self.bit_offset = 0; + self.reload_buffer_values(); + Some(v) + } + + /// Reads a VLQ encoded (in little endian order) int from the stream. + /// The encoded int must start at the beginning of a byte. + /// + /// Returns `None` if there's not enough bytes in the stream. `Some` otherwise. + #[inline] + pub fn get_vlq_int(&mut self) -> Option { + let mut shift = 0; + let mut v: i64 = 0; + while let Some(byte) = self.get_aligned::(1) { + v |= ((byte & 0x7F) as i64) << shift; + shift += 7; + assert!( + shift <= MAX_VLQ_BYTE_LEN * 7, + "Num of bytes exceed MAX_VLQ_BYTE_LEN ({})", + MAX_VLQ_BYTE_LEN + ); + if byte & 0x80 == 0 { + return Some(v); + } + } + None + } + + /// Reads a zigzag-VLQ encoded (in little endian order) int from the stream + /// Zigzag-VLQ is a variant of VLQ encoding where negative and positive numbers are + /// encoded in a zigzag fashion. + /// See: https://developers.google.com/protocol-buffers/docs/encoding + /// + /// Note: the encoded int must start at the beginning of a byte. + /// + /// Returns `None` if the number of bytes there's not enough bytes in the stream. + /// `Some` otherwise. 
+ #[inline] + pub fn get_zigzag_vlq_int(&mut self) -> Option { + self.get_vlq_int().map(|v| { + let u = v as u64; + ((u >> 1) as i64 ^ -((u & 1) as i64)) + }) + } + + #[inline] + fn reload_buffer_values(&mut self) { + let bytes_to_read = cmp::min(self.total_bytes - self.byte_offset, 8); + self.buffered_values = read_num_bytes!( + u64, + bytes_to_read, + self.buffer.start_from(self.byte_offset).as_ref() + ); + } +} + +impl From> for BitReader { + #[inline] + fn from(buffer: Vec) -> Self { + BitReader::new(ByteBufferPtr::new(buffer)) + } +} + +#[cfg(test)] +mod tests { + use super::super::test_common::*; + use super::*; + + use rand::distributions::{Distribution, Standard}; + use std::fmt::Debug; + + #[test] + fn test_ceil() { + assert_eq!(ceil(0, 1), 0); + assert_eq!(ceil(1, 1), 1); + assert_eq!(ceil(1, 2), 1); + assert_eq!(ceil(1, 8), 1); + assert_eq!(ceil(7, 8), 1); + assert_eq!(ceil(8, 8), 1); + assert_eq!(ceil(9, 8), 2); + assert_eq!(ceil(9, 9), 1); + assert_eq!(ceil(10000000000, 10), 1000000000); + assert_eq!(ceil(10, 10000000000), 1); + assert_eq!(ceil(10000000000, 1000000000), 10); + } + + #[test] + fn test_bit_reader_get_byte_offset() { + let buffer = vec![255; 10]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_byte_offset(), 0); // offset (0 bytes, 0 bits) + bit_reader.get_value::(6); + assert_eq!(bit_reader.get_byte_offset(), 1); // offset (0 bytes, 6 bits) + bit_reader.get_value::(10); + assert_eq!(bit_reader.get_byte_offset(), 2); // offset (0 bytes, 16 bits) + bit_reader.get_value::(20); + assert_eq!(bit_reader.get_byte_offset(), 5); // offset (0 bytes, 36 bits) + bit_reader.get_value::(30); + assert_eq!(bit_reader.get_byte_offset(), 9); // offset (8 bytes, 2 bits) + } + + #[test] + fn test_bit_reader_get_value() { + let buffer = vec![255, 0]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_value::(1), Some(1)); + assert_eq!(bit_reader.get_value::(2), Some(3)); + assert_eq!(bit_reader.get_value::(3), Some(7)); + assert_eq!(bit_reader.get_value::(4), Some(3)); + } + + #[test] + fn test_bit_reader_get_value_boundary() { + let buffer = vec![10, 0, 0, 0, 20, 0, 30, 0, 0, 0, 40, 0]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_value::(32), Some(10)); + assert_eq!(bit_reader.get_value::(16), Some(20)); + assert_eq!(bit_reader.get_value::(32), Some(30)); + assert_eq!(bit_reader.get_value::(16), Some(40)); + } + + #[test] + fn test_bit_reader_get_aligned() { + // 01110101 11001011 + let buffer = ByteBufferPtr::new(vec![0x75, 0xCB]); + let mut bit_reader = BitReader::new(buffer.all()); + assert_eq!(bit_reader.get_value::(3), Some(5)); + assert_eq!(bit_reader.get_aligned::(1), Some(203)); + assert_eq!(bit_reader.get_value::(1), None); + bit_reader.reset(buffer.all()); + assert_eq!(bit_reader.get_aligned::(3), None); + } + + #[test] + fn test_bit_reader_get_vlq_int() { + // 10001001 00000001 11110010 10110101 00000110 + let buffer: Vec = vec![0x89, 0x01, 0xF2, 0xB5, 0x06]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_vlq_int(), Some(137)); + assert_eq!(bit_reader.get_vlq_int(), Some(105202)); + } + + #[test] + fn test_bit_reader_get_zigzag_vlq_int() { + let buffer: Vec = vec![0, 1, 2, 3]; + let mut bit_reader = BitReader::from(buffer); + assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(0)); + assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(-1)); + assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(1)); + assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(-2)); + } + + 
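(Editorial aside, not part of the patch: the two tests above pin down the byte-level encodings used by `put_vlq_int`/`get_vlq_int` and their zigzag variants. Below is a minimal standalone sketch of those mappings, with illustrative function names that are not part of the crate's API.)

```rust
/// Base-128 VLQ: 7 payload bits per byte, least-significant group first,
/// continuation bit (0x80) set on every byte except the last.
fn encode_vlq(mut v: u64) -> Vec<u8> {
    let mut out = Vec::new();
    while v & !0x7F != 0 {
        out.push(((v & 0x7F) | 0x80) as u8);
        v >>= 7;
    }
    out.push((v & 0x7F) as u8);
    out
}

/// Zigzag: maps 0, -1, 1, -2, ... to 0, 1, 2, 3, ... so that small magnitudes
/// of either sign encode into few VLQ bytes.
fn zigzag(v: i64) -> u64 {
    ((v << 1) ^ (v >> 63)) as u64
}

fn unzigzag(u: u64) -> i64 {
    ((u >> 1) as i64) ^ -((u & 1) as i64)
}

fn main() {
    // Same values as test_bit_reader_get_vlq_int above.
    assert_eq!(encode_vlq(137), vec![0x89, 0x01]);
    assert_eq!(encode_vlq(105202), vec![0xF2, 0xB5, 0x06]);

    // Same mapping as test_bit_reader_get_zigzag_vlq_int above.
    assert_eq!(zigzag(-2), 3);
    assert_eq!(unzigzag(0), 0);
    assert_eq!(unzigzag(1), -1);
    assert_eq!(unzigzag(2), 1);
    assert_eq!(unzigzag(3), -2);
}
```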
#[test] + fn test_set_array_bit() { + let mut buffer = vec![0, 0, 0]; + set_array_bit(&mut buffer[..], 1); + assert_eq!(buffer, vec![2, 0, 0]); + set_array_bit(&mut buffer[..], 4); + assert_eq!(buffer, vec![18, 0, 0]); + unset_array_bit(&mut buffer[..], 1); + assert_eq!(buffer, vec![16, 0, 0]); + set_array_bit(&mut buffer[..], 10); + assert_eq!(buffer, vec![16, 4, 0]); + set_array_bit(&mut buffer[..], 10); + assert_eq!(buffer, vec![16, 4, 0]); + set_array_bit(&mut buffer[..], 11); + assert_eq!(buffer, vec![16, 12, 0]); + unset_array_bit(&mut buffer[..], 10); + assert_eq!(buffer, vec![16, 8, 0]); + } + + #[test] + fn test_num_required_bits() { + assert_eq!(num_required_bits(0), 0); + assert_eq!(num_required_bits(1), 1); + assert_eq!(num_required_bits(2), 2); + assert_eq!(num_required_bits(4), 3); + assert_eq!(num_required_bits(8), 4); + assert_eq!(num_required_bits(10), 4); + assert_eq!(num_required_bits(12), 4); + assert_eq!(num_required_bits(16), 5); + } + + #[test] + fn test_log2() { + assert_eq!(log2(1), 0); + assert_eq!(log2(2), 1); + assert_eq!(log2(3), 2); + assert_eq!(log2(4), 2); + assert_eq!(log2(5), 3); + assert_eq!(log2(5), 3); + assert_eq!(log2(6), 3); + assert_eq!(log2(7), 3); + assert_eq!(log2(8), 3); + assert_eq!(log2(9), 4); + } + + #[test] + fn test_skip() { + let mut writer = BitWriter::new(5); + let old_offset = writer.skip(1).expect("skip() should return OK"); + writer.put_aligned(42, 4); + writer.put_aligned_offset(0x10, 1, old_offset); + let result = writer.consume(); + assert_eq!(result.as_ref(), [0x10, 42, 0, 0, 0]); + + writer = BitWriter::new(4); + let result = writer.skip(5); + assert!(result.is_err()); + } + + #[test] + fn test_get_next_byte_ptr() { + let mut writer = BitWriter::new(5); + { + let first_byte = writer + .get_next_byte_ptr(1) + .expect("get_next_byte_ptr() should return OK"); + first_byte[0] = 0x10; + } + writer.put_aligned(42, 4); + let result = writer.consume(); + assert_eq!(result.as_ref(), [0x10, 42, 0, 0, 0]); + } + + #[test] + fn test_consume_flush_buffer() { + let mut writer1 = BitWriter::new(3); + let mut writer2 = BitWriter::new(3); + for i in 1..10 { + writer1.put_value(i, 4); + writer2.put_value(i, 4); + } + let res1 = writer1.flush_buffer(); + let res2 = writer2.consume(); + assert_eq!(res1, &res2[..]); + } + + #[test] + fn test_put_get_bool() { + let len = 8; + let mut writer = BitWriter::new(len); + + for i in 0..8 { + let result = writer.put_value(i % 2, 1); + assert!(result); + } + + writer.flush(); + { + let buffer = writer.buffer(); + assert_eq!(buffer[0], 0b10101010); + } + + // Write 00110011 + for i in 0..8 { + let result = match i { + 0 | 1 | 4 | 5 => writer.put_value(false as u64, 1), + _ => writer.put_value(true as u64, 1), + }; + assert!(result); + } + writer.flush(); + { + let buffer = writer.buffer(); + assert_eq!(buffer[0], 0b10101010); + assert_eq!(buffer[1], 0b11001100); + } + + let mut reader = BitReader::from(writer.consume()); + + for i in 0..8 { + let val = reader + .get_value::(1) + .expect("get_value() should return OK"); + assert_eq!(val, i % 2); + } + + for i in 0..8 { + let val = reader + .get_value::(1) + .expect("get_value() should return OK"); + match i { + 0 | 1 | 4 | 5 => assert_eq!(val, false), + _ => assert_eq!(val, true), + } + } + } + + #[test] + fn test_put_value_roundtrip() { + test_put_value_rand_numbers(32, 2); + test_put_value_rand_numbers(32, 3); + test_put_value_rand_numbers(32, 4); + test_put_value_rand_numbers(32, 5); + test_put_value_rand_numbers(32, 6); + test_put_value_rand_numbers(32, 7); 
+ test_put_value_rand_numbers(32, 8); + test_put_value_rand_numbers(64, 16); + test_put_value_rand_numbers(64, 24); + test_put_value_rand_numbers(64, 32); + } + + fn test_put_value_rand_numbers(total: usize, num_bits: usize) { + assert!(num_bits < 64); + let num_bytes = ceil(num_bits as i64, 8); + let mut writer = BitWriter::new(num_bytes as usize * total); + let values: Vec = random_numbers::(total) + .iter() + .map(|v| v & ((1 << num_bits) - 1)) + .collect(); + for i in 0..total { + assert!( + writer.put_value(values[i] as u64, num_bits), + "[{}]: put_value() failed", + i + ); + } + + let mut reader = BitReader::from(writer.consume()); + for i in 0..total { + let v = reader + .get_value::(num_bits) + .expect("get_value() should return OK"); + assert_eq!( + v, values[i], + "[{}]: expected {} but got {}", + i, values[i], v + ); + } + } + + #[test] + fn test_get_batch() { + const SIZE: &[usize] = &[1, 31, 32, 33, 128, 129]; + for s in SIZE { + for i in 0..33 { + match i { + 0...8 => test_get_batch_helper::(*s, i), + 9...16 => test_get_batch_helper::(*s, i), + _ => test_get_batch_helper::(*s, i), + } + } + } + } + + fn test_get_batch_helper(total: usize, num_bits: usize) + where + T: Default + Clone + Debug + Eq, + { + assert!(num_bits <= 32); + let num_bytes = ceil(num_bits as i64, 8); + let mut writer = BitWriter::new(num_bytes as usize * total); + + let values: Vec = random_numbers::(total) + .iter() + .map(|v| v & ((1u64 << num_bits) - 1) as u32) + .collect(); + + // Generic values used to check against actual values read from `get_batch`. + let expected_values: Vec = values + .iter() + .map(|v| unsafe { transmute_copy::(&v) }) + .collect(); + + for i in 0..total { + assert!(writer.put_value(values[i] as u64, num_bits)); + } + + let buf = writer.consume(); + let mut reader = BitReader::from(buf); + let mut batch = vec![T::default(); values.len()]; + let values_read = reader.get_batch::(&mut batch, num_bits); + assert_eq!(values_read, values.len()); + for i in 0..batch.len() { + assert_eq!( + batch[i], expected_values[i], + "num_bits = {}, index = {}", + num_bits, i + ); + } + } + + #[test] + fn test_put_aligned_roundtrip() { + test_put_aligned_rand_numbers::(4, 3); + test_put_aligned_rand_numbers::(16, 5); + test_put_aligned_rand_numbers::(32, 7); + test_put_aligned_rand_numbers::(32, 9); + test_put_aligned_rand_numbers::(32, 11); + test_put_aligned_rand_numbers::(32, 13); + test_put_aligned_rand_numbers::(32, 17); + test_put_aligned_rand_numbers::(32, 23); + } + + fn test_put_aligned_rand_numbers(total: usize, num_bits: usize) + where + T: Copy + Default + Debug + PartialEq, + Standard: Distribution, + { + assert!(num_bits <= 32); + assert!(total % 2 == 0); + + let aligned_value_byte_width = ::std::mem::size_of::(); + let value_byte_width = ceil(num_bits as i64, 8) as usize; + let mut writer = + BitWriter::new((total / 2) * (aligned_value_byte_width + value_byte_width)); + let values: Vec = random_numbers::(total / 2) + .iter() + .map(|v| v & ((1 << num_bits) - 1)) + .collect(); + let aligned_values = random_numbers::(total / 2); + + for i in 0..total { + let j = i / 2; + if i % 2 == 0 { + assert!( + writer.put_value(values[j] as u64, num_bits), + "[{}]: put_value() failed", + i + ); + } else { + assert!( + writer.put_aligned::(aligned_values[j], aligned_value_byte_width), + "[{}]: put_aligned() failed", + i + ); + } + } + + let mut reader = BitReader::from(writer.consume()); + for i in 0..total { + let j = i / 2; + if i % 2 == 0 { + let v = reader + .get_value::(num_bits) + 
.expect("get_value() should return OK"); + assert_eq!( + v, values[j] as u64, + "[{}]: expected {} but got {}", + i, values[j], v + ); + } else { + let v = reader + .get_aligned::(aligned_value_byte_width) + .expect("get_aligned() should return OK"); + assert_eq!( + v, aligned_values[j], + "[{}]: expected {:?} but got {:?}", + i, aligned_values[j], v + ); + } + } + } + + #[test] + fn test_put_vlq_int() { + let total = 64; + let mut writer = BitWriter::new(total * 32); + let values = random_numbers::(total); + for i in 0..total { + assert!( + writer.put_vlq_int(values[i] as u64), + "[{}]; put_vlq_int() failed", + i + ); + } + + let mut reader = BitReader::from(writer.consume()); + for i in 0..total { + let v = reader + .get_vlq_int() + .expect("get_vlq_int() should return OK"); + assert_eq!( + v as u32, values[i], + "[{}]: expected {} but got {}", + i, values[i], v + ); + } + } + + #[test] + fn test_put_zigzag_vlq_int() { + let total = 64; + let mut writer = BitWriter::new(total * 32); + let values = random_numbers::(total); + for i in 0..total { + assert!( + writer.put_zigzag_vlq_int(values[i] as i64), + "[{}]; put_zigzag_vlq_int() failed", + i + ); + } + + let mut reader = BitReader::from(writer.consume()); + for i in 0..total { + let v = reader + .get_zigzag_vlq_int() + .expect("get_zigzag_vlq_int() should return OK"); + assert_eq!( + v as i32, values[i], + "[{}]: expected {} but got {}", + i, values[i], v + ); + } + } +} diff --git a/rust/src/parquet/util/hash_util.rs b/rust/src/parquet/util/hash_util.rs new file mode 100644 index 0000000000000..c7bffef8bbf34 --- /dev/null +++ b/rust/src/parquet/util/hash_util.rs @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::parquet::data_type::AsBytes; + +/// Computes hash value for `data`, with a seed value `seed`. +/// The data type `T` must implement the `AsBytes` trait. 
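+/// (Editorial note, not in the original doc comment: on x86/x86_64 the
+/// function below dispatches at runtime, using the SSE4.2 CRC32 instructions
+/// when available and falling back to MurmurHash2-64A otherwise.)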
+pub fn hash(data: &T, seed: u32) -> u32 { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("sse4.2") { + unsafe { crc32_hash(data, seed) } + } else { + murmur_hash2_64a(data, seed as u64) as u32 + } + } +} + +const MURMUR_PRIME: u64 = 0xc6a4a7935bd1e995; +const MURMUR_R: i32 = 47; + +/// Rust implementation of MurmurHash2, 64-bit version for 64-bit platforms +fn murmur_hash2_64a(data: &T, seed: u64) -> u64 { + let data_bytes = data.as_bytes(); + let len = data_bytes.len(); + let len_64 = (len / 8) * 8; + let data_bytes_64 = unsafe { + ::std::slice::from_raw_parts(&data_bytes[0..len_64] as *const [u8] as *const u64, len / 8) + }; + + let mut h = seed ^ (MURMUR_PRIME.wrapping_mul(data_bytes.len() as u64)); + for v in data_bytes_64 { + let mut k = *v; + k = k.wrapping_mul(MURMUR_PRIME); + k ^= k >> MURMUR_R; + k = k.wrapping_mul(MURMUR_PRIME); + h ^= k; + h = h.wrapping_mul(MURMUR_PRIME); + } + + let data2 = &data_bytes[len_64..]; + + let v = len & 7; + if v == 7 { + h ^= (data2[6] as u64) << 48; + } + if v >= 6 { + h ^= (data2[5] as u64) << 40; + } + if v >= 5 { + h ^= (data2[4] as u64) << 32; + } + if v >= 4 { + h ^= (data2[3] as u64) << 24; + } + if v >= 3 { + h ^= (data2[2] as u64) << 16; + } + if v >= 2 { + h ^= (data2[1] as u64) << 8; + } + if v >= 1 { + h ^= data2[0] as u64; + } + if v > 0 { + h = h.wrapping_mul(MURMUR_PRIME); + } + + h ^= h >> MURMUR_R; + h = h.wrapping_mul(MURMUR_PRIME); + h ^= h >> MURMUR_R; + h +} + +/// CRC32 hash implementation using SSE4 instructions. Borrowed from Impala. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "sse4.2")] +unsafe fn crc32_hash(data: &T, seed: u32) -> u32 { + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + let bytes: &[u8] = data.as_bytes(); + let u32_num_bytes = ::std::mem::size_of::(); + let mut num_bytes = bytes.len(); + let num_words = num_bytes / u32_num_bytes; + num_bytes %= u32_num_bytes; + + let bytes_u32: &[u32] = ::std::slice::from_raw_parts( + &bytes[0..num_words * u32_num_bytes] as *const [u8] as *const u32, + num_words, + ); + + let mut offset = 0; + let mut hash = seed; + while offset < num_words { + hash = _mm_crc32_u32(hash, bytes_u32[offset]); + offset += 1; + } + + offset = num_words * u32_num_bytes; + while offset < num_bytes { + hash = _mm_crc32_u8(hash, bytes[offset]); + offset += 1; + } + + // The lower half of the CRC hash has poor uniformity, so swap the halves + // for anyone who only uses the first several bits of the hash. 
+ hash = (hash << 16) | (hash >> 16); + hash +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_murmur2_64a() { + let result = murmur_hash2_64a(&"hello", 123); + assert_eq!(result, 2597646618390559622); + + let result = murmur_hash2_64a(&"helloworld", 123); + assert_eq!(result, 4934371746140206573); + + let result = murmur_hash2_64a(&"helloworldparquet", 123); + assert_eq!(result, 2392198230801491746); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_crc32() { + if is_x86_feature_detected!("sse4.2") { + unsafe { + let result = crc32_hash(&"hello", 123); + assert_eq!(result, 2927487359); + + let result = crc32_hash(&"helloworld", 123); + assert_eq!(result, 314229527); + + let result = crc32_hash(&"helloworldparquet", 123); + assert_eq!(result, 667078870); + } + } + } +} diff --git a/rust/src/parquet/util/io.rs b/rust/src/parquet/util/io.rs new file mode 100644 index 0000000000000..8724e67c2dbe7 --- /dev/null +++ b/rust/src/parquet/util/io.rs @@ -0,0 +1,220 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{cmp, fs::File, io::*, sync::Mutex}; + +use crate::parquet::file::reader::ParquetReader; + +// ---------------------------------------------------------------------- +// Read/Write wrappers for `File`. + +/// Position trait returns the current position in the stream. +/// Should be viewed as a lighter version of `Seek` that does not allow seek operations, +/// and does not require mutable reference for the current position. +pub trait Position { + /// Returns position in the stream. + fn pos(&self) -> u64; +} + +/// Struct that represents a slice of a file data with independent start position and +/// length. Internally clones provided file handle, wraps with BufReader and resets +/// position before any read. +/// +/// This is workaround and alternative for `file.try_clone()` method. It clones `File` +/// while preserving independent position, which is not available with `try_clone()`. +/// +/// Designed after `arrow::io::RandomAccessFile`. 
+pub struct FileSource { + reader: Mutex>, + start: u64, // start position in a file + end: u64, // end position in a file +} + +impl FileSource { + /// Creates new file reader with start and length from a file handle + pub fn new(fd: &R, start: u64, length: usize) -> Self { + Self { + reader: Mutex::new(BufReader::new(fd.try_clone().unwrap())), + start, + end: start + length as u64, + } + } +} + +impl Read for FileSource { + fn read(&mut self, buf: &mut [u8]) -> Result { + let mut reader = self + .reader + .lock() + .map_err(|err| Error::new(ErrorKind::Other, err.to_string()))?; + + let bytes_to_read = cmp::min(buf.len(), (self.end - self.start) as usize); + let buf = &mut buf[0..bytes_to_read]; + + reader.seek(SeekFrom::Start(self.start as u64))?; + let res = reader.read(buf); + if let Ok(bytes_read) = res { + self.start += bytes_read as u64; + } + + res + } +} + +impl Position for FileSource { + fn pos(&self) -> u64 { + self.start + } +} + +/// Struct that represents `File` output stream with position tracking. +/// Used as a sink in file writer. +pub struct FileSink { + buf: BufWriter, + // This is not necessarily position in the underlying file, + // but rather current position in the sink. + pos: u64, +} + +impl FileSink { + /// Creates new file sink. + /// Position is set to whatever position file has. + pub fn new(file: &File) -> Self { + let mut owned_file = file.try_clone().unwrap(); + let pos = owned_file.seek(SeekFrom::Current(0)).unwrap(); + Self { + buf: BufWriter::new(owned_file), + pos, + } + } +} + +impl Write for FileSink { + fn write(&mut self, buf: &[u8]) -> Result { + let num_bytes = self.buf.write(buf)?; + self.pos += num_bytes as u64; + Ok(num_bytes) + } + + fn flush(&mut self) -> Result<()> { + self.buf.flush() + } +} + +impl Position for FileSink { + fn pos(&self) -> u64 { + self.pos + } +} + +// Position implementation for Cursor to use in various tests. +impl<'a> Position for Cursor<&'a mut Vec> { + fn pos(&self) -> u64 { + self.position() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::parquet::util::test_common::{get_temp_file, get_test_file}; + + #[test] + fn test_io_read_fully() { + let mut buf = vec![0; 8]; + let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); + + let bytes_read = src.read(&mut buf[..]).unwrap(); + assert_eq!(bytes_read, 4); + assert_eq!(buf, vec![b'P', b'A', b'R', b'1', 0, 0, 0, 0]); + } + + #[test] + fn test_io_read_in_chunks() { + let mut buf = vec![0; 4]; + let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); + + let bytes_read = src.read(&mut buf[0..2]).unwrap(); + assert_eq!(bytes_read, 2); + let bytes_read = src.read(&mut buf[2..]).unwrap(); + assert_eq!(bytes_read, 2); + assert_eq!(buf, vec![b'P', b'A', b'R', b'1']); + } + + #[test] + fn test_io_read_pos() { + let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); + + src.read(&mut vec![0; 1]).unwrap(); + assert_eq!(src.pos(), 1); + + src.read(&mut vec![0; 4]).unwrap(); + assert_eq!(src.pos(), 4); + } + + #[test] + fn test_io_read_over_limit() { + let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); + + // Read all bytes from source + src.read(&mut vec![0; 128]).unwrap(); + assert_eq!(src.pos(), 4); + + // Try reading again, should return 0 bytes. 
+ let bytes_read = src.read(&mut vec![0; 128]).unwrap(); + assert_eq!(bytes_read, 0); + assert_eq!(src.pos(), 4); + } + + #[test] + fn test_io_seek_switch() { + let mut buf = vec![0; 4]; + let mut file = get_test_file("alltypes_plain.parquet"); + let mut src = FileSource::new(&file, 0, 4); + + file.seek(SeekFrom::Start(5 as u64)) + .expect("File seek to a position"); + + let bytes_read = src.read(&mut buf[..]).unwrap(); + assert_eq!(bytes_read, 4); + assert_eq!(buf, vec![b'P', b'A', b'R', b'1']); + } + + #[test] + fn test_io_write_with_pos() { + let mut file = get_temp_file("file_sink_test", &[b'a', b'b', b'c']); + file.seek(SeekFrom::Current(3)).unwrap(); + + // Write into sink + let mut sink = FileSink::new(&file); + assert_eq!(sink.pos(), 3); + + sink.write(&[b'd', b'e', b'f', b'g']).unwrap(); + assert_eq!(sink.pos(), 7); + + sink.flush().unwrap(); + assert_eq!(sink.pos(), file.seek(SeekFrom::Current(0)).unwrap()); + + // Read data using file chunk + let mut res = vec![0u8; 7]; + let mut chunk = FileSource::new(&file, 0, file.metadata().unwrap().len() as usize); + chunk.read(&mut res[..]).unwrap(); + + assert_eq!(res, vec![b'a', b'b', b'c', b'd', b'e', b'f', b'g']); + } +} diff --git a/rust/src/parquet/util/memory.rs b/rust/src/parquet/util/memory.rs new file mode 100644 index 0000000000000..69a389e50fe92 --- /dev/null +++ b/rust/src/parquet/util/memory.rs @@ -0,0 +1,524 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utility methods and structs for working with memory. + +use std::{ + cell::Cell, + fmt::{Debug, Display, Formatter, Result as FmtResult}, + io::{Result as IoResult, Write}, + mem, + ops::{Index, IndexMut}, + rc::{Rc, Weak}, +}; + +// ---------------------------------------------------------------------- +// Memory Tracker classes + +/// Reference counted pointer for [`MemTracker`]. +pub type MemTrackerPtr = Rc; +/// Non-owning reference for [`MemTracker`]. +pub type WeakMemTrackerPtr = Weak; + +/// Struct to track memory usage information. +#[derive(Debug)] +pub struct MemTracker { + // In the tuple, the first element is the current memory allocated (in bytes), + // and the second element is the maximum memory allocated so far (in bytes). + memory_usage: Cell<(i64, i64)>, +} + +impl MemTracker { + /// Creates new memory tracker. + #[inline] + pub fn new() -> MemTracker { + MemTracker { + memory_usage: Cell::new((0, 0)), + } + } + + /// Returns the current memory consumption, in bytes. + pub fn memory_usage(&self) -> i64 { + self.memory_usage.get().0 + } + + /// Returns the maximum memory consumption so far, in bytes. + pub fn max_memory_usage(&self) -> i64 { + self.memory_usage.get().1 + } + + /// Adds `num_bytes` to the memory consumption tracked by this memory tracker. 
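+    /// (Editorial note: deallocation is recorded by calling this with a
+    /// negative `num_bytes`, which is what the `Drop` implementations of
+    /// `Buffer` and `BufferPtr` further below do.)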
+ #[inline] + pub fn alloc(&self, num_bytes: i64) { + let (current, mut maximum) = self.memory_usage.get(); + let new_current = current + num_bytes; + if new_current > maximum { + maximum = new_current + } + self.memory_usage.set((new_current, maximum)); + } +} + +// ---------------------------------------------------------------------- +// Buffer classes + +/// Type alias for [`Buffer`]. +pub type ByteBuffer = Buffer; +/// Type alias for [`BufferPtr`]. +pub type ByteBufferPtr = BufferPtr; + +/// A resize-able buffer class with generic member, with optional memory tracker. +/// +/// Note that a buffer has two attributes: +/// `capacity` and `size`: the former is the total number of space reserved for +/// the buffer, while the latter is the actual number of elements. +/// Invariant: `capacity` >= `size`. +/// The total allocated bytes for a buffer equals to `capacity * sizeof()`. +pub struct Buffer { + data: Vec, + mem_tracker: Option, + type_length: usize, +} + +impl Buffer { + /// Creates new empty buffer. + pub fn new() -> Self { + Buffer { + data: vec![], + mem_tracker: None, + type_length: ::std::mem::size_of::(), + } + } + + /// Adds [`MemTracker`] for this buffer. + #[inline] + pub fn with_mem_tracker(mut self, mc: MemTrackerPtr) -> Self { + mc.alloc((self.data.capacity() * self.type_length) as i64); + self.mem_tracker = Some(mc); + self + } + + /// Returns slice of data in this buffer. + #[inline] + pub fn data(&self) -> &[T] { + self.data.as_slice() + } + + /// Sets data for this buffer. + #[inline] + pub fn set_data(&mut self, new_data: Vec) { + if let Some(ref mc) = self.mem_tracker { + let capacity_diff = new_data.capacity() as i64 - self.data.capacity() as i64; + mc.alloc(capacity_diff * self.type_length as i64); + } + self.data = new_data; + } + + /// Resizes underlying data in place to a new length `new_size`. + /// + /// If `new_size` is less than current length, data is truncated, otherwise, it is + /// extended to `new_size` with provided default value `init_value`. + /// + /// Memory tracker is also updated, if available. + #[inline] + pub fn resize(&mut self, new_size: usize, init_value: T) { + let old_capacity = self.data.capacity(); + self.data.resize(new_size, init_value); + if let Some(ref mc) = self.mem_tracker { + let capacity_diff = self.data.capacity() as i64 - old_capacity as i64; + mc.alloc(capacity_diff * self.type_length as i64); + } + } + + /// Clears underlying data. + #[inline] + pub fn clear(&mut self) { + self.data.clear() + } + + /// Reserves capacity `additional_capacity` for underlying data vector. + /// + /// Memory tracker is also updated, if available. + #[inline] + pub fn reserve(&mut self, additional_capacity: usize) { + let old_capacity = self.data.capacity(); + self.data.reserve(additional_capacity); + if self.data.capacity() > old_capacity { + if let Some(ref mc) = self.mem_tracker { + let capacity_diff = self.data.capacity() as i64 - old_capacity as i64; + mc.alloc(capacity_diff * self.type_length as i64); + } + } + } + + /// Returns [`BufferPtr`] with buffer data. + /// Buffer data is reset. + #[inline] + pub fn consume(&mut self) -> BufferPtr { + let old_data = mem::replace(&mut self.data, vec![]); + let mut result = BufferPtr::new(old_data); + if let Some(ref mc) = self.mem_tracker { + result = result.with_mem_tracker(mc.clone()); + } + result + } + + /// Adds `value` to the buffer. + #[inline] + pub fn push(&mut self, value: T) { + self.data.push(value) + } + + /// Returns current capacity for the buffer. 
+ #[inline] + pub fn capacity(&self) -> usize { + self.data.capacity() + } + + /// Returns current size for the buffer. + #[inline] + pub fn size(&self) -> usize { + self.data.len() + } + + /// Returns `true` if memory tracker is added to buffer, `false` otherwise. + #[inline] + pub fn is_mem_tracked(&self) -> bool { + self.mem_tracker.is_some() + } + + /// Returns memory tracker associated with this buffer. + /// This may panic, if memory tracker is not set, use method above to check if + /// memory tracker is available. + #[inline] + pub fn mem_tracker(&self) -> &MemTrackerPtr { + self.mem_tracker.as_ref().unwrap() + } +} + +impl Index for Buffer { + type Output = T; + + fn index(&self, index: usize) -> &T { + &self.data[index] + } +} + +impl IndexMut for Buffer { + fn index_mut(&mut self, index: usize) -> &mut T { + &mut self.data[index] + } +} + +// TODO: implement this for other types +impl Write for Buffer { + #[inline] + fn write(&mut self, buf: &[u8]) -> IoResult { + let old_capacity = self.data.capacity(); + let bytes_written = self.data.write(buf)?; + if let Some(ref mc) = self.mem_tracker { + if self.data.capacity() - old_capacity > 0 { + mc.alloc((self.data.capacity() - old_capacity) as i64) + } + } + Ok(bytes_written) + } + + fn flush(&mut self) -> IoResult<()> { + // No-op + self.data.flush() + } +} + +impl AsRef<[u8]> for Buffer { + fn as_ref(&self) -> &[u8] { + self.data.as_slice() + } +} + +impl Drop for Buffer { + #[inline] + fn drop(&mut self) { + if let Some(ref mc) = self.mem_tracker { + mc.alloc(-((self.data.capacity() * self.type_length) as i64)); + } + } +} + +// ---------------------------------------------------------------------- +// Immutable Buffer (BufferPtr) classes + +/// An representation of a slice on a reference-counting and read-only byte array. +/// Sub-slices can be further created from this. The byte array will be released +/// when all slices are dropped. +#[derive(Clone, Debug)] +pub struct BufferPtr { + data: Rc>, + start: usize, + len: usize, + // TODO: will this create too many references? rethink about this. + mem_tracker: Option, +} + +impl BufferPtr { + /// Creates new buffer from a vector. + pub fn new(v: Vec) -> Self { + let len = v.len(); + Self { + data: Rc::new(v), + start: 0, + len, + mem_tracker: None, + } + } + + /// Returns slice of data in this buffer. + pub fn data(&self) -> &[T] { + &self.data[self.start..self.start + self.len] + } + + /// Updates this buffer with new `start` position and length `len`. + /// + /// Range should be within current start position and length. + pub fn with_range(mut self, start: usize, len: usize) -> Self { + assert!(start <= self.len); + assert!(start + len <= self.len); + self.start = start; + self.len = len; + self + } + + /// Adds memory tracker to this buffer. + pub fn with_mem_tracker(mut self, mc: MemTrackerPtr) -> Self { + self.mem_tracker = Some(mc); + self + } + + /// Returns start position of this buffer. + pub fn start(&self) -> usize { + self.start + } + + /// Returns length of this buffer + pub fn len(&self) -> usize { + self.len + } + + /// Returns `true` if this buffer has memory tracker, `false` otherwise. + pub fn is_mem_tracked(&self) -> bool { + self.mem_tracker.is_some() + } + + /// Returns a shallow copy of the buffer. + /// Reference counted pointer to the data is copied. 
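+    /// (Editorial note: no bytes are copied; the returned slice shares the
+    /// same reference-counted backing vector, which is only released once
+    /// every outstanding slice has been dropped.)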
+ pub fn all(&self) -> BufferPtr { + BufferPtr { + data: self.data.clone(), + start: self.start, + len: self.len, + mem_tracker: self.mem_tracker.as_ref().map(|p| p.clone()), + } + } + + /// Returns a shallow copy of the buffer that starts with `start` position. + pub fn start_from(&self, start: usize) -> BufferPtr { + assert!(start <= self.len); + BufferPtr { + data: self.data.clone(), + start: self.start + start, + len: self.len - start, + mem_tracker: self.mem_tracker.as_ref().map(|p| p.clone()), + } + } + + /// Returns a shallow copy that is a range slice within this buffer. + pub fn range(&self, start: usize, len: usize) -> BufferPtr { + assert!(start + len <= self.len); + BufferPtr { + data: self.data.clone(), + start: self.start + start, + len, + mem_tracker: self.mem_tracker.as_ref().map(|p| p.clone()), + } + } +} + +impl Index for BufferPtr { + type Output = T; + + fn index(&self, index: usize) -> &T { + assert!(index < self.len); + &self.data[self.start + index] + } +} + +impl Display for BufferPtr { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + write!(f, "{:?}", self.data) + } +} + +impl Drop for BufferPtr { + fn drop(&mut self) { + if self.is_mem_tracked() + && Rc::strong_count(&self.data) == 1 + && Rc::weak_count(&self.data) == 0 + { + let mc = self.mem_tracker.as_ref().unwrap(); + mc.alloc(-(self.data.capacity() as i64)); + } + } +} + +impl AsRef<[u8]> for BufferPtr { + fn as_ref(&self) -> &[u8] { + &self.data[self.start..self.start + self.len] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_byte_buffer_mem_tracker() { + let mem_tracker = Rc::new(MemTracker::new()); + + let mut buffer = ByteBuffer::new().with_mem_tracker(mem_tracker.clone()); + buffer.set_data(vec![0; 10]); + assert_eq!(mem_tracker.memory_usage(), buffer.capacity() as i64); + buffer.set_data(vec![0; 20]); + let capacity = buffer.capacity() as i64; + assert_eq!(mem_tracker.memory_usage(), capacity); + + let max_capacity = { + let mut buffer2 = ByteBuffer::new().with_mem_tracker(mem_tracker.clone()); + buffer2.reserve(30); + assert_eq!( + mem_tracker.memory_usage(), + buffer2.capacity() as i64 + capacity + ); + buffer2.set_data(vec![0; 100]); + assert_eq!( + mem_tracker.memory_usage(), + buffer2.capacity() as i64 + capacity + ); + buffer2.capacity() as i64 + capacity + }; + + assert_eq!(mem_tracker.memory_usage(), capacity); + assert_eq!(mem_tracker.max_memory_usage(), max_capacity); + + buffer.reserve(40); + assert_eq!(mem_tracker.memory_usage(), buffer.capacity() as i64); + + buffer.consume(); + assert_eq!(mem_tracker.memory_usage(), buffer.capacity() as i64); + } + + #[test] + fn test_byte_ptr_mem_tracker() { + let mem_tracker = Rc::new(MemTracker::new()); + + let mut buffer = ByteBuffer::new().with_mem_tracker(mem_tracker.clone()); + buffer.set_data(vec![0; 60]); + + { + let buffer_capacity = buffer.capacity() as i64; + let buf_ptr = buffer.consume(); + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + { + let buf_ptr1 = buf_ptr.all(); + { + let _ = buf_ptr.start_from(20); + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + } + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + let _ = buf_ptr1.range(30, 20); + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + } + assert_eq!(mem_tracker.memory_usage(), buffer_capacity); + } + assert_eq!(mem_tracker.memory_usage(), buffer.capacity() as i64); + } + + #[test] + fn test_byte_buffer() { + let mut buffer = ByteBuffer::new(); + assert_eq!(buffer.size(), 0); + assert_eq!(buffer.capacity(), 0); + + 
let mut buffer2 = ByteBuffer::new(); + buffer2.reserve(40); + assert_eq!(buffer2.size(), 0); + assert_eq!(buffer2.capacity(), 40); + + buffer.set_data((0..5).collect()); + assert_eq!(buffer.size(), 5); + assert_eq!(buffer[4], 4); + + buffer.set_data((0..20).collect()); + assert_eq!(buffer.size(), 20); + assert_eq!(buffer[10], 10); + + let expected: Vec = (0..20).collect(); + { + let data = buffer.data(); + assert_eq!(data, expected.as_slice()); + } + + buffer.reserve(40); + assert!(buffer.capacity() >= 40); + + let byte_ptr = buffer.consume(); + assert_eq!(buffer.size(), 0); + assert_eq!(byte_ptr.as_ref(), expected.as_slice()); + + let values: Vec = (0..30).collect(); + let _ = buffer.write(values.as_slice()); + let _ = buffer.flush(); + + assert_eq!(buffer.data(), values.as_slice()); + } + + #[test] + fn test_byte_ptr() { + let values = (0..50).collect(); + let ptr = ByteBufferPtr::new(values); + assert_eq!(ptr.len(), 50); + assert_eq!(ptr.start(), 0); + assert_eq!(ptr[40], 40); + + let ptr2 = ptr.all(); + assert_eq!(ptr2.len(), 50); + assert_eq!(ptr2.start(), 0); + assert_eq!(ptr2[40], 40); + + let ptr3 = ptr.start_from(20); + assert_eq!(ptr3.len(), 30); + assert_eq!(ptr3.start(), 20); + assert_eq!(ptr3[0], 20); + + let ptr4 = ptr3.range(10, 10); + assert_eq!(ptr4.len(), 10); + assert_eq!(ptr4.start(), 30); + assert_eq!(ptr4[0], 30); + + let expected: Vec = (30..40).collect(); + assert_eq!(ptr4.as_ref(), expected.as_slice()); + } +} diff --git a/rust/src/parquet/util/mod.rs b/rust/src/parquet/util/mod.rs new file mode 100644 index 0000000000000..669cc3c0a495c --- /dev/null +++ b/rust/src/parquet/util/mod.rs @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod io; +pub mod memory; +#[macro_use] +pub mod bit_util; +mod bit_packing; +pub mod hash_util; + +#[cfg(test)] +pub mod test_common; diff --git a/rust/src/parquet/util/test_common.rs b/rust/src/parquet/util/test_common.rs new file mode 100644 index 0000000000000..f9b1af4a5cef4 --- /dev/null +++ b/rust/src/parquet/util/test_common.rs @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use rand::{ + distributions::{range::SampleRange, Distribution, Standard}, + thread_rng, Rng, +}; +use std::{env, fs, io::Write, path::PathBuf, str::FromStr}; + +use crate::parquet::data_type::*; +use crate::parquet::util::memory::ByteBufferPtr; + +/// Random generator of data type `T` values and sequences. +pub trait RandGen { + fn gen(len: i32) -> T::T; + + fn gen_vec(len: i32, total: usize) -> Vec { + let mut result = vec![]; + for _ in 0..total { + result.push(Self::gen(len)) + } + result + } +} + +impl RandGen for T { + default fn gen(_: i32) -> T::T { + panic!("Unsupported data type"); + } +} + +impl RandGen for BoolType { + fn gen(_: i32) -> bool { + thread_rng().gen::() + } +} + +impl RandGen for Int32Type { + fn gen(_: i32) -> i32 { + thread_rng().gen::() + } +} + +impl RandGen for Int64Type { + fn gen(_: i32) -> i64 { + thread_rng().gen::() + } +} + +impl RandGen for Int96Type { + fn gen(_: i32) -> Int96 { + let mut rng = thread_rng(); + let mut result = Int96::new(); + result.set_data(rng.gen::(), rng.gen::(), rng.gen::()); + result + } +} + +impl RandGen for FloatType { + fn gen(_: i32) -> f32 { + thread_rng().gen::() + } +} + +impl RandGen for DoubleType { + fn gen(_: i32) -> f64 { + thread_rng().gen::() + } +} + +impl RandGen for ByteArrayType { + fn gen(_: i32) -> ByteArray { + let mut rng = thread_rng(); + let mut result = ByteArray::new(); + let mut value = vec![]; + let len = rng.gen_range::(0, 128); + for _ in 0..len { + value.push(rng.gen_range(0, 255) & 0xFF); + } + result.set_data(ByteBufferPtr::new(value)); + result + } +} + +impl RandGen for FixedLenByteArrayType { + fn gen(len: i32) -> ByteArray { + let mut rng = thread_rng(); + let value_len = if len < 0 { + rng.gen_range::(0, 128) + } else { + len as usize + }; + let value = random_bytes(value_len); + ByteArray::from(value) + } +} + +pub fn random_bytes(n: usize) -> Vec { + let mut result = vec![]; + let mut rng = thread_rng(); + for _ in 0..n { + result.push(rng.gen_range(0, 255) & 0xFF); + } + result +} + +pub fn random_bools(n: usize) -> Vec { + let mut result = vec![]; + let mut rng = thread_rng(); + for _ in 0..n { + result.push(rng.gen::()); + } + result +} + +pub fn random_numbers(n: usize) -> Vec +where + Standard: Distribution, +{ + let mut rng = thread_rng(); + Standard.sample_iter(&mut rng).take(n).collect() +} + +pub fn random_numbers_range(n: usize, low: T, high: T, result: &mut Vec) +where + T: PartialOrd + SampleRange + Copy, +{ + let mut rng = thread_rng(); + for _ in 0..n { + result.push(rng.gen_range(low, high)); + } +} + +/// Returns path to the test parquet file in 'data' directory +pub fn get_test_path(file_name: &str) -> PathBuf { + let result = env::var("PARQUET_TEST_DATA"); + if result.is_err() { + panic!("Please point PARQUET_TEST_DATA environment variable to the test data directory"); + } + let mut pathbuf = PathBuf::from_str(result.unwrap().as_str()).unwrap(); + pathbuf.push(file_name); + pathbuf +} + +/// Returns file handle for a test parquet file from 'data' directory +pub fn get_test_file(file_name: &str) -> fs::File { + let file = fs::File::open(get_test_path(file_name).as_path()); + if file.is_err() { + panic!("Test file {} not found", file_name) + } + file.unwrap() +} + +/// Returns file handle for a temp file in 'target' directory with a provided content +pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File { + // build tmp path to a file in 
"target/debug/testdata" + let mut path_buf = env::current_dir().unwrap(); + path_buf.push("target"); + path_buf.push("debug"); + path_buf.push("testdata"); + fs::create_dir_all(&path_buf).unwrap(); + path_buf.push(file_name); + + // write file content + let mut tmp_file = fs::File::create(path_buf.as_path()).unwrap(); + tmp_file.write_all(content).unwrap(); + tmp_file.sync_all().unwrap(); + + // return file handle for both read and write + let file = fs::OpenOptions::new() + .read(true) + .write(true) + .open(path_buf.as_path()); + assert!(file.is_ok()); + file.unwrap() +} diff --git a/rust/src/record_batch.rs b/rust/src/record_batch.rs index 2666770460e84..e6a8e79500f08 100644 --- a/rust/src/record_batch.rs +++ b/rust/src/record_batch.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. +use std::sync::Arc; + use crate::array::*; use crate::datatypes::*; -use std::sync::Arc; /// A batch of column-oriented data pub struct RecordBatch { @@ -67,6 +68,7 @@ unsafe impl Sync for RecordBatch {} #[cfg(test)] mod tests { use super::*; + use crate::array_data::*; use crate::buffer::*; diff --git a/rust/src/tensor.rs b/rust/src/tensor.rs index 175b68d81f188..7272a2cf14631 100644 --- a/rust/src/tensor.rs +++ b/rust/src/tensor.rs @@ -216,6 +216,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { #[cfg(test)] mod tests { use super::*; + use crate::buffer::Buffer; use crate::builder::*; From 5a5d807bc9ccebc4fd9ec733788aede00a8bdd71 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 18 Dec 2018 15:29:27 +0100 Subject: [PATCH 23/80] [C++] Make Doxygen less verbose (#3213) --- cpp/apidoc/Doxyfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index e5285873c9e02..e7eefba130140 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -741,7 +741,7 @@ CITE_BIB_FILES = # messages are off. # The default value is: NO. -QUIET = NO +QUIET = YES # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES From d432cb4a27ce40ed4cf414c8267081c8dff89d82 Mon Sep 17 00:00:00 2001 From: Paddy Horan Date: Tue, 18 Dec 2018 15:37:51 +0100 Subject: [PATCH 24/80] ARROW-2560: [Rust] The Rust README should include Rust-specific information on contributing Author: Paddy Horan Closes #3210 from paddyhoran/ARROW-2560 and squashes the following commits: 8f81cb15 Updated README with parquet/rustfmt info --- rust/README.md | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/rust/README.md b/rust/README.md index f8908f8e6e64d..cbfd4dd684a0f 100644 --- a/rust/README.md +++ b/rust/README.md @@ -24,7 +24,8 @@ ## Status -This is a native Rust implementation of Apache Arrow. The current status is: +This is a native Rust implementation of Apache Arrow. Currently the project +is developed and tested against nightly Rust. The current status is: - [x] Primitive Arrays - [x] List Arrays @@ -36,6 +37,13 @@ This is a native Rust implementation of Apache Arrow. The current status is: - [ ] Arrow IPC - [ ] Interop tests with other implementations +## Dependencies + +Parquet support for Apache Arrow requires LLVM. Our windows CI image +includes LLVM but to build the libraries locally windows users will have +to install LLVM. Follow [this](https://github.com/appveyor/ci/issues/2651) +link for info. 
+ ## Examples The examples folder shows how to construct some different types of Arrow @@ -51,8 +59,24 @@ cargo run --example read_csv ## Run Tests +Parquet support in Arrow requires data to test against, this data is in a +git submodule. To pull down this data run the following: + +```bash +git submodule update --init +``` + +The data can then be found in `cpp/submodules/parquet_testing/data`. +Create a new environment variable called `PARQUET_TEST_DATA` to point +to this location and then `cargo test` as usual. + +Our CI uses `rustfmt` to check code formatting. Although the project is +built and tested against nightly rust we use the stable version of +`rustfmt`. So before submitting a PR be sure to run the following +and check for lint issues: + ```bash -cargo test +cargo +stable fmt --all -- --check ``` # Publishing to crates.io From 36ded49568b8c3d664f0f14d06ec199ef5286857 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 15:47:09 +0100 Subject: [PATCH 25/80] ARROW-3058: [Python] Raise more helpful better error message when writing a pandas.DataFrame to Feather format that requires a chunked layout Author: Wes McKinney Closes #3178 from wesm/ARROW-3058 and squashes the following commits: 4a10687f Raise more helpful better error message when a large binary/string column yields ChunkedArray on conversion to pyarrow.Table --- python/pyarrow/feather.py | 26 +++++++++++++++++++++----- python/pyarrow/tests/test_feather.py | 18 ++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index faa2f7d892ee0..3713c1f135036 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -23,7 +23,7 @@ from pyarrow.compat import pdapi from pyarrow.lib import FeatherError # noqa -from pyarrow.lib import RecordBatch, concat_tables +from pyarrow.lib import Table, concat_tables import pyarrow.lib as ext @@ -62,6 +62,21 @@ def read_pandas(self, columns=None, use_threads=True): use_threads=use_threads) +def check_chunked_overflow(col): + if col.data.num_chunks == 1: + return + + if col.type in (ext.binary(), ext.string()): + raise ValueError("Column '{0}' exceeds 2GB maximum capacity of " + "a Feather binary column. This restriction may be " + "lifted in the future".format(col.name)) + else: + # TODO(wesm): Not sure when else this might be reached + raise ValueError("Column '{0}' of type {1} was chunked on conversion " + "to Arrow and cannot be currently written to " + "Feather format".format(col.name, str(col.type))) + + class FeatherWriter(object): def __init__(self, dest): @@ -78,10 +93,11 @@ def write(self, df): # TODO(wesm): Remove this length check, see ARROW-1732 if len(df.columns) > 0: - batch = RecordBatch.from_pandas(df, preserve_index=False) - for i, name in enumerate(batch.schema.names): - col = batch[i] - self.writer.write_array(name, col) + table = Table.from_pandas(df, preserve_index=False) + for i, name in enumerate(table.schema.names): + col = table[i] + check_chunked_overflow(col) + self.writer.write_array(name, col.data.chunk(0)) self.writer.close() diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 01b567216bfcf..d144f989d0f0a 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+import io import os import sys import tempfile @@ -535,3 +536,20 @@ def test_unsupported(self): def test_large_dataframe(self): df = pd.DataFrame({'A': np.arange(400000000)}) self._check_pandas_roundtrip(df) + + +@pytest.mark.large_memory +def test_chunked_binary_error_message(): + # ARROW-3058: As Feather does not yet support chunked columns, we at least + # make sure it's clear to the user what is going on + + # 2^31 + 1 bytes + values = [b'x'] + [ + b'x' * (1 << 20) + ] * 2 * (1 << 10) + df = pd.DataFrame({'byte_col': values}) + + with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum " + "capacity of a Feather binary column. This restriction " + "may be lifted in the future"): + write_feather(df, io.BytesIO()) From e832df36c2d44d02273de851db3cfcd8c231f479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 18 Dec 2018 16:43:39 +0100 Subject: [PATCH 26/80] ARROW-3387: [C++] Implement Binary to String cast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: François Saint-Jacques Closes #3211 from fsaintjacques/ARROW-3387-cast-binary-to-string and squashes the following commits: 16cdb8ef ARROE-3387: clang-format 1949d377 ARROW-3387: Address review comments 31092b9f ARROW-3387: Implement Binary to String cast f045d64f ARROW-3387: Partition compute-test.cc in separate files 5358148e ARROW-3387: Rename CopyData to ZeroCopyData --- cpp/src/arrow/compute/compute-test.cc | 1551 +---------------- cpp/src/arrow/compute/kernels/CMakeLists.txt | 4 + cpp/src/arrow/compute/kernels/boolean-test.cc | 157 ++ cpp/src/arrow/compute/kernels/cast-test.cc | 1197 +++++++++++++ cpp/src/arrow/compute/kernels/cast.cc | 106 +- cpp/src/arrow/compute/kernels/cast.h | 9 +- cpp/src/arrow/compute/kernels/hash-test.cc | 344 ++++ cpp/src/arrow/compute/kernels/util-internal.h | 4 +- cpp/src/arrow/compute/test-util.h | 57 + cpp/src/arrow/util/utf8.h | 8 + 10 files changed, 1873 insertions(+), 1564 deletions(-) create mode 100644 cpp/src/arrow/compute/kernels/boolean-test.cc create mode 100644 cpp/src/arrow/compute/kernels/cast-test.cc create mode 100644 cpp/src/arrow/compute/kernels/hash-test.cc create mode 100644 cpp/src/arrow/compute/test-util.h diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index e34a086d8e2d9..8129441b41fa1 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -39,10 +39,8 @@ #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" -#include "arrow/compute/kernels/boolean.h" -#include "arrow/compute/kernels/cast.h" -#include "arrow/compute/kernels/hash.h" #include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" using std::shared_ptr; using std::vector; @@ -50,26 +48,6 @@ using std::vector; namespace arrow { namespace compute { -class ComputeFixture { - public: - ComputeFixture() : ctx_(default_memory_pool()) {} - - protected: - FunctionContext ctx_; -}; - -template -shared_ptr _MakeArray(const shared_ptr& type, const vector& values, - const vector& is_valid) { - shared_ptr result; - if (is_valid.size() > 0) { - ArrayFromVector(type, is_valid, values, &result); - } else { - ArrayFromVector(type, values, &result); - } - return result; -} - // ---------------------------------------------------------------------- // Datum @@ -91,1533 +69,6 @@ TEST(TestDatum, ImplicitConstructors) { CheckImplicitConstructor
(Datum::TABLE); } -// ---------------------------------------------------------------------- -// Cast - -static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) { - ASSERT_EQ(left.data()->buffers[buffer_index].get(), - right.data()->buffers[buffer_index].get()); -} - -class TestCast : public ComputeFixture, public TestBase { - public: - void CheckPass(const Array& input, const Array& expected, - const shared_ptr& out_type, const CastOptions& options) { - shared_ptr result; - ASSERT_OK(Cast(&ctx_, input, out_type, options, &result)); - ASSERT_ARRAYS_EQUAL(expected, *result); - } - - template - void CheckFails(const shared_ptr& in_type, const vector& in_values, - const vector& is_valid, const shared_ptr& out_type, - const CastOptions& options) { - shared_ptr input, result; - if (is_valid.size() > 0) { - ArrayFromVector(in_type, is_valid, in_values, &input); - } else { - ArrayFromVector(in_type, in_values, &input); - } - ASSERT_RAISES(Invalid, Cast(&ctx_, *input, out_type, options, &result)); - } - - void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { - shared_ptr result; - ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); - AssertBufferSame(input, *result, 0); - AssertBufferSame(input, *result, 1); - } - - template - void CheckCase(const shared_ptr& in_type, const vector& in_values, - const vector& is_valid, const shared_ptr& out_type, - const vector& out_values, const CastOptions& options) { - DCHECK_EQ(in_values.size(), out_values.size()); - shared_ptr input, expected; - if (is_valid.size() > 0) { - DCHECK_EQ(is_valid.size(), out_values.size()); - ArrayFromVector(in_type, is_valid, in_values, &input); - ArrayFromVector(out_type, is_valid, out_values, &expected); - } else { - ArrayFromVector(in_type, in_values, &input); - ArrayFromVector(out_type, out_values, &expected); - } - CheckPass(*input, *expected, out_type, options); - - // Check a sliced variant - if (input->length() > 1) { - CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); - } - } -}; - -TEST_F(TestCast, SameTypeZeroCopy) { - vector is_valid = {true, false, true, true, true}; - vector v1 = {0, 1, 2, 3, 4}; - - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - shared_ptr result; - ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); - - AssertBufferSame(*arr, *result, 0); - AssertBufferSame(*arr, *result, 1); -} - -TEST_F(TestCast, ToBoolean) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // int8, should suffice for other integers - vector v1 = {0, 1, 127, -1, 0}; - vector e1 = {false, true, true, true, false}; - CheckCase(int8(), v1, is_valid, boolean(), e1, - options); - - // floating point - vector v2 = {1.0, 0, 0, -1.0, 5.0}; - vector e2 = {true, false, false, true, true}; - CheckCase(float64(), v2, is_valid, boolean(), e2, - options); -} - -TEST_F(TestCast, ToIntUpcast) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - // int8 to int32 - vector v1 = {0, 1, 127, -1, 0}; - vector e1 = {0, 1, 127, -1, 0}; - CheckCase(int8(), v1, is_valid, int32(), e1, - options); - - // bool to int8 - vector v2 = {false, true, false, true, true}; - vector e2 = {0, 1, 0, 1, 1}; - CheckCase(boolean(), v2, is_valid, int8(), e2, - options); - - // uint8 to int16, no overflow/underrun - vector v3 = {0, 100, 200, 255, 0}; - vector e3 = {0, 100, 200, 255, 0}; - CheckCase(uint8(), v3, is_valid, int16(), e3, - options); -} - -TEST_F(TestCast, 
OverflowInNullSlot) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - vector v11 = {0, 70000, 2000, 1000, 0}; - vector e11 = {0, 0, 2000, 1000, 0}; - - shared_ptr expected; - ArrayFromVector(int16(), is_valid, e11, &expected); - - auto buf = Buffer::Wrap(v11.data(), v11.size()); - Int32Array tmp11(5, buf, expected->null_bitmap(), -1); - - CheckPass(tmp11, *expected, int16(), options); -} - -TEST_F(TestCast, ToIntDowncastSafe) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - // int16 to uint8, no overflow/underrun - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, uint8(), e1, - options); - - // int16 to uint8, with overflow - vector v2 = {0, 100, 256, 0, 0}; - CheckFails(int16(), v2, is_valid, uint8(), options); - - // underflow - vector v3 = {0, 100, -1, 0, 0}; - CheckFails(int16(), v3, is_valid, uint8(), options); - - // int32 to int16, no overflow - vector v4 = {0, 1000, 2000, 1, 2}; - vector e4 = {0, 1000, 2000, 1, 2}; - CheckCase(int32(), v4, is_valid, int16(), e4, - options); - - // int32 to int16, overflow - vector v5 = {0, 1000, 2000, 70000, 0}; - CheckFails(int32(), v5, is_valid, int16(), options); - - // underflow - vector v6 = {0, 1000, 2000, -70000, 0}; - CheckFails(int32(), v6, is_valid, int16(), options); - - vector v7 = {0, 1000, 2000, -70000, 0}; - CheckFails(int32(), v7, is_valid, uint8(), options); -} - -template -std::vector UnsafeVectorCast(const std::vector& v) { - size_t n_elems = v.size(); - std::vector result(n_elems); - - for (size_t i = 0; i < v.size(); i++) result[i] = static_cast(v[i]); - - return std::move(result); -} - -TEST_F(TestCast, IntegerSignedToUnsigned) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - vector v1 = {INT32_MIN, 100, -1, UINT16_MAX, INT32_MAX}; - - // Same width - CheckFails(int32(), v1, is_valid, uint32(), options); - // Wider - CheckFails(int32(), v1, is_valid, uint64(), options); - // Narrower - CheckFails(int32(), v1, is_valid, uint16(), options); - // Fail because of overflow (instead of underflow). 
- vector over = {0, -11, 0, UINT16_MAX + 1, INT32_MAX}; - CheckFails(int32(), over, is_valid, uint16(), options); - - options.allow_int_overflow = true; - - CheckCase( - int32(), v1, is_valid, uint32(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), v1, is_valid, uint64(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), v1, is_valid, uint16(), UnsafeVectorCast(v1), options); - CheckCase( - int32(), over, is_valid, uint16(), UnsafeVectorCast(over), - options); -} - -TEST_F(TestCast, IntegerUnsignedToSigned) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, true, true}; - - vector v1 = {0, INT16_MAX + 1, UINT32_MAX}; - vector v2 = {0, INT16_MAX + 1, 2}; - // Same width - CheckFails(uint32(), v1, is_valid, int32(), options); - // Narrower - CheckFails(uint32(), v1, is_valid, int16(), options); - CheckFails(uint32(), v2, is_valid, int16(), options); - - options.allow_int_overflow = true; - - CheckCase( - uint32(), v1, is_valid, int32(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v1, is_valid, int64(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v1, is_valid, int16(), UnsafeVectorCast(v1), options); - CheckCase( - uint32(), v2, is_valid, int16(), UnsafeVectorCast(v2), options); -} - -TEST_F(TestCast, ToIntDowncastUnsafe) { - CastOptions options; - options.allow_int_overflow = true; - - vector is_valid = {true, false, true, true, true}; - - // int16 to uint8, no overflow/underrun - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, uint8(), e1, - options); - - // int16 to uint8, with overflow - vector v2 = {0, 100, 256, 0, 0}; - vector e2 = {0, 100, 0, 0, 0}; - CheckCase(int16(), v2, is_valid, uint8(), e2, - options); - - // underflow - vector v3 = {0, 100, -1, 0, 0}; - vector e3 = {0, 100, 255, 0, 0}; - CheckCase(int16(), v3, is_valid, uint8(), e3, - options); - - // int32 to int16, no overflow - vector v4 = {0, 1000, 2000, 1, 2}; - vector e4 = {0, 1000, 2000, 1, 2}; - CheckCase(int32(), v4, is_valid, int16(), e4, - options); - - // int32 to int16, overflow - // TODO(wesm): do we want to allow this? we could set to null - vector v5 = {0, 1000, 2000, 70000, 0}; - vector e5 = {0, 1000, 2000, 4464, 0}; - CheckCase(int32(), v5, is_valid, int16(), e5, - options); - - // underflow - // TODO(wesm): do we want to allow this? 
we could set overflow to null - vector v6 = {0, 1000, 2000, -70000, 0}; - vector e6 = {0, 1000, 2000, -4464, 0}; - CheckCase(int32(), v6, is_valid, int16(), e6, - options); -} - -TEST_F(TestCast, FloatingPointToInt) { - // which means allow_float_truncate == false - auto options = CastOptions::Safe(); - - vector is_valid = {true, false, true, true, true}; - vector all_valid = {true, true, true, true, true}; - - // float32 to int32 no truncation - vector v1 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e1 = {1, 0, 0, -1, 5}; - CheckCase(float32(), v1, is_valid, int32(), e1, - options); - CheckCase(float32(), v1, all_valid, int32(), e1, - options); - - // float64 to int32 no truncation - vector v2 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e2 = {1, 0, 0, -1, 5}; - CheckCase(float64(), v2, is_valid, int32(), e2, - options); - CheckCase(float64(), v2, all_valid, int32(), e2, - options); - - // float64 to int64 no truncation - vector v3 = {1.0, 0, 0.0, -1.0, 5.0}; - vector e3 = {1, 0, 0, -1, 5}; - CheckCase(float64(), v3, is_valid, int64(), e3, - options); - CheckCase(float64(), v3, all_valid, int64(), e3, - options); - - // float64 to int32 truncate - vector v4 = {1.5, 0, 0.5, -1.5, 5.5}; - vector e4 = {1, 0, 0, -1, 5}; - - options.allow_float_truncate = false; - CheckFails(float64(), v4, is_valid, int32(), options); - CheckFails(float64(), v4, all_valid, int32(), options); - - options.allow_float_truncate = true; - CheckCase(float64(), v4, is_valid, int32(), e4, - options); - CheckCase(float64(), v4, all_valid, int32(), e4, - options); - - // float64 to int64 truncate - vector v5 = {1.5, 0, 0.5, -1.5, 5.5}; - vector e5 = {1, 0, 0, -1, 5}; - - options.allow_float_truncate = false; - CheckFails(float64(), v5, is_valid, int64(), options); - CheckFails(float64(), v5, all_valid, int64(), options); - - options.allow_float_truncate = true; - CheckCase(float64(), v5, is_valid, int64(), e5, - options); - CheckCase(float64(), v5, all_valid, int64(), e5, - options); -} - -TEST_F(TestCast, IntToFloatingPoint) { - auto options = CastOptions::Safe(); - - vector all_valid = {true, true, true, true, true}; - vector all_invalid = {false, false, false, false, false}; - - vector v1 = {INT64_MIN, INT64_MIN + 1, 0, INT64_MAX - 1, INT64_MAX}; - CheckFails(int64(), v1, all_valid, float32(), options); - - // While it's not safe to convert, all values are null. 
- CheckCase(int64(), v1, all_invalid, float64(), - UnsafeVectorCast(v1), - options); -} - -TEST_F(TestCast, TimestampToTimestamp) { - CastOptions options; - - auto CheckTimestampCast = - [this](const CastOptions& options, TimeUnit::type from_unit, TimeUnit::type to_unit, - const vector& from_values, const vector& to_values, - const vector& is_valid) { - CheckCase( - timestamp(from_unit), from_values, is_valid, timestamp(to_unit), to_values, - options); - }; - - vector is_valid = {true, false, true, true, true}; - - // Multiply promotions - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MILLI, v1, e1, is_valid); - - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MICRO, v2, e2, is_valid); - - vector v3 = {0, 100, 200, 1, 2}; - vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::NANO, v3, e3, is_valid); - - vector v4 = {0, 100, 200, 1, 2}; - vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::MICRO, v4, e4, is_valid); - - vector v5 = {0, 100, 200, 1, 2}; - vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::NANO, v5, e5, is_valid); - - vector v6 = {0, 100, 200, 1, 2}; - vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); - - // Zero copy - vector v7 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(timestamp(TimeUnit::SECOND), is_valid, v7, - &arr); - CheckZeroCopy(*arr, timestamp(TimeUnit::SECOND)); - - // ARROW-1773, cast to integer - CheckZeroCopy(*arr, int64()); - - // Divide, truncate - vector v8 = {0, 100123, 200456, 1123, 2456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::SECOND, v8, e8, is_valid); - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::MILLI, v8, e8, is_valid); - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MICRO, v8, e8, is_valid); - - vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - vector e9 = {0, 100, 200, 1, 2}; - CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::SECOND, v9, e9, is_valid); - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MILLI, v9, e9, is_valid); - - vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - vector e10 = {0, 100, 200, 1, 2}; - CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::SECOND, v10, e10, is_valid); - - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(timestamp(TimeUnit::MILLI), v8, is_valid, - timestamp(TimeUnit::SECOND), options); - CheckFails(timestamp(TimeUnit::MICRO), v8, is_valid, - timestamp(TimeUnit::MILLI), options); - CheckFails(timestamp(TimeUnit::NANO), v8, is_valid, - timestamp(TimeUnit::MICRO), options); - CheckFails(timestamp(TimeUnit::MICRO), v9, is_valid, - timestamp(TimeUnit::SECOND), options); - CheckFails(timestamp(TimeUnit::NANO), v9, is_valid, - timestamp(TimeUnit::MILLI), options); - CheckFails(timestamp(TimeUnit::NANO), v10, is_valid, - timestamp(TimeUnit::SECOND), options); -} - -TEST_F(TestCast, TimestampToDate32_Date64) { - CastOptions options; - - vector is_valid = {true, true, false}; - - // 2000-01-01, 2000-01-02, null - vector v_nano = 
{946684800000000000, 946771200000000000, 0}; - vector v_micro = {946684800000000, 946771200000000, 0}; - vector v_milli = {946684800000, 946771200000, 0}; - vector v_second = {946684800, 946771200, 0}; - vector v_day = {10957, 10958, 0}; - - // Simple conversions - CheckCase( - timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli, options); - - CheckCase( - timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day, options); - - // Disallow truncate, failures - vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; - vector v_micro_fail = {946684800000001, 946771200000001, 0}; - vector v_milli_fail = {946684800001, 946771200001, 0}; - vector v_second_fail = {946684801, 946771201, 0}; - - options.allow_time_truncate = false; - CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), - options); - CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, - date64(), options); - - CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), - options); - CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, - date32(), options); - - // Make sure that nulls are excluded from the truncation checks - vector v_second_nofail = {946684800, 946771200, 1}; - CheckCase( - timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(), v_milli, options); - CheckCase( - timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day, options); -} - -TEST_F(TestCast, TimeToCompatible) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // Multiply promotions - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time32(TimeUnit::SECOND), v1, is_valid, time32(TimeUnit::MILLI), e1, options); - - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckCase( - time32(TimeUnit::SECOND), v2, is_valid, time64(TimeUnit::MICRO), e2, options); - - vector v3 = {0, 100, 200, 1, 2}; - vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; - CheckCase( - time32(TimeUnit::SECOND), v3, is_valid, time64(TimeUnit::NANO), e3, options); - - vector v4 = {0, 100, 200, 1, 2}; - vector e4 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time32(TimeUnit::MILLI), v4, is_valid, time64(TimeUnit::MICRO), e4, options); - - vector v5 = {0, 100, 200, 1, 2}; - vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; - CheckCase( - time32(TimeUnit::MILLI), v5, is_valid, time64(TimeUnit::NANO), e5, options); - - vector v6 = {0, 100, 200, 1, 2}; - vector e6 = {0, 100000, 200000, 1000, 2000}; - CheckCase( - time64(TimeUnit::MICRO), v6, is_valid, 
time64(TimeUnit::NANO), e6, options); - - // Zero copy - vector v7 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(time64(TimeUnit::MICRO), is_valid, v7, &arr); - CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); - - // ARROW-1773: cast to int64 - CheckZeroCopy(*arr, int64()); - - vector v7_2 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(time32(TimeUnit::SECOND), is_valid, v7_2, &arr); - CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); - - // ARROW-1773: cast to int64 - CheckZeroCopy(*arr, int32()); - - // Divide, truncate - vector v8 = {0, 100123, 200456, 1123, 2456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckCase( - time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), e8, options); - CheckCase( - time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), e8, options); - CheckCase( - time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), e8, options); - - vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; - vector e9 = {0, 100, 200, 1, 2}; - CheckCase( - time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), e9, options); - CheckCase( - time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), e9, options); - - vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; - vector e10 = {0, 100, 200, 1, 2}; - CheckCase( - time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), e10, options); - - // Disallow truncate, failures - - options.allow_time_truncate = false; - CheckFails(time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), - options); - CheckFails(time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), - options); - CheckFails(time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), - options); - CheckFails(time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), - options); - CheckFails(time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), - options); - CheckFails(time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), - options); -} - -TEST_F(TestCast, PrimitiveZeroCopy) { - shared_ptr arr; - - ArrayFromVector(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint8()); - ArrayFromVector(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int8()); - - ArrayFromVector(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint16()); - ArrayFromVector(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int16()); - - ArrayFromVector(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint32()); - ArrayFromVector(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int32()); - - ArrayFromVector(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint64()); - ArrayFromVector(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int64()); - - ArrayFromVector(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float32()); - - ArrayFromVector(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float64()); -} - -TEST_F(TestCast, DateToCompatible) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - constexpr int64_t F = 86400000; - - // Multiply promotion - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100 * F, 200 * F, F, 2 * F}; - CheckCase(date32(), v1, is_valid, date64(), - e1, options); - - // Zero copy - vector v2 = {0, 70000, 2000, 1000, 0}; - vector v3 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(date32(), is_valid, v2, 
&arr); - CheckZeroCopy(*arr, date32()); - - // ARROW-1773: zero copy cast to integer - CheckZeroCopy(*arr, int32()); - - ArrayFromVector(date64(), is_valid, v3, &arr); - CheckZeroCopy(*arr, date64()); - - // ARROW-1773: zero copy cast to integer - CheckZeroCopy(*arr, int64()); - - // Divide, truncate - vector v8 = {0, 100 * F + 123, 200 * F + 456, F + 123, 2 * F + 456}; - vector e8 = {0, 100, 200, 1, 2}; - - options.allow_time_truncate = true; - CheckCase(date64(), v8, is_valid, date32(), - e8, options); - - // Disallow truncate, failures - options.allow_time_truncate = false; - CheckFails(date64(), v8, is_valid, date32(), options); -} - -TEST_F(TestCast, ToDouble) { - CastOptions options; - vector is_valid = {true, false, true, true, true}; - - // int16 to double - vector v1 = {0, 100, 200, 1, 2}; - vector e1 = {0, 100, 200, 1, 2}; - CheckCase(int16(), v1, is_valid, float64(), e1, - options); - - // float to double - vector v2 = {0, 100, 200, 1, 2}; - vector e2 = {0, 100, 200, 1, 2}; - CheckCase(float32(), v2, is_valid, float64(), e2, - options); - - // bool to double - vector v3 = {true, true, false, false, true}; - vector e3 = {1, 1, 0, 0, 1}; - CheckCase(boolean(), v3, is_valid, float64(), e3, - options); -} - -TEST_F(TestCast, ChunkedArray) { - vector values1 = {0, 1, 2}; - vector values2 = {3, 4, 5}; - - auto type = int16(); - auto out_type = int64(); - - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - ArrayVector arrays = {a1, a2}; - auto carr = std::make_shared(arrays); - - CastOptions options; - - Datum out; - ASSERT_OK(Cast(&this->ctx_, carr, out_type, options, &out)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); - - auto out_carr = out.chunked_array(); - - vector ex_values1 = {0, 1, 2}; - vector ex_values2 = {3, 4, 5}; - auto a3 = _MakeArray(out_type, ex_values1, {}); - auto a4 = _MakeArray(out_type, ex_values2, {}); - - ArrayVector ex_arrays = {a3, a4}; - auto ex_carr = std::make_shared(ex_arrays); - - ASSERT_TRUE(out.chunked_array()->Equals(*ex_carr)); -} - -TEST_F(TestCast, UnsupportedTarget) { - vector is_valid = {true, false, true, true, true}; - vector v1 = {0, 1, 2, 3, 4}; - - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - shared_ptr result; - ASSERT_RAISES(NotImplemented, Cast(&this->ctx_, *arr, utf8(), {}, &result)); -} - -TEST_F(TestCast, DateTimeZeroCopy) { - vector is_valid = {true, false, true, true, true}; - - vector v1 = {0, 70000, 2000, 1000, 0}; - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); - - CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); - CheckZeroCopy(*arr, date32()); - - vector v2 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(int64(), is_valid, v2, &arr); - - CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); - CheckZeroCopy(*arr, date64()); - CheckZeroCopy(*arr, timestamp(TimeUnit::NANO)); -} - -TEST_F(TestCast, FromNull) { - // Null casts to everything - const int length = 10; - - NullArray arr(length); - - shared_ptr result; - ASSERT_OK(Cast(&ctx_, arr, int32(), {}, &result)); - - ASSERT_EQ(length, result->length()); - ASSERT_EQ(length, result->null_count()); - - // OK to look at bitmaps - ASSERT_ARRAYS_EQUAL(*result, *result); -} - -TEST_F(TestCast, PreallocatedMemory) { - CastOptions options; - options.allow_int_overflow = false; - - vector is_valid = {true, false, true, true, true}; - - const int64_t length = 5; - - shared_ptr arr; - vector v1 = {0, 70000, 2000, 1000, 0}; - vector e1 = {0, 70000, 2000, 1000, 0}; - ArrayFromVector(int32(), is_valid, v1, &arr); - - auto 
out_type = int64(); - - std::unique_ptr kernel; - ASSERT_OK(GetCastFunction(*int32(), out_type, options, &kernel)); - - auto out_data = ArrayData::Make(out_type, length); - - shared_ptr out_values; - ASSERT_OK(this->ctx_.Allocate(length * sizeof(int64_t), &out_values)); - - out_data->buffers.push_back(nullptr); - out_data->buffers.push_back(out_values); - - Datum out(out_data); - ASSERT_OK(kernel->Call(&this->ctx_, arr, &out)); - - // Buffer address unchanged - ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); - - shared_ptr result = MakeArray(out_data); - shared_ptr expected; - ArrayFromVector(int64(), is_valid, e1, &expected); - - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -template -void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr& in_type, - const vector& in_values, - const std::shared_ptr& out_type, - const vector& out_values) { - using OutTraits = TypeTraits; - - CastOptions options; - - const int64_t length = static_cast(in_values.size()); - - shared_ptr arr, expected; - ArrayFromVector(in_type, in_values, &arr); - ArrayFromVector(out_type, out_values, &expected); - - shared_ptr out_buffer; - ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer)); - - std::unique_ptr kernel; - ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel)); - - const int64_t first_half = length / 2; - - auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer}); - auto out_second_data = out_data->Copy(); - out_second_data->offset = first_half; - - Datum out_first(out_data); - Datum out_second(out_second_data); - - // Cast each bit - ASSERT_OK(kernel->Call(ctx, arr->Slice(0, first_half), &out_first)); - ASSERT_OK(kernel->Call(ctx, arr->Slice(first_half), &out_second)); - - shared_ptr result = MakeArray(out_data); - - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -TEST_F(TestCast, OffsetOutputBuffer) { - // ARROW-1735 - vector v1 = {0, 10000, 2000, 1000, 0}; - vector e1 = {0, 10000, 2000, 1000, 0}; - - auto in_type = int32(); - auto out_type = int64(); - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - out_type, e1); - - vector e2 = {false, true, true, true, false}; - - out_type = boolean(); - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - boolean(), e2); - - vector e3 = {0, 10000, 2000, 1000, 0}; - CheckOffsetOutputCase(&this->ctx_, in_type, v1, - int16(), e3); -} - -TEST_F(TestCast, StringToBoolean) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - vector v1 = {"False", "true", "true", "True", "false"}; - vector v2 = {"0", "1", "1", "1", "0"}; - vector e = {false, true, true, true, false}; - CheckCase(utf8(), v1, is_valid, boolean(), - e, options); - CheckCase(utf8(), v2, is_valid, boolean(), - e, options); -} - -TEST_F(TestCast, StringToBooleanErrors) { - CastOptions options; - - vector is_valid = {true}; - - CheckFails(utf8(), {"false "}, is_valid, boolean(), options); - CheckFails(utf8(), {"T"}, is_valid, boolean(), options); -} - -TEST_F(TestCast, StringToNumber) { - CastOptions options; - - vector is_valid = {true, false, true, true, true}; - - // string to int - vector v_int = {"0", "1", "127", "-1", "0"}; - vector e_int8 = {0, 1, 127, -1, 0}; - vector e_int16 = {0, 1, 127, -1, 0}; - vector e_int32 = {0, 1, 127, -1, 0}; - vector e_int64 = {0, 1, 127, -1, 0}; - CheckCase(utf8(), v_int, is_valid, int8(), - e_int8, options); - CheckCase(utf8(), v_int, is_valid, int16(), - e_int16, options); - CheckCase(utf8(), v_int, is_valid, int32(), - e_int32, options); - CheckCase(utf8(), v_int, is_valid, 
int64(), - e_int64, options); - - v_int = {"2147483647", "0", "-2147483648", "0", "0"}; - e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; - CheckCase(utf8(), v_int, is_valid, int32(), - e_int32, options); - v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; - e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; - CheckCase(utf8(), v_int, is_valid, int64(), - e_int64, options); - - // string to uint - vector v_uint = {"0", "1", "127", "255", "0"}; - vector e_uint8 = {0, 1, 127, 255, 0}; - vector e_uint16 = {0, 1, 127, 255, 0}; - vector e_uint32 = {0, 1, 127, 255, 0}; - vector e_uint64 = {0, 1, 127, 255, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint8(), e_uint8, options); - CheckCase(utf8(), v_uint, is_valid, - uint16(), e_uint16, options); - CheckCase(utf8(), v_uint, is_valid, - uint32(), e_uint32, options); - CheckCase(utf8(), v_uint, is_valid, - uint64(), e_uint64, options); - - v_uint = {"4294967295", "0", "0", "0", "0"}; - e_uint32 = {4294967295, 0, 0, 0, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint32(), e_uint32, options); - v_uint = {"18446744073709551615", "0", "0", "0", "0"}; - e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; - CheckCase(utf8(), v_uint, is_valid, - uint64(), e_uint64, options); - - // string to float - vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; - vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; - vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; - CheckCase(utf8(), v_float, is_valid, - float32(), e_float, options); - CheckCase(utf8(), v_float, is_valid, - float64(), e_double, options); - - // Test that casting is locale-independent - auto global_locale = std::locale(); - try { - // French locale uses the comma as decimal point - std::locale::global(std::locale("fr_FR.UTF-8")); - } catch (std::runtime_error&) { - // Locale unavailable, ignore - } - CheckCase(utf8(), v_float, is_valid, - float32(), e_float, options); - CheckCase(utf8(), v_float, is_valid, - float64(), e_double, options); - std::locale::global(global_locale); -} - -TEST_F(TestCast, StringToNumberErrors) { - CastOptions options; - - vector is_valid = {true}; - - CheckFails(utf8(), {"z"}, is_valid, int8(), options); - CheckFails(utf8(), {"12 z"}, is_valid, int8(), options); - CheckFails(utf8(), {"128"}, is_valid, int8(), options); - CheckFails(utf8(), {"-129"}, is_valid, int8(), options); - CheckFails(utf8(), {"0.5"}, is_valid, int8(), options); - - CheckFails(utf8(), {"256"}, is_valid, uint8(), options); - CheckFails(utf8(), {"-1"}, is_valid, uint8(), options); - - CheckFails(utf8(), {"z"}, is_valid, float32(), options); -} - -TEST_F(TestCast, StringToTimestamp) { - CastOptions options; - - vector is_valid = {true, false, true}; - vector strings = {"1970-01-01", "xxx", "2000-02-29"}; - - auto type = timestamp(TimeUnit::SECOND); - vector e = {0, 0, 951782400}; - CheckCase(utf8(), strings, is_valid, - type, e, options); - - type = timestamp(TimeUnit::MICRO); - e = {0, 0, 951782400000000LL}; - CheckCase(utf8(), strings, is_valid, - type, e, options); - - // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc -} - -TEST_F(TestCast, StringToTimestampErrors) { - CastOptions options; - - vector is_valid = {true}; - - for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) { - auto type = timestamp(unit); - CheckFails(utf8(), {""}, is_valid, type, options); - CheckFails(utf8(), {"xxx"}, is_valid, type, options); - } -} - -template -class TestDictionaryCast : public TestCast {}; - -typedef 
::testing::Types - TestTypes; - -TYPED_TEST_CASE(TestDictionaryCast, TestTypes); - -TYPED_TEST(TestDictionaryCast, Basic) { - CastOptions options; - shared_ptr plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - - Datum out; - ASSERT_OK(DictionaryEncode(&this->ctx_, plain_array->data(), &out)); - - this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); -} - -TEST_F(TestCast, DictToNonDictNoNulls) { - vector dict_values = {"foo", "bar", "baz"}; - auto ex_dict = _MakeArray(utf8(), dict_values, {}); - auto dict_type = dictionary(int32(), ex_dict); - - // Explicitly construct with nullptr for the null_bitmap_data - std::vector i1 = {1, 0, 1}; - std::vector i2 = {2, 1, 0, 1}; - auto c1 = std::make_shared>(3, Buffer::Wrap(i1)); - auto c2 = std::make_shared>(4, Buffer::Wrap(i2)); - - ArrayVector dict_arrays = {std::make_shared(dict_type, c1), - std::make_shared(dict_type, c2)}; - auto dict_carr = std::make_shared(dict_arrays); - - Datum cast_input(dict_carr); - Datum cast_output; - // Ensure that casting works even when the null_bitmap_data array is a nullptr - ASSERT_OK(Cast(&this->ctx_, cast_input, - static_cast(*dict_type).dictionary()->type(), - CastOptions(), &cast_output)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, cast_output.kind()); - - auto e1 = _MakeArray(utf8(), {"bar", "foo", "bar"}, {}); - auto e2 = _MakeArray(utf8(), {"baz", "bar", "foo", "bar"}, {}); - - auto chunks = cast_output.chunked_array()->chunks(); - ASSERT_EQ(chunks.size(), 2); - ASSERT_ARRAYS_EQUAL(*e1, *chunks[0]); - ASSERT_ARRAYS_EQUAL(*e2, *chunks[1]); -} - -/*TYPED_TEST(TestDictionaryCast, Reverse) { - CastOptions options; - shared_ptr plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - - shared_ptr dict_array; - ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array)); - - this->CheckPass(*plain_array, *dict_array, dict_array->type(), options); -}*/ - -TEST_F(TestCast, ListToList) { - CastOptions options; - std::shared_ptr offsets; - - vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; - std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; - ArrayFromVector(offsets_is_valid, offsets_values, &offsets); - - shared_ptr int32_plain_array = - TestBase::MakeRandomArray::ArrayType>(10, 2); - std::shared_ptr int32_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); - - std::shared_ptr int64_plain_array; - ASSERT_OK(Cast(&this->ctx_, *int32_plain_array, int64(), options, &int64_plain_array)); - std::shared_ptr int64_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *int64_plain_array, pool_, &int64_list_array)); - - std::shared_ptr float64_plain_array; - ASSERT_OK( - Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); - std::shared_ptr float64_list_array; - ASSERT_OK( - ListArray::FromArrays(*offsets, *float64_plain_array, pool_, &float64_list_array)); - - CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), options); - CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options); - CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), options); - CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), options); - - options.allow_float_truncate = true; - CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options); - CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options); -} - -// 
---------------------------------------------------------------------- -// Dictionary tests - -template -void CheckUnique(FunctionContext* ctx, const shared_ptr& type, - const vector& in_values, const vector& in_is_valid, - const vector& out_values, const vector& out_is_valid) { - shared_ptr input = _MakeArray(type, in_values, in_is_valid); - shared_ptr expected = _MakeArray(type, out_values, out_is_valid); - - shared_ptr result; - ASSERT_OK(Unique(ctx, input, &result)); - ASSERT_ARRAYS_EQUAL(*expected, *result); -} - -template -void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, - const vector& in_values, const vector& in_is_valid, - const vector& out_values, const vector& out_is_valid, - const vector& out_indices) { - shared_ptr input = _MakeArray(type, in_values, in_is_valid); - shared_ptr ex_dict = _MakeArray(type, out_values, out_is_valid); - shared_ptr ex_indices = - _MakeArray(int32(), out_indices, in_is_valid); - - DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); - - Datum datum_out; - ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); - shared_ptr result = MakeArray(datum_out.array()); - - ASSERT_ARRAYS_EQUAL(expected, *result); -} - -class TestHashKernel : public ComputeFixture, public TestBase {}; - -template -class TestHashKernelPrimitive : public ComputeFixture, public TestBase {}; - -typedef ::testing::Types - PrimitiveDictionaries; - -TYPED_TEST_CASE(TestHashKernelPrimitive, PrimitiveDictionaries); - -TYPED_TEST(TestHashKernelPrimitive, Unique) { - using T = typename TypeParam::c_type; - auto type = TypeTraits::type_singleton(); - CheckUnique(&this->ctx_, type, {2, 1, 2, 1}, {true, false, true, true}, - {2, 1}, {}); - CheckUnique(&this->ctx_, type, {2, 1, 3, 1}, {false, false, true, true}, - {3, 1}, {}); -} - -TYPED_TEST(TestHashKernelPrimitive, DictEncode) { - using T = typename TypeParam::c_type; - auto type = TypeTraits::type_singleton(); - CheckDictEncode(&this->ctx_, type, {2, 1, 2, 1, 2, 3}, - {true, false, true, true, true, true}, {2, 1, 3}, {}, - {0, 0, 0, 1, 0, 2}); -} - -TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { - using T = typename TypeParam::c_type; - // Skip this test for (u)int8 - if (sizeof(Scalar) == 1) { - return; - } - - const int64_t kTotalValues = 1000000; - const int64_t kRepeats = 5; - - vector values; - vector uniques; - vector indices; - for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { - const auto val = static_cast(i % kTotalValues); - values.push_back(val); - - if (i < kTotalValues) { - uniques.push_back(val); - } - indices.push_back(static_cast(i % kTotalValues)); - } - - auto type = TypeTraits::type_singleton(); - CheckUnique(&this->ctx_, type, values, {}, uniques, {}); - - CheckDictEncode(&this->ctx_, type, values, {}, uniques, {}, indices); -} - -TEST_F(TestHashKernel, UniqueTimeTimestamp) { - CheckUnique(&this->ctx_, time32(TimeUnit::SECOND), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1}, {}); - - CheckUnique(&this->ctx_, time64(TimeUnit::NANO), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1}, {}); - - CheckUnique(&this->ctx_, timestamp(TimeUnit::NANO), - {2, 1, 2, 1}, {true, false, true, true}, {2, 1}, - {}); -} - -TEST_F(TestHashKernel, UniqueBoolean) { - CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, - {true, false, true, true}, {true, false}, {}); - - CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, - {true, false, true, true}, {false, true}, {}); - - // No nulls - CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, {}, - {true, 
false}, {}); - - CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, {}, - {false, true}, {}); -} - -TEST_F(TestHashKernel, DictEncodeBoolean) { - CheckDictEncode( - &this->ctx_, boolean(), {true, true, false, true, false}, - {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1}); - - CheckDictEncode( - &this->ctx_, boolean(), {false, true, false, true, false}, - {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0}); - - // No nulls - CheckDictEncode(&this->ctx_, boolean(), - {true, true, false, true, false}, {}, {true, false}, - {}, {0, 0, 1, 0, 1}); - - CheckDictEncode(&this->ctx_, boolean(), - {false, true, false, true, false}, {}, {false, true}, - {}, {0, 1, 0, 1, 0}); -} - -TEST_F(TestHashKernel, UniqueBinary) { - CheckUnique(&this->ctx_, binary(), - {"test", "", "test2", "test"}, - {true, false, true, true}, {"test", "test2"}, {}); - - CheckUnique(&this->ctx_, utf8(), {"test", "", "test2", "test"}, - {true, false, true, true}, {"test", "test2"}, {}); -} - -TEST_F(TestHashKernel, DictEncodeBinary) { - CheckDictEncode( - &this->ctx_, binary(), {"test", "", "test2", "test", "baz"}, - {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); - - CheckDictEncode( - &this->ctx_, utf8(), {"test", "", "test2", "test", "baz"}, - {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); -} - -TEST_F(TestHashKernel, BinaryResizeTable) { - const int32_t kTotalValues = 10000; -#if !defined(ARROW_VALGRIND) - const int32_t kRepeats = 10; -#else - // Mitigate Valgrind's slowness - const int32_t kRepeats = 3; -#endif - - vector values; - vector uniques; - vector indices; - char buf[20] = "test"; - - for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { - int32_t index = i % kTotalValues; - - ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); - values.emplace_back(buf); - - if (i < kTotalValues) { - uniques.push_back(values.back()); - } - indices.push_back(index); - } - - CheckUnique(&this->ctx_, binary(), values, {}, uniques, {}); - CheckDictEncode(&this->ctx_, binary(), values, {}, uniques, {}, - indices); - - CheckUnique(&this->ctx_, utf8(), values, {}, uniques, {}); - CheckDictEncode(&this->ctx_, utf8(), values, {}, uniques, {}, - indices); -} - -TEST_F(TestHashKernel, UniqueFixedSizeBinary) { - CheckUnique( - &this->ctx_, fixed_size_binary(5), {"aaaaa", "", "bbbbb", "aaaaa"}, - {true, false, true, true}, {"aaaaa", "bbbbb"}, {}); -} - -TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) { - CheckDictEncode( - &this->ctx_, fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, - {true, false, true, true, true}, {"bbbbb", "aaaaa", "ccccc"}, {}, {0, 0, 0, 1, 2}); -} - -TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) { - const int32_t kTotalValues = 10000; -#if !defined(ARROW_VALGRIND) - const int32_t kRepeats = 10; -#else - // Mitigate Valgrind's slowness - const int32_t kRepeats = 3; -#endif - - vector values; - vector uniques; - vector indices; - char buf[7] = "test.."; - - for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { - int32_t index = i % kTotalValues; - - buf[4] = static_cast(index / 128); - buf[5] = static_cast(index % 128); - values.emplace_back(buf, 6); - - if (i < kTotalValues) { - uniques.push_back(values.back()); - } - indices.push_back(index); - } - - auto type = fixed_size_binary(6); - CheckUnique(&this->ctx_, type, values, {}, uniques, - {}); - CheckDictEncode(&this->ctx_, type, values, {}, - uniques, {}, indices); -} - -TEST_F(TestHashKernel, UniqueDecimal) { - 
vector values{12, 12, 11, 12}; - vector expected{12, 11}; - - CheckUnique(&this->ctx_, decimal(2, 0), values, - {true, false, true, true}, expected, {}); -} - -TEST_F(TestHashKernel, DictEncodeDecimal) { - vector values{12, 12, 11, 12, 13}; - vector expected{12, 11, 13}; - - CheckDictEncode(&this->ctx_, decimal(2, 0), values, - {true, false, true, true, true}, expected, - {}, {0, 0, 1, 0, 2}); -} - -TEST_F(TestHashKernel, ChunkedArrayInvoke) { - vector values1 = {"foo", "bar", "foo"}; - vector values2 = {"bar", "baz", "quuux", "foo"}; - - auto type = utf8(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - vector dict_values = {"foo", "bar", "baz", "quuux"}; - auto ex_dict = _MakeArray(type, dict_values, {}); - - ArrayVector arrays = {a1, a2}; - auto carr = std::make_shared(arrays); - - // Unique - shared_ptr result; - ASSERT_OK(Unique(&this->ctx_, carr, &result)); - ASSERT_ARRAYS_EQUAL(*ex_dict, *result); - - // Dictionary encode - auto dict_type = dictionary(int32(), ex_dict); - - auto i1 = _MakeArray(int32(), {0, 1, 0}, {}); - auto i2 = _MakeArray(int32(), {1, 2, 3, 0}, {}); - - ArrayVector dict_arrays = {std::make_shared(dict_type, i1), - std::make_shared(dict_type, i2)}; - auto dict_carr = std::make_shared(dict_arrays); - - Datum encoded_out; - ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); - - AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); -} - -using BinaryKernelFunc = - std::function; - -class TestBooleanKernel : public ComputeFixture, public TestBase { - public: - void TestArrayBinary(const BinaryKernelFunc& kernel, const std::shared_ptr& left, - const std::shared_ptr& right, - const std::shared_ptr& expected) { - Datum result; - ASSERT_OK(kernel(&this->ctx_, left, right, &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - std::shared_ptr result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(expected)); - } - - void TestChunkedArrayBinary(const BinaryKernelFunc& kernel, - const std::shared_ptr& left, - const std::shared_ptr& right, - const std::shared_ptr& expected) { - Datum result; - std::shared_ptr result_array; - ASSERT_OK(kernel(&this->ctx_, left, right, &result)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); - std::shared_ptr result_ca = result.chunked_array(); - ASSERT_TRUE(result_ca->Equals(expected)); - } - - void TestBinaryKernel(const BinaryKernelFunc& kernel, const std::vector& values1, - const std::vector& values2, - const std::vector& values3, - const std::vector& values3_nulls) { - auto type = boolean(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - auto a3 = _MakeArray(type, values3, {}); - auto a1_nulls = _MakeArray(type, values1, values1); - auto a2_nulls = _MakeArray(type, values2, values2); - auto a3_nulls = _MakeArray(type, values3, values3_nulls); - - TestArrayBinary(kernel, a1, a2, a3); - TestArrayBinary(kernel, a1_nulls, a2_nulls, a3_nulls); - TestArrayBinary(kernel, a1->Slice(1), a2->Slice(1), a3->Slice(1)); - TestArrayBinary(kernel, a1_nulls->Slice(1), a2_nulls->Slice(1), a3_nulls->Slice(1)); - - // ChunkedArray - std::vector> ca1_arrs = {a1, a1->Slice(1)}; - auto ca1 = std::make_shared(ca1_arrs); - std::vector> ca2_arrs = {a2, a2->Slice(1)}; - auto ca2 = std::make_shared(ca2_arrs); - std::vector> ca3_arrs = {a3, a3->Slice(1)}; - auto ca3 = std::make_shared(ca3_arrs); - TestChunkedArrayBinary(kernel, ca1, ca2, ca3); - - // ChunkedArray with different chunks - 
std::vector> ca4_arrs = {a1->Slice(0, 1), a1->Slice(1), - a1->Slice(1, 1), a1->Slice(2)}; - auto ca4 = std::make_shared(ca4_arrs); - TestChunkedArrayBinary(kernel, ca4, ca2, ca3); - } -}; - -TEST_F(TestBooleanKernel, Invert) { - vector values1 = {true, false, true}; - vector values2 = {false, true, false}; - - auto type = boolean(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - // Plain array - Datum result; - ASSERT_OK(Invert(&this->ctx_, a1, &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - std::shared_ptr result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(a2)); - - // Array with offset - ASSERT_OK(Invert(&this->ctx_, a1->Slice(1), &result)); - ASSERT_EQ(Datum::ARRAY, result.kind()); - result_array = result.make_array(); - ASSERT_TRUE(result_array->Equals(a2->Slice(1))); - - // ChunkedArray - std::vector> ca1_arrs = {a1, a1->Slice(1)}; - auto ca1 = std::make_shared(ca1_arrs); - std::vector> ca2_arrs = {a2, a2->Slice(1)}; - auto ca2 = std::make_shared(ca2_arrs); - ASSERT_OK(Invert(&this->ctx_, ca1, &result)); - ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); - std::shared_ptr result_ca = result.chunked_array(); - ASSERT_TRUE(result_ca->Equals(ca2)); -} - -TEST_F(TestBooleanKernel, And) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {true, false, false, false, true, false}; - TestBinaryKernel(And, values1, values2, values3, values3); -} - -TEST_F(TestBooleanKernel, Or) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {true, true, true, false, true, true}; - vector values3_nulls = {true, false, false, false, true, false}; - TestBinaryKernel(Or, values1, values2, values3, values3_nulls); -} - -TEST_F(TestBooleanKernel, Xor) { - vector values1 = {true, false, true, false, true, true}; - vector values2 = {true, true, false, false, true, false}; - vector values3 = {false, true, true, false, false, true}; - vector values3_nulls = {true, false, false, false, true, false}; - TestBinaryKernel(Xor, values1, values2, values3, values3_nulls); -} - class TestInvokeBinaryKernel : public ComputeFixture, public TestBase {}; class DummyBinaryKernel : public BinaryKernel { diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index a5a142b5c28ce..4d508aacb9990 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -16,3 +16,7 @@ # under the License. ARROW_INSTALL_ALL_HEADERS("arrow/compute/kernels") + +ADD_ARROW_TEST(boolean-test PREFIX "arrow-compute") +ADD_ARROW_TEST(cast-test PREFIX "arrow-compute") +ADD_ARROW_TEST(hash-test PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/boolean-test.cc b/cpp/src/arrow/compute/kernels/boolean-test.cc new file mode 100644 index 0000000000000..24b3c68aa1cfb --- /dev/null +++ b/cpp/src/arrow/compute/kernels/boolean-test.cc @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +#include "arrow/test-common.h" +#include "arrow/test-util.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/boolean.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +using BinaryKernelFunc = + std::function; + +class TestBooleanKernel : public ComputeFixture, public TestBase { + public: + void TestArrayBinary(const BinaryKernelFunc& kernel, const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + Datum result; + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + std::shared_ptr result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(expected)); + } + + void TestChunkedArrayBinary(const BinaryKernelFunc& kernel, + const std::shared_ptr& left, + const std::shared_ptr& right, + const std::shared_ptr& expected) { + Datum result; + std::shared_ptr result_array; + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); + std::shared_ptr result_ca = result.chunked_array(); + ASSERT_TRUE(result_ca->Equals(expected)); + } + + void TestBinaryKernel(const BinaryKernelFunc& kernel, const std::vector& values1, + const std::vector& values2, + const std::vector& values3, + const std::vector& values3_nulls) { + auto type = boolean(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + auto a3 = _MakeArray(type, values3, {}); + auto a1_nulls = _MakeArray(type, values1, values1); + auto a2_nulls = _MakeArray(type, values2, values2); + auto a3_nulls = _MakeArray(type, values3, values3_nulls); + + TestArrayBinary(kernel, a1, a2, a3); + TestArrayBinary(kernel, a1_nulls, a2_nulls, a3_nulls); + TestArrayBinary(kernel, a1->Slice(1), a2->Slice(1), a3->Slice(1)); + TestArrayBinary(kernel, a1_nulls->Slice(1), a2_nulls->Slice(1), a3_nulls->Slice(1)); + + // ChunkedArray + std::vector> ca1_arrs = {a1, a1->Slice(1)}; + auto ca1 = std::make_shared(ca1_arrs); + std::vector> ca2_arrs = {a2, a2->Slice(1)}; + auto ca2 = std::make_shared(ca2_arrs); + std::vector> ca3_arrs = {a3, a3->Slice(1)}; + auto ca3 = std::make_shared(ca3_arrs); + TestChunkedArrayBinary(kernel, ca1, ca2, ca3); + + // ChunkedArray with different chunks + std::vector> ca4_arrs = {a1->Slice(0, 1), a1->Slice(1), + a1->Slice(1, 1), a1->Slice(2)}; + auto ca4 = std::make_shared(ca4_arrs); + TestChunkedArrayBinary(kernel, ca4, ca2, ca3); + } +}; + +TEST_F(TestBooleanKernel, Invert) { + vector values1 = {true, false, true}; + vector values2 = {false, true, false}; + + auto type = boolean(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + // Plain array + Datum result; + ASSERT_OK(Invert(&this->ctx_, a1, &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + std::shared_ptr result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(a2)); + + // Array with offset + 
ASSERT_OK(Invert(&this->ctx_, a1->Slice(1), &result)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + result_array = result.make_array(); + ASSERT_TRUE(result_array->Equals(a2->Slice(1))); + + // ChunkedArray + std::vector> ca1_arrs = {a1, a1->Slice(1)}; + auto ca1 = std::make_shared(ca1_arrs); + std::vector> ca2_arrs = {a2, a2->Slice(1)}; + auto ca2 = std::make_shared(ca2_arrs); + ASSERT_OK(Invert(&this->ctx_, ca1, &result)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); + std::shared_ptr result_ca = result.chunked_array(); + ASSERT_TRUE(result_ca->Equals(ca2)); +} + +TEST_F(TestBooleanKernel, And) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {true, false, false, false, true, false}; + TestBinaryKernel(And, values1, values2, values3, values3); +} + +TEST_F(TestBooleanKernel, Or) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {true, true, true, false, true, true}; + vector values3_nulls = {true, false, false, false, true, false}; + TestBinaryKernel(Or, values1, values2, values3, values3_nulls); +} + +TEST_F(TestBooleanKernel, Xor) { + vector values1 = {true, false, true, false, true, true}; + vector values2 = {true, true, false, false, true, false}; + vector values3 = {false, true, true, false, false, true}; + vector values3_nulls = {true, false, false, false, true, false}; + TestBinaryKernel(Xor, values1, values2, values3, values3_nulls); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc new file mode 100644 index 0000000000000..4c3992868ef6d --- /dev/null +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -0,0 +1,1197 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/test-common.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/hash.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) { + ASSERT_EQ(left.data()->buffers[buffer_index].get(), + right.data()->buffers[buffer_index].get()); +} + +class TestCast : public ComputeFixture, public TestBase { + public: + void CheckPass(const Array& input, const Array& expected, + const shared_ptr& out_type, const CastOptions& options) { + shared_ptr result; + ASSERT_OK(Cast(&ctx_, input, out_type, options, &result)); + ASSERT_ARRAYS_EQUAL(expected, *result); + } + + template + void CheckFails(const shared_ptr& in_type, const vector& in_values, + const vector& is_valid, const shared_ptr& out_type, + const CastOptions& options) { + shared_ptr input, result; + if (is_valid.size() > 0) { + ArrayFromVector(in_type, is_valid, in_values, &input); + } else { + ArrayFromVector(in_type, in_values, &input); + } + ASSERT_RAISES(Invalid, Cast(&ctx_, *input, out_type, options, &result)); + } + + void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { + shared_ptr result; + ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); + AssertBufferSame(input, *result, 0); + AssertBufferSame(input, *result, 1); + } + + template + void CheckCase(const shared_ptr& in_type, const vector& in_values, + const vector& is_valid, const shared_ptr& out_type, + const vector& out_values, const CastOptions& options) { + DCHECK_EQ(in_values.size(), out_values.size()); + shared_ptr input, expected; + if (is_valid.size() > 0) { + DCHECK_EQ(is_valid.size(), out_values.size()); + ArrayFromVector(in_type, is_valid, in_values, &input); + ArrayFromVector(out_type, is_valid, out_values, &expected); + } else { + ArrayFromVector(in_type, in_values, &input); + ArrayFromVector(out_type, out_values, &expected); + } + CheckPass(*input, *expected, out_type, options); + + // Check a sliced variant + if (input->length() > 1) { + CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); + } + } +}; + +TEST_F(TestCast, SameTypeZeroCopy) { + vector is_valid = {true, false, true, true, true}; + vector v1 = {0, 1, 2, 3, 4}; + + shared_ptr arr; + ArrayFromVector(int32(), is_valid, v1, &arr); + + shared_ptr result; + ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); + + AssertBufferSame(*arr, *result, 0); + AssertBufferSame(*arr, *result, 1); +} + +TEST_F(TestCast, ToBoolean) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + // int8, should suffice for other integers + vector v1 = {0, 1, 127, -1, 0}; + vector e1 = {false, true, true, true, false}; + CheckCase(int8(), v1, is_valid, boolean(), e1, + options); + + // floating point + vector v2 = {1.0, 0, 0, -1.0, 5.0}; + vector e2 = {true, false, false, true, true}; + CheckCase(float64(), v2, is_valid, boolean(), e2, + options); +} + +TEST_F(TestCast, 
ToIntUpcast) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + // int8 to int32 + vector v1 = {0, 1, 127, -1, 0}; + vector e1 = {0, 1, 127, -1, 0}; + CheckCase(int8(), v1, is_valid, int32(), e1, + options); + + // bool to int8 + vector v2 = {false, true, false, true, true}; + vector e2 = {0, 1, 0, 1, 1}; + CheckCase(boolean(), v2, is_valid, int8(), e2, + options); + + // uint8 to int16, no overflow/underrun + vector v3 = {0, 100, 200, 255, 0}; + vector e3 = {0, 100, 200, 255, 0}; + CheckCase(uint8(), v3, is_valid, int16(), e3, + options); +} + +TEST_F(TestCast, OverflowInNullSlot) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + vector v11 = {0, 70000, 2000, 1000, 0}; + vector e11 = {0, 0, 2000, 1000, 0}; + + shared_ptr expected; + ArrayFromVector(int16(), is_valid, e11, &expected); + + auto buf = Buffer::Wrap(v11.data(), v11.size()); + Int32Array tmp11(5, buf, expected->null_bitmap(), -1); + + CheckPass(tmp11, *expected, int16(), options); +} + +TEST_F(TestCast, ToIntDowncastSafe) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + // int16 to uint8, no overflow/underrun + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, uint8(), e1, + options); + + // int16 to uint8, with overflow + vector v2 = {0, 100, 256, 0, 0}; + CheckFails(int16(), v2, is_valid, uint8(), options); + + // underflow + vector v3 = {0, 100, -1, 0, 0}; + CheckFails(int16(), v3, is_valid, uint8(), options); + + // int32 to int16, no overflow + vector v4 = {0, 1000, 2000, 1, 2}; + vector e4 = {0, 1000, 2000, 1, 2}; + CheckCase(int32(), v4, is_valid, int16(), e4, + options); + + // int32 to int16, overflow + vector v5 = {0, 1000, 2000, 70000, 0}; + CheckFails(int32(), v5, is_valid, int16(), options); + + // underflow + vector v6 = {0, 1000, 2000, -70000, 0}; + CheckFails(int32(), v6, is_valid, int16(), options); + + vector v7 = {0, 1000, 2000, -70000, 0}; + CheckFails(int32(), v7, is_valid, uint8(), options); +} + +template +std::vector UnsafeVectorCast(const std::vector& v) { + size_t n_elems = v.size(); + std::vector result(n_elems); + + for (size_t i = 0; i < v.size(); i++) result[i] = static_cast(v[i]); + + return std::move(result); +} + +TEST_F(TestCast, IntegerSignedToUnsigned) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + vector v1 = {INT32_MIN, 100, -1, UINT16_MAX, INT32_MAX}; + + // Same width + CheckFails(int32(), v1, is_valid, uint32(), options); + // Wider + CheckFails(int32(), v1, is_valid, uint64(), options); + // Narrower + CheckFails(int32(), v1, is_valid, uint16(), options); + // Fail because of overflow (instead of underflow). 
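// (Aside, not part of the patch: with allow_int_overflow == false the kernel is
// expected to reject any value that does not fit the target type, in either
// direction; with allow_int_overflow == true it falls back to plain static_cast
// wrap-around, e.g. int32 -1 becomes uint16 65535, which is what the
// UnsafeVectorCast helper above models for the expected outputs.)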
+ vector over = {0, -11, 0, UINT16_MAX + 1, INT32_MAX}; + CheckFails(int32(), over, is_valid, uint16(), options); + + options.allow_int_overflow = true; + + CheckCase( + int32(), v1, is_valid, uint32(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), v1, is_valid, uint64(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), v1, is_valid, uint16(), UnsafeVectorCast(v1), options); + CheckCase( + int32(), over, is_valid, uint16(), UnsafeVectorCast(over), + options); +} + +TEST_F(TestCast, IntegerUnsignedToSigned) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, true, true}; + + vector v1 = {0, INT16_MAX + 1, UINT32_MAX}; + vector v2 = {0, INT16_MAX + 1, 2}; + // Same width + CheckFails(uint32(), v1, is_valid, int32(), options); + // Narrower + CheckFails(uint32(), v1, is_valid, int16(), options); + CheckFails(uint32(), v2, is_valid, int16(), options); + + options.allow_int_overflow = true; + + CheckCase( + uint32(), v1, is_valid, int32(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v1, is_valid, int64(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v1, is_valid, int16(), UnsafeVectorCast(v1), options); + CheckCase( + uint32(), v2, is_valid, int16(), UnsafeVectorCast(v2), options); +} + +TEST_F(TestCast, ToIntDowncastUnsafe) { + CastOptions options; + options.allow_int_overflow = true; + + vector is_valid = {true, false, true, true, true}; + + // int16 to uint8, no overflow/underrun + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, uint8(), e1, + options); + + // int16 to uint8, with overflow + vector v2 = {0, 100, 256, 0, 0}; + vector e2 = {0, 100, 0, 0, 0}; + CheckCase(int16(), v2, is_valid, uint8(), e2, + options); + + // underflow + vector v3 = {0, 100, -1, 0, 0}; + vector e3 = {0, 100, 255, 0, 0}; + CheckCase(int16(), v3, is_valid, uint8(), e3, + options); + + // int32 to int16, no overflow + vector v4 = {0, 1000, 2000, 1, 2}; + vector e4 = {0, 1000, 2000, 1, 2}; + CheckCase(int32(), v4, is_valid, int16(), e4, + options); + + // int32 to int16, overflow + // TODO(wesm): do we want to allow this? we could set to null + vector v5 = {0, 1000, 2000, 70000, 0}; + vector e5 = {0, 1000, 2000, 4464, 0}; + CheckCase(int32(), v5, is_valid, int16(), e5, + options); + + // underflow + // TODO(wesm): do we want to allow this? 
we could set overflow to null + vector v6 = {0, 1000, 2000, -70000, 0}; + vector e6 = {0, 1000, 2000, -4464, 0}; + CheckCase(int32(), v6, is_valid, int16(), e6, + options); +} + +TEST_F(TestCast, FloatingPointToInt) { + // which means allow_float_truncate == false + auto options = CastOptions::Safe(); + + vector is_valid = {true, false, true, true, true}; + vector all_valid = {true, true, true, true, true}; + + // float32 to int32 no truncation + vector v1 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e1 = {1, 0, 0, -1, 5}; + CheckCase(float32(), v1, is_valid, int32(), e1, + options); + CheckCase(float32(), v1, all_valid, int32(), e1, + options); + + // float64 to int32 no truncation + vector v2 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e2 = {1, 0, 0, -1, 5}; + CheckCase(float64(), v2, is_valid, int32(), e2, + options); + CheckCase(float64(), v2, all_valid, int32(), e2, + options); + + // float64 to int64 no truncation + vector v3 = {1.0, 0, 0.0, -1.0, 5.0}; + vector e3 = {1, 0, 0, -1, 5}; + CheckCase(float64(), v3, is_valid, int64(), e3, + options); + CheckCase(float64(), v3, all_valid, int64(), e3, + options); + + // float64 to int32 truncate + vector v4 = {1.5, 0, 0.5, -1.5, 5.5}; + vector e4 = {1, 0, 0, -1, 5}; + + options.allow_float_truncate = false; + CheckFails(float64(), v4, is_valid, int32(), options); + CheckFails(float64(), v4, all_valid, int32(), options); + + options.allow_float_truncate = true; + CheckCase(float64(), v4, is_valid, int32(), e4, + options); + CheckCase(float64(), v4, all_valid, int32(), e4, + options); + + // float64 to int64 truncate + vector v5 = {1.5, 0, 0.5, -1.5, 5.5}; + vector e5 = {1, 0, 0, -1, 5}; + + options.allow_float_truncate = false; + CheckFails(float64(), v5, is_valid, int64(), options); + CheckFails(float64(), v5, all_valid, int64(), options); + + options.allow_float_truncate = true; + CheckCase(float64(), v5, is_valid, int64(), e5, + options); + CheckCase(float64(), v5, all_valid, int64(), e5, + options); +} + +TEST_F(TestCast, IntToFloatingPoint) { + auto options = CastOptions::Safe(); + + vector all_valid = {true, true, true, true, true}; + vector all_invalid = {false, false, false, false, false}; + + vector v1 = {INT64_MIN, INT64_MIN + 1, 0, INT64_MAX - 1, INT64_MAX}; + CheckFails(int64(), v1, all_valid, float32(), options); + + // While it's not safe to convert, all values are null. 
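// (Aside, not part of the patch: float32 carries a 24-bit significand and
// float64 a 53-bit one, so extreme int64 values such as INT64_MAX cannot be
// represented exactly, which is why the safe cast above is expected to fail.
// When every slot is null, as in the case below, the values are never
// inspected and the conversion is allowed to proceed.)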
+ CheckCase(int64(), v1, all_invalid, float64(), + UnsafeVectorCast(v1), + options); +} + +TEST_F(TestCast, TimestampToTimestamp) { + CastOptions options; + + auto CheckTimestampCast = + [this](const CastOptions& options, TimeUnit::type from_unit, TimeUnit::type to_unit, + const vector& from_values, const vector& to_values, + const vector& is_valid) { + CheckCase( + timestamp(from_unit), from_values, is_valid, timestamp(to_unit), to_values, + options); + }; + + vector is_valid = {true, false, true, true, true}; + + // Multiply promotions + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MILLI, v1, e1, is_valid); + + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::MICRO, v2, e2, is_valid); + + vector v3 = {0, 100, 200, 1, 2}; + vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; + CheckTimestampCast(options, TimeUnit::SECOND, TimeUnit::NANO, v3, e3, is_valid); + + vector v4 = {0, 100, 200, 1, 2}; + vector e4 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::MICRO, v4, e4, is_valid); + + vector v5 = {0, 100, 200, 1, 2}; + vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::NANO, v5, e5, is_valid); + + vector v6 = {0, 100, 200, 1, 2}; + vector e6 = {0, 100000, 200000, 1000, 2000}; + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::NANO, v6, e6, is_valid); + + // Zero copy + vector v7 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(timestamp(TimeUnit::SECOND), is_valid, v7, + &arr); + CheckZeroCopy(*arr, timestamp(TimeUnit::SECOND)); + + // ARROW-1773, cast to integer + CheckZeroCopy(*arr, int64()); + + // Divide, truncate + vector v8 = {0, 100123, 200456, 1123, 2456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckTimestampCast(options, TimeUnit::MILLI, TimeUnit::SECOND, v8, e8, is_valid); + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::MILLI, v8, e8, is_valid); + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MICRO, v8, e8, is_valid); + + vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; + vector e9 = {0, 100, 200, 1, 2}; + CheckTimestampCast(options, TimeUnit::MICRO, TimeUnit::SECOND, v9, e9, is_valid); + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::MILLI, v9, e9, is_valid); + + vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; + vector e10 = {0, 100, 200, 1, 2}; + CheckTimestampCast(options, TimeUnit::NANO, TimeUnit::SECOND, v10, e10, is_valid); + + // Disallow truncate, failures + options.allow_time_truncate = false; + CheckFails(timestamp(TimeUnit::MILLI), v8, is_valid, + timestamp(TimeUnit::SECOND), options); + CheckFails(timestamp(TimeUnit::MICRO), v8, is_valid, + timestamp(TimeUnit::MILLI), options); + CheckFails(timestamp(TimeUnit::NANO), v8, is_valid, + timestamp(TimeUnit::MICRO), options); + CheckFails(timestamp(TimeUnit::MICRO), v9, is_valid, + timestamp(TimeUnit::SECOND), options); + CheckFails(timestamp(TimeUnit::NANO), v9, is_valid, + timestamp(TimeUnit::MILLI), options); + CheckFails(timestamp(TimeUnit::NANO), v10, is_valid, + timestamp(TimeUnit::SECOND), options); +} + +TEST_F(TestCast, TimestampToDate32_Date64) { + CastOptions options; + + vector is_valid = {true, true, false}; + + // 2000-01-01, 2000-01-02, null + vector v_nano = 
{946684800000000000, 946771200000000000, 0}; + vector v_micro = {946684800000000, 946771200000000, 0}; + vector v_milli = {946684800000, 946771200000, 0}; + vector v_second = {946684800, 946771200, 0}; + vector v_day = {10957, 10958, 0}; + + // Simple conversions + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date64(), v_milli, options); + + CheckCase( + timestamp(TimeUnit::NANO), v_nano, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MICRO), v_micro, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::MILLI), v_milli, is_valid, date32(), v_day, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second, is_valid, date32(), v_day, options); + + // Disallow truncate, failures + vector v_nano_fail = {946684800000000001, 946771200000000001, 0}; + vector v_micro_fail = {946684800000001, 946771200000001, 0}; + vector v_milli_fail = {946684800001, 946771200001, 0}; + vector v_second_fail = {946684801, 946771201, 0}; + + options.allow_time_truncate = false; + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date64(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date64(), options); + + CheckFails(timestamp(TimeUnit::NANO), v_nano_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MICRO), v_micro_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::MILLI), v_milli_fail, is_valid, date32(), + options); + CheckFails(timestamp(TimeUnit::SECOND), v_second_fail, is_valid, + date32(), options); + + // Make sure that nulls are excluded from the truncation checks + vector v_second_nofail = {946684800, 946771200, 1}; + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date64(), v_milli, options); + CheckCase( + timestamp(TimeUnit::SECOND), v_second_nofail, is_valid, date32(), v_day, options); +} + +TEST_F(TestCast, TimeToCompatible) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + // Multiply promotions + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time32(TimeUnit::SECOND), v1, is_valid, time32(TimeUnit::MILLI), e1, options); + + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckCase( + time32(TimeUnit::SECOND), v2, is_valid, time64(TimeUnit::MICRO), e2, options); + + vector v3 = {0, 100, 200, 1, 2}; + vector e3 = {0, 100000000000L, 200000000000L, 1000000000L, 2000000000L}; + CheckCase( + time32(TimeUnit::SECOND), v3, is_valid, time64(TimeUnit::NANO), e3, options); + + vector v4 = {0, 100, 200, 1, 2}; + vector e4 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time32(TimeUnit::MILLI), v4, is_valid, time64(TimeUnit::MICRO), e4, options); + + vector v5 = {0, 100, 200, 1, 2}; + vector e5 = {0, 100000000L, 200000000L, 1000000, 2000000}; + CheckCase( + time32(TimeUnit::MILLI), v5, is_valid, time64(TimeUnit::NANO), e5, options); + + vector v6 = {0, 100, 200, 1, 2}; + vector e6 = {0, 100000, 200000, 1000, 2000}; + CheckCase( + time64(TimeUnit::MICRO), v6, is_valid, 
time64(TimeUnit::NANO), e6, options); + + // Zero copy + vector v7 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(time64(TimeUnit::MICRO), is_valid, v7, &arr); + CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); + + // ARROW-1773: cast to int64 + CheckZeroCopy(*arr, int64()); + + vector v7_2 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(time32(TimeUnit::SECOND), is_valid, v7_2, &arr); + CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); + + // ARROW-1773: cast to int64 + CheckZeroCopy(*arr, int32()); + + // Divide, truncate + vector v8 = {0, 100123, 200456, 1123, 2456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckCase( + time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), e8, options); + CheckCase( + time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), e8, options); + CheckCase( + time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), e8, options); + + vector v9 = {0, 100123000, 200456000, 1123000, 2456000}; + vector e9 = {0, 100, 200, 1, 2}; + CheckCase( + time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), e9, options); + CheckCase( + time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), e9, options); + + vector v10 = {0, 100123000000L, 200456000000L, 1123000000L, 2456000000}; + vector e10 = {0, 100, 200, 1, 2}; + CheckCase( + time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), e10, options); + + // Disallow truncate, failures + + options.allow_time_truncate = false; + CheckFails(time32(TimeUnit::MILLI), v8, is_valid, time32(TimeUnit::SECOND), + options); + CheckFails(time64(TimeUnit::MICRO), v8, is_valid, time32(TimeUnit::MILLI), + options); + CheckFails(time64(TimeUnit::NANO), v8, is_valid, time64(TimeUnit::MICRO), + options); + CheckFails(time64(TimeUnit::MICRO), v9, is_valid, time32(TimeUnit::SECOND), + options); + CheckFails(time64(TimeUnit::NANO), v9, is_valid, time32(TimeUnit::MILLI), + options); + CheckFails(time64(TimeUnit::NANO), v10, is_valid, time32(TimeUnit::SECOND), + options); +} + +TEST_F(TestCast, PrimitiveZeroCopy) { + shared_ptr arr; + + ArrayFromVector(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, uint8()); + ArrayFromVector(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, int8()); + + ArrayFromVector(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, uint16()); + ArrayFromVector(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, int16()); + + ArrayFromVector(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, uint32()); + ArrayFromVector(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, int32()); + + ArrayFromVector(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, uint64()); + ArrayFromVector(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, int64()); + + ArrayFromVector(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, float32()); + + ArrayFromVector(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); + CheckZeroCopy(*arr, float64()); +} + +TEST_F(TestCast, DateToCompatible) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + constexpr int64_t F = 86400000; + + // Multiply promotion + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100 * F, 200 * F, F, 2 * F}; + CheckCase(date32(), v1, is_valid, date64(), + e1, options); + + // Zero copy + vector v2 = {0, 70000, 2000, 1000, 0}; + vector v3 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(date32(), is_valid, v2, 
&arr); + CheckZeroCopy(*arr, date32()); + + // ARROW-1773: zero copy cast to integer + CheckZeroCopy(*arr, int32()); + + ArrayFromVector(date64(), is_valid, v3, &arr); + CheckZeroCopy(*arr, date64()); + + // ARROW-1773: zero copy cast to integer + CheckZeroCopy(*arr, int64()); + + // Divide, truncate + vector v8 = {0, 100 * F + 123, 200 * F + 456, F + 123, 2 * F + 456}; + vector e8 = {0, 100, 200, 1, 2}; + + options.allow_time_truncate = true; + CheckCase(date64(), v8, is_valid, date32(), + e8, options); + + // Disallow truncate, failures + options.allow_time_truncate = false; + CheckFails(date64(), v8, is_valid, date32(), options); +} + +TEST_F(TestCast, ToDouble) { + CastOptions options; + vector is_valid = {true, false, true, true, true}; + + // int16 to double + vector v1 = {0, 100, 200, 1, 2}; + vector e1 = {0, 100, 200, 1, 2}; + CheckCase(int16(), v1, is_valid, float64(), e1, + options); + + // float to double + vector v2 = {0, 100, 200, 1, 2}; + vector e2 = {0, 100, 200, 1, 2}; + CheckCase(float32(), v2, is_valid, float64(), e2, + options); + + // bool to double + vector v3 = {true, true, false, false, true}; + vector e3 = {1, 1, 0, 0, 1}; + CheckCase(boolean(), v3, is_valid, float64(), e3, + options); +} + +TEST_F(TestCast, ChunkedArray) { + vector values1 = {0, 1, 2}; + vector values2 = {3, 4, 5}; + + auto type = int16(); + auto out_type = int64(); + + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared(arrays); + + CastOptions options; + + Datum out; + ASSERT_OK(Cast(&this->ctx_, carr, out_type, options, &out)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); + + auto out_carr = out.chunked_array(); + + vector ex_values1 = {0, 1, 2}; + vector ex_values2 = {3, 4, 5}; + auto a3 = _MakeArray(out_type, ex_values1, {}); + auto a4 = _MakeArray(out_type, ex_values2, {}); + + ArrayVector ex_arrays = {a3, a4}; + auto ex_carr = std::make_shared(ex_arrays); + + ASSERT_TRUE(out.chunked_array()->Equals(*ex_carr)); +} + +TEST_F(TestCast, UnsupportedTarget) { + vector is_valid = {true, false, true, true, true}; + vector v1 = {0, 1, 2, 3, 4}; + + shared_ptr arr; + ArrayFromVector(int32(), is_valid, v1, &arr); + + shared_ptr result; + ASSERT_RAISES(NotImplemented, Cast(&this->ctx_, *arr, utf8(), {}, &result)); +} + +TEST_F(TestCast, DateTimeZeroCopy) { + vector is_valid = {true, false, true, true, true}; + + vector v1 = {0, 70000, 2000, 1000, 0}; + shared_ptr arr; + ArrayFromVector(int32(), is_valid, v1, &arr); + + CheckZeroCopy(*arr, time32(TimeUnit::SECOND)); + CheckZeroCopy(*arr, date32()); + + vector v2 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(int64(), is_valid, v2, &arr); + + CheckZeroCopy(*arr, time64(TimeUnit::MICRO)); + CheckZeroCopy(*arr, date64()); + CheckZeroCopy(*arr, timestamp(TimeUnit::NANO)); +} + +TEST_F(TestCast, FromNull) { + // Null casts to everything + const int length = 10; + + NullArray arr(length); + + shared_ptr result; + ASSERT_OK(Cast(&ctx_, arr, int32(), {}, &result)); + + ASSERT_EQ(length, result->length()); + ASSERT_EQ(length, result->null_count()); + + // OK to look at bitmaps + ASSERT_ARRAYS_EQUAL(*result, *result); +} + +TEST_F(TestCast, PreallocatedMemory) { + CastOptions options; + options.allow_int_overflow = false; + + vector is_valid = {true, false, true, true, true}; + + const int64_t length = 5; + + shared_ptr arr; + vector v1 = {0, 70000, 2000, 1000, 0}; + vector e1 = {0, 70000, 2000, 1000, 0}; + ArrayFromVector(int32(), is_valid, v1, &arr); + + auto 
out_type = int64(); + + std::unique_ptr kernel; + ASSERT_OK(GetCastFunction(*int32(), out_type, options, &kernel)); + + auto out_data = ArrayData::Make(out_type, length); + + shared_ptr out_values; + ASSERT_OK(this->ctx_.Allocate(length * sizeof(int64_t), &out_values)); + + out_data->buffers.push_back(nullptr); + out_data->buffers.push_back(out_values); + + Datum out(out_data); + ASSERT_OK(kernel->Call(&this->ctx_, arr, &out)); + + // Buffer address unchanged + ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); + + shared_ptr result = MakeArray(out_data); + shared_ptr expected; + ArrayFromVector(int64(), is_valid, e1, &expected); + + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +template +void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr& in_type, + const vector& in_values, + const std::shared_ptr& out_type, + const vector& out_values) { + using OutTraits = TypeTraits; + + CastOptions options; + + const int64_t length = static_cast(in_values.size()); + + shared_ptr arr, expected; + ArrayFromVector(in_type, in_values, &arr); + ArrayFromVector(out_type, out_values, &expected); + + shared_ptr out_buffer; + ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer)); + + std::unique_ptr kernel; + ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel)); + + const int64_t first_half = length / 2; + + auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer}); + auto out_second_data = out_data->Copy(); + out_second_data->offset = first_half; + + Datum out_first(out_data); + Datum out_second(out_second_data); + + // Cast each bit + ASSERT_OK(kernel->Call(ctx, arr->Slice(0, first_half), &out_first)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(first_half), &out_second)); + + shared_ptr result = MakeArray(out_data); + + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +TEST_F(TestCast, OffsetOutputBuffer) { + // ARROW-1735 + vector v1 = {0, 10000, 2000, 1000, 0}; + vector e1 = {0, 10000, 2000, 1000, 0}; + + auto in_type = int32(); + auto out_type = int64(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + out_type, e1); + + vector e2 = {false, true, true, true, false}; + + out_type = boolean(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + boolean(), e2); + + vector e3 = {0, 10000, 2000, 1000, 0}; + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + int16(), e3); +} + +TEST_F(TestCast, StringToBoolean) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + vector v1 = {"False", "true", "true", "True", "false"}; + vector v2 = {"0", "1", "1", "1", "0"}; + vector e = {false, true, true, true, false}; + CheckCase(utf8(), v1, is_valid, boolean(), + e, options); + CheckCase(utf8(), v2, is_valid, boolean(), + e, options); +} + +TEST_F(TestCast, StringToBooleanErrors) { + CastOptions options; + + vector is_valid = {true}; + + CheckFails(utf8(), {"false "}, is_valid, boolean(), options); + CheckFails(utf8(), {"T"}, is_valid, boolean(), options); +} + +TEST_F(TestCast, StringToNumber) { + CastOptions options; + + vector is_valid = {true, false, true, true, true}; + + // string to int + vector v_int = {"0", "1", "127", "-1", "0"}; + vector e_int8 = {0, 1, 127, -1, 0}; + vector e_int16 = {0, 1, 127, -1, 0}; + vector e_int32 = {0, 1, 127, -1, 0}; + vector e_int64 = {0, 1, 127, -1, 0}; + CheckCase(utf8(), v_int, is_valid, int8(), + e_int8, options); + CheckCase(utf8(), v_int, is_valid, int16(), + e_int16, options); + CheckCase(utf8(), v_int, is_valid, int32(), + e_int32, options); + CheckCase(utf8(), v_int, is_valid, 
int64(), + e_int64, options); + + v_int = {"2147483647", "0", "-2147483648", "0", "0"}; + e_int32 = {2147483647, 0, -2147483648LL, 0, 0}; + CheckCase(utf8(), v_int, is_valid, int32(), + e_int32, options); + v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"}; + e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0}; + CheckCase(utf8(), v_int, is_valid, int64(), + e_int64, options); + + // string to uint + vector v_uint = {"0", "1", "127", "255", "0"}; + vector e_uint8 = {0, 1, 127, 255, 0}; + vector e_uint16 = {0, 1, 127, 255, 0}; + vector e_uint32 = {0, 1, 127, 255, 0}; + vector e_uint64 = {0, 1, 127, 255, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint8(), e_uint8, options); + CheckCase(utf8(), v_uint, is_valid, + uint16(), e_uint16, options); + CheckCase(utf8(), v_uint, is_valid, + uint32(), e_uint32, options); + CheckCase(utf8(), v_uint, is_valid, + uint64(), e_uint64, options); + + v_uint = {"4294967295", "0", "0", "0", "0"}; + e_uint32 = {4294967295, 0, 0, 0, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint32(), e_uint32, options); + v_uint = {"18446744073709551615", "0", "0", "0", "0"}; + e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0}; + CheckCase(utf8(), v_uint, is_valid, + uint64(), e_uint64, options); + + // string to float + vector v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"}; + vector e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f}; + vector e_double = {0.1, 1.2, 127.3, 200.4, 0.5}; + CheckCase(utf8(), v_float, is_valid, + float32(), e_float, options); + CheckCase(utf8(), v_float, is_valid, + float64(), e_double, options); + + // Test that casting is locale-independent + auto global_locale = std::locale(); + try { + // French locale uses the comma as decimal point + std::locale::global(std::locale("fr_FR.UTF-8")); + } catch (std::runtime_error&) { + // Locale unavailable, ignore + } + CheckCase(utf8(), v_float, is_valid, + float32(), e_float, options); + CheckCase(utf8(), v_float, is_valid, + float64(), e_double, options); + std::locale::global(global_locale); +} + +TEST_F(TestCast, StringToNumberErrors) { + CastOptions options; + + vector is_valid = {true}; + + CheckFails(utf8(), {"z"}, is_valid, int8(), options); + CheckFails(utf8(), {"12 z"}, is_valid, int8(), options); + CheckFails(utf8(), {"128"}, is_valid, int8(), options); + CheckFails(utf8(), {"-129"}, is_valid, int8(), options); + CheckFails(utf8(), {"0.5"}, is_valid, int8(), options); + + CheckFails(utf8(), {"256"}, is_valid, uint8(), options); + CheckFails(utf8(), {"-1"}, is_valid, uint8(), options); + + CheckFails(utf8(), {"z"}, is_valid, float32(), options); +} + +TEST_F(TestCast, StringToTimestamp) { + CastOptions options; + + vector is_valid = {true, false, true}; + vector strings = {"1970-01-01", "xxx", "2000-02-29"}; + + auto type = timestamp(TimeUnit::SECOND); + vector e = {0, 0, 951782400}; + CheckCase(utf8(), strings, is_valid, + type, e, options); + + type = timestamp(TimeUnit::MICRO); + e = {0, 0, 951782400000000LL}; + CheckCase(utf8(), strings, is_valid, + type, e, options); + + // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc +} + +TEST_F(TestCast, StringToTimestampErrors) { + CastOptions options; + + vector is_valid = {true}; + + for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) { + auto type = timestamp(unit); + CheckFails(utf8(), {""}, is_valid, type, options); + CheckFails(utf8(), {"xxx"}, is_valid, type, options); + } +} + +constexpr const char* kInvalidUtf8 = "\xa0\xa1"; + +TEST_F(TestCast, 
BinaryToString) { + CastOptions options; + + // All valid except the last one + vector all = {1, 1, 1, 1, 1}; + vector valid = {1, 1, 1, 1, 0}; + vector strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8}; + + std::shared_ptr array; + + // Should accept when invalid but null. + ArrayFromVector(binary(), valid, strings, &array); + CheckZeroCopy(*array, utf8()); + + // Should refuse due to invalid utf8 payload + CheckFails(binary(), strings, all, utf8(), options); + + // Should accept due to option override + options.allow_invalid_utf8 = true; + CheckCase(binary(), strings, all, + utf8(), strings, options); +} + +template +class TestDictionaryCast : public TestCast {}; + +typedef ::testing::Types + TestTypes; + +TYPED_TEST_CASE(TestDictionaryCast, TestTypes); + +TYPED_TEST(TestDictionaryCast, Basic) { + CastOptions options; + shared_ptr plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + + Datum out; + ASSERT_OK(DictionaryEncode(&this->ctx_, plain_array->data(), &out)); + + this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); +} + +TEST_F(TestCast, DictToNonDictNoNulls) { + vector dict_values = {"foo", "bar", "baz"}; + auto ex_dict = _MakeArray(utf8(), dict_values, {}); + auto dict_type = dictionary(int32(), ex_dict); + + // Explicitly construct with nullptr for the null_bitmap_data + std::vector i1 = {1, 0, 1}; + std::vector i2 = {2, 1, 0, 1}; + auto c1 = std::make_shared>(3, Buffer::Wrap(i1)); + auto c2 = std::make_shared>(4, Buffer::Wrap(i2)); + + ArrayVector dict_arrays = {std::make_shared(dict_type, c1), + std::make_shared(dict_type, c2)}; + auto dict_carr = std::make_shared(dict_arrays); + + Datum cast_input(dict_carr); + Datum cast_output; + // Ensure that casting works even when the null_bitmap_data array is a nullptr + ASSERT_OK(Cast(&this->ctx_, cast_input, + static_cast(*dict_type).dictionary()->type(), + CastOptions(), &cast_output)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, cast_output.kind()); + + auto e1 = _MakeArray(utf8(), {"bar", "foo", "bar"}, {}); + auto e2 = _MakeArray(utf8(), {"baz", "bar", "foo", "bar"}, {}); + + auto chunks = cast_output.chunked_array()->chunks(); + ASSERT_EQ(chunks.size(), 2); + ASSERT_ARRAYS_EQUAL(*e1, *chunks[0]); + ASSERT_ARRAYS_EQUAL(*e2, *chunks[1]); +} + +/*TYPED_TEST(TestDictionaryCast, Reverse) { + CastOptions options; + shared_ptr plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + + shared_ptr dict_array; + ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array)); + + this->CheckPass(*plain_array, *dict_array, dict_array->type(), options); +}*/ + +TEST_F(TestCast, ListToList) { + CastOptions options; + std::shared_ptr offsets; + + vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; + std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; + ArrayFromVector(offsets_is_valid, offsets_values, &offsets); + + shared_ptr int32_plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + std::shared_ptr int32_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); + + std::shared_ptr int64_plain_array; + ASSERT_OK(Cast(&this->ctx_, *int32_plain_array, int64(), options, &int64_plain_array)); + std::shared_ptr int64_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int64_plain_array, pool_, &int64_list_array)); + + std::shared_ptr float64_plain_array; + ASSERT_OK( + Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); + std::shared_ptr float64_list_array; + 
ASSERT_OK( + ListArray::FromArrays(*offsets, *float64_plain_array, pool_, &float64_list_array)); + + CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), options); + CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options); + CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), options); + CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), options); + + options.allow_float_truncate = true; + CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options); + CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 4f7d7f822b3ab..b148486bd212f 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -37,6 +37,7 @@ #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/parsing.h" // IWYU pragma: keep +#include "arrow/util/utf8.h" #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" @@ -77,6 +78,19 @@ namespace compute { constexpr int64_t kMillisecondsInDay = 86400000; +template +struct is_binary_to_string { + static constexpr bool value = false; +}; + +template +struct is_binary_to_string< + O, I, + typename std::enable_if::value && + std::is_base_of::value>::type> { + static constexpr bool value = true; +}; + // ---------------------------------------------------------------------- // Zero copy casts @@ -112,15 +126,30 @@ struct is_zero_copy_cast< static constexpr bool value = sizeof(O_T) == sizeof(I_T); }; +// Binary to String doesn't require copying, the payload only needs to be +// validated. +template +struct is_zero_copy_cast< + O, I, + typename std::enable_if::value && + is_binary_to_string::value>::type> { + static constexpr bool value = true; +}; + template struct CastFunctor {}; // Indicated no computation required +// +// The case BinaryType -> StringType is special cased due to validation +// requirements. 
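The binary-to-string functor added further down does not copy any data; it only walks the values and checks that each non-null slot is valid UTF-8 (unless CastOptions::allow_invalid_utf8 is set) before reusing the input buffers. A minimal sketch of that validation loop, assuming the string_view overload of ValidateUTF8 introduced later in this patch (the helper function name is for illustration only):

#include "arrow/array.h"
#include "arrow/status.h"
#include "arrow/util/utf8.h"

arrow::Status ValidateBinaryAsUtf8(const arrow::BinaryArray& binary) {
  arrow::util::InitializeUTF8();  // initialize the validation tables once
  for (int64_t i = 0; i < binary.length(); ++i) {
    if (binary.IsNull(i)) continue;  // null slots carry no payload to check
    if (!arrow::util::ValidateUTF8(binary.GetView(i))) {
      return arrow::Status::Invalid("Invalid UTF8 payload");
    }
  }
  return arrow::Status::OK();
}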
template -struct CastFunctor::value>::type> { +struct CastFunctor::value && + !is_binary_to_string::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - CopyData(input, output); + ZeroCopyData(input, output); } }; @@ -532,7 +561,7 @@ struct CastFunctor { const auto& out_type = checked_cast(*output->type); if (in_type.unit() == out_type.unit()) { - CopyData(input, output); + ZeroCopyData(input, output); return; } @@ -625,7 +654,7 @@ struct CastFunctor(*output->type); if (in_type.unit() == out_type.unit()) { - CopyData(input, output); + ZeroCopyData(input, output); return; } @@ -998,7 +1027,7 @@ struct CastFunctor { continue; } - auto str = input_array.GetView(i); + const auto str = input_array.GetView(i); if (!converter(str.data(), str.length(), out_data)) { std::stringstream ss; ss << "Failed to cast String '" << str << "' into " << output->type->ToString(); @@ -1009,6 +1038,52 @@ struct CastFunctor { } }; +// ---------------------------------------------------------------------- +// Binary to String +// + +template +struct CastFunctor< + StringType, I, + typename std::enable_if::value>::type> { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + BinaryArray binary(input.Copy()); + + if (options.allow_invalid_utf8) { + ZeroCopyData(input, output); + return; + } + + util::InitializeUTF8(); + + if (binary.null_count() != 0) { + for (int64_t i = 0; i < input.length; i++) { + if (binary.IsNull(i)) { + continue; + } + + const auto str = binary.GetView(i); + if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); + return; + } + } + + } else { + for (int64_t i = 0; i < input.length; i++) { + const auto str = binary.GetView(i); + if (ARROW_PREDICT_FALSE(!arrow::util::ValidateUTF8(str))) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 payload")); + return; + } + } + } + + ZeroCopyData(input, output); + } +}; + // ---------------------------------------------------------------------- typedef std::functionkind() == Datum::NONE) { - out->value = ArrayData::Make(out_type_, in_data.length); + switch (out->kind()) { + case Datum::NONE: + out->value = ArrayData::Make(out_type_, in_data.length); + break; + case Datum::ARRAY: + break; + default: + return Status::NotImplemented("CastKernel only supports Datum::ARRAY output"); } - result = out->array().get(); - + ArrayData* result = out->array().get(); if (!is_zero_copy_) { RETURN_NOT_OK( AllocateIfNotPreallocated(ctx, in_data, can_pre_allocate_values_, result)); @@ -1187,6 +1267,8 @@ class CastKernel : public UnaryKernel { FN(TimestampType, Date64Type); \ FN(TimestampType, Int64Type); +#define BINARY_CASES(FN, IN_TYPE) FN(BinaryType, StringType); + #define STRING_CASES(FN, IN_TYPE) \ FN(StringType, StringType); \ FN(StringType, BooleanType); \ @@ -1259,6 +1341,7 @@ GET_CAST_FUNCTION(DATE64_CASES, Date64Type); GET_CAST_FUNCTION(TIME32_CASES, Time32Type); GET_CAST_FUNCTION(TIME64_CASES, Time64Type); GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); +GET_CAST_FUNCTION(BINARY_CASES, BinaryType); GET_CAST_FUNCTION(STRING_CASES, StringType); GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); @@ -1307,6 +1390,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& CAST_FUNCTION_CASE(Time32Type); CAST_FUNCTION_CASE(Time64Type); CAST_FUNCTION_CASE(TimestampType); + CAST_FUNCTION_CASE(BinaryType); CAST_FUNCTION_CASE(StringType); 
CAST_FUNCTION_CASE(DictionaryType); case Type::LIST: diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h index 65c70bf14aa88..8c42f07bda7f1 100644 --- a/cpp/src/arrow/compute/kernels/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -38,12 +38,14 @@ struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false), - allow_float_truncate(false) {} + allow_float_truncate(false), + allow_invalid_utf8(false) {} explicit CastOptions(bool safe) : allow_int_overflow(!safe), allow_time_truncate(!safe), - allow_float_truncate(!safe) {} + allow_float_truncate(!safe), + allow_invalid_utf8(!safe) {} static CastOptions Safe() { return CastOptions(true); } @@ -52,6 +54,9 @@ struct ARROW_EXPORT CastOptions { bool allow_int_overflow; bool allow_time_truncate; bool allow_float_truncate; + // Indicate if conversions from Binary/FixedSizeBinary to string must + // validate the utf8 payload. + bool allow_invalid_utf8; }; /// \since 0.7.0 diff --git a/cpp/src/arrow/compute/kernels/hash-test.cc b/cpp/src/arrow/compute/kernels/hash-test.cc new file mode 100644 index 0000000000000..f20575f621b4c --- /dev/null +++ b/cpp/src/arrow/compute/kernels/hash-test.cc @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
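The new hash-test.cc below covers the two hash-kernel entry points, Unique and DictionaryEncode. A rough sketch of how they are invoked (the umbrella header and the helper function are assumptions for illustration, not part of the patch):

#include "arrow/api.h"
#include "arrow/compute/api.h"  // assumed umbrella header

arrow::Status HashExample(const std::shared_ptr<arrow::Array>& values) {
  arrow::compute::FunctionContext ctx(arrow::default_memory_pool());

  // Distinct non-null values, in order of first appearance.
  std::shared_ptr<arrow::Array> uniques;
  ARROW_RETURN_NOT_OK(arrow::compute::Unique(&ctx, values, &uniques));

  // Dictionary-encoded result, wrapped in a Datum.
  arrow::compute::Datum encoded;
  return arrow::compute::DictionaryEncode(&ctx, values, &encoded);
}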
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/test-common.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/hash.h" +#include "arrow/compute/kernels/util-internal.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +// ---------------------------------------------------------------------- +// Dictionary tests + +template +void CheckUnique(FunctionContext* ctx, const shared_ptr& type, + const vector& in_values, const vector& in_is_valid, + const vector& out_values, const vector& out_is_valid) { + shared_ptr input = _MakeArray(type, in_values, in_is_valid); + shared_ptr expected = _MakeArray(type, out_values, out_is_valid); + + shared_ptr result; + ASSERT_OK(Unique(ctx, input, &result)); + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +template +void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, + const vector& in_values, const vector& in_is_valid, + const vector& out_values, const vector& out_is_valid, + const vector& out_indices) { + shared_ptr input = _MakeArray(type, in_values, in_is_valid); + shared_ptr ex_dict = _MakeArray(type, out_values, out_is_valid); + shared_ptr ex_indices = + _MakeArray(int32(), out_indices, in_is_valid); + + DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); + + Datum datum_out; + ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); + shared_ptr result = MakeArray(datum_out.array()); + + ASSERT_ARRAYS_EQUAL(expected, *result); +} + +class TestHashKernel : public ComputeFixture, public TestBase {}; + +template +class TestHashKernelPrimitive : public ComputeFixture, public TestBase {}; + +typedef ::testing::Types + PrimitiveDictionaries; + +TYPED_TEST_CASE(TestHashKernelPrimitive, PrimitiveDictionaries); + +TYPED_TEST(TestHashKernelPrimitive, Unique) { + using T = typename TypeParam::c_type; + auto type = TypeTraits::type_singleton(); + CheckUnique(&this->ctx_, type, {2, 1, 2, 1}, {true, false, true, true}, + {2, 1}, {}); + CheckUnique(&this->ctx_, type, {2, 1, 3, 1}, {false, false, true, true}, + {3, 1}, {}); +} + +TYPED_TEST(TestHashKernelPrimitive, DictEncode) { + using T = typename TypeParam::c_type; + auto type = TypeTraits::type_singleton(); + CheckDictEncode(&this->ctx_, type, {2, 1, 2, 1, 2, 3}, + {true, false, true, true, true, true}, {2, 1, 3}, {}, + {0, 0, 0, 1, 0, 2}); +} + +TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { + using T = typename TypeParam::c_type; + // Skip this test for (u)int8 + if (sizeof(Scalar) == 1) { + return; + } + + const int64_t kTotalValues = 1000000; + const int64_t kRepeats = 5; + + vector values; + vector uniques; + vector indices; + for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { + const auto val = static_cast(i % kTotalValues); + values.push_back(val); + + if (i < kTotalValues) { + uniques.push_back(val); + } + indices.push_back(static_cast(i % kTotalValues)); + } + + auto type = TypeTraits::type_singleton(); + CheckUnique(&this->ctx_, type, values, {}, uniques, {}); + + CheckDictEncode(&this->ctx_, type, values, {}, uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueTimeTimestamp) { + 
CheckUnique(&this->ctx_, time32(TimeUnit::SECOND), {2, 1, 2, 1}, + {true, false, true, true}, {2, 1}, {}); + + CheckUnique(&this->ctx_, time64(TimeUnit::NANO), {2, 1, 2, 1}, + {true, false, true, true}, {2, 1}, {}); + + CheckUnique(&this->ctx_, timestamp(TimeUnit::NANO), + {2, 1, 2, 1}, {true, false, true, true}, {2, 1}, + {}); +} + +TEST_F(TestHashKernel, UniqueBoolean) { + CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, + {true, false, true, true}, {true, false}, {}); + + CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, + {true, false, true, true}, {false, true}, {}); + + // No nulls + CheckUnique(&this->ctx_, boolean(), {true, true, false, true}, {}, + {true, false}, {}); + + CheckUnique(&this->ctx_, boolean(), {false, true, false, true}, {}, + {false, true}, {}); +} + +TEST_F(TestHashKernel, DictEncodeBoolean) { + CheckDictEncode( + &this->ctx_, boolean(), {true, true, false, true, false}, + {true, false, true, true, true}, {true, false}, {}, {0, 0, 1, 0, 1}); + + CheckDictEncode( + &this->ctx_, boolean(), {false, true, false, true, false}, + {true, false, true, true, true}, {false, true}, {}, {0, 0, 0, 1, 0}); + + // No nulls + CheckDictEncode(&this->ctx_, boolean(), + {true, true, false, true, false}, {}, {true, false}, + {}, {0, 0, 1, 0, 1}); + + CheckDictEncode(&this->ctx_, boolean(), + {false, true, false, true, false}, {}, {false, true}, + {}, {0, 1, 0, 1, 0}); +} + +TEST_F(TestHashKernel, UniqueBinary) { + CheckUnique(&this->ctx_, binary(), + {"test", "", "test2", "test"}, + {true, false, true, true}, {"test", "test2"}, {}); + + CheckUnique(&this->ctx_, utf8(), {"test", "", "test2", "test"}, + {true, false, true, true}, {"test", "test2"}, {}); +} + +TEST_F(TestHashKernel, DictEncodeBinary) { + CheckDictEncode( + &this->ctx_, binary(), {"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); + + CheckDictEncode( + &this->ctx_, utf8(), {"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, BinaryResizeTable) { + const int32_t kTotalValues = 10000; +#if !defined(ARROW_VALGRIND) + const int32_t kRepeats = 10; +#else + // Mitigate Valgrind's slowness + const int32_t kRepeats = 3; +#endif + + vector values; + vector uniques; + vector indices; + char buf[20] = "test"; + + for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { + int32_t index = i % kTotalValues; + + ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); + values.emplace_back(buf); + + if (i < kTotalValues) { + uniques.push_back(values.back()); + } + indices.push_back(index); + } + + CheckUnique(&this->ctx_, binary(), values, {}, uniques, {}); + CheckDictEncode(&this->ctx_, binary(), values, {}, uniques, {}, + indices); + + CheckUnique(&this->ctx_, utf8(), values, {}, uniques, {}); + CheckDictEncode(&this->ctx_, utf8(), values, {}, uniques, {}, + indices); +} + +TEST_F(TestHashKernel, UniqueFixedSizeBinary) { + CheckUnique( + &this->ctx_, fixed_size_binary(5), {"aaaaa", "", "bbbbb", "aaaaa"}, + {true, false, true, true}, {"aaaaa", "bbbbb"}, {}); +} + +TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) { + CheckDictEncode( + &this->ctx_, fixed_size_binary(5), {"bbbbb", "", "bbbbb", "aaaaa", "ccccc"}, + {true, false, true, true, true}, {"bbbbb", "aaaaa", "ccccc"}, {}, {0, 0, 0, 1, 2}); +} + +TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) { + const int32_t kTotalValues = 10000; +#if !defined(ARROW_VALGRIND) + const int32_t 
kRepeats = 10; +#else + // Mitigate Valgrind's slowness + const int32_t kRepeats = 3; +#endif + + vector values; + vector uniques; + vector indices; + char buf[7] = "test.."; + + for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { + int32_t index = i % kTotalValues; + + buf[4] = static_cast(index / 128); + buf[5] = static_cast(index % 128); + values.emplace_back(buf, 6); + + if (i < kTotalValues) { + uniques.push_back(values.back()); + } + indices.push_back(index); + } + + auto type = fixed_size_binary(6); + CheckUnique(&this->ctx_, type, values, {}, uniques, + {}); + CheckDictEncode(&this->ctx_, type, values, {}, + uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueDecimal) { + vector values{12, 12, 11, 12}; + vector expected{12, 11}; + + CheckUnique(&this->ctx_, decimal(2, 0), values, + {true, false, true, true}, expected, {}); +} + +TEST_F(TestHashKernel, DictEncodeDecimal) { + vector values{12, 12, 11, 12, 13}; + vector expected{12, 11, 13}; + + CheckDictEncode(&this->ctx_, decimal(2, 0), values, + {true, false, true, true, true}, expected, + {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, ChunkedArrayInvoke) { + vector values1 = {"foo", "bar", "foo"}; + vector values2 = {"bar", "baz", "quuux", "foo"}; + + auto type = utf8(); + auto a1 = _MakeArray(type, values1, {}); + auto a2 = _MakeArray(type, values2, {}); + + vector dict_values = {"foo", "bar", "baz", "quuux"}; + auto ex_dict = _MakeArray(type, dict_values, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared(arrays); + + // Unique + shared_ptr result; + ASSERT_OK(Unique(&this->ctx_, carr, &result)); + ASSERT_ARRAYS_EQUAL(*ex_dict, *result); + + // Dictionary encode + auto dict_type = dictionary(int32(), ex_dict); + + auto i1 = _MakeArray(int32(), {0, 1, 0}, {}); + auto i2 = _MakeArray(int32(), {1, 2, 3, 0}, {}); + + ArrayVector dict_arrays = {std::make_shared(dict_type, i1), + std::make_shared(dict_type, i2)}; + auto dict_carr = std::make_shared(dict_arrays); + + Datum encoded_out; + ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); + + AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/util-internal.h b/cpp/src/arrow/compute/kernels/util-internal.h index 23ed4fd7ee7d7..d71e36d9c42b4 100644 --- a/cpp/src/arrow/compute/kernels/util-internal.h +++ b/cpp/src/arrow/compute/kernels/util-internal.h @@ -32,7 +32,9 @@ namespace compute { class FunctionContext; -static inline void CopyData(const ArrayData& input, ArrayData* output) { +// \brief Make a copy of the buffers into a destination array without carrying +// the type. +static inline void ZeroCopyData(const ArrayData& input, ArrayData* output) { output->length = input.length; output->null_count = input.null_count; output->buffers = input.buffers; diff --git a/cpp/src/arrow/compute/test-util.h b/cpp/src/arrow/compute/test-util.h new file mode 100644 index 0000000000000..e2bda698a9bff --- /dev/null +++ b/cpp/src/arrow/compute/test-util.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_TEST_UTIL_H +#define ARROW_COMPUTE_TEST_UTIL_H + +#include +#include + +#include "arrow/array.h" +#include "arrow/memory_pool.h" +#include "arrow/type.h" + +#include "arrow/compute/context.h" + +namespace arrow { +namespace compute { + +class ComputeFixture { + public: + ComputeFixture() : ctx_(default_memory_pool()) {} + + protected: + FunctionContext ctx_; +}; + +template +std::shared_ptr _MakeArray(const std::shared_ptr& type, + const std::vector& values, + const std::vector& is_valid) { + std::shared_ptr result; + if (is_valid.size() > 0) { + ArrayFromVector(type, is_valid, values, &result); + } else { + ArrayFromVector(type, values, &result); + } + return result; +} + +} // namespace compute +} // namespace arrow + +#endif diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index f5a18be05a92f..072c2188f7081 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -24,6 +24,7 @@ #include #include "arrow/util/macros.h" +#include "arrow/util/string_view.h" #include "arrow/util/visibility.h" namespace arrow { @@ -157,6 +158,13 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) { return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept); } +inline bool ValidateUTF8(const util::string_view& str) { + const uint8_t* data = reinterpret_cast(str.data()); + const size_t length = str.size(); + + return ValidateUTF8(data, length); +} + } // namespace util } // namespace arrow From 781e251a150ec52f3072188f2291ec4a70995ebf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 18 Dec 2018 10:53:16 -0600 Subject: [PATCH 27/80] ARROW-4055: [Python] Fails to convert pytz.utc with versions 2018.3 and earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Closes #3207 from kszucs/ARROW-4055 and squashes the following commits: 2edb3b219 fix import order 49b381fa4 hypothesis test c3d68b379 explicitly check against pytz.utc --- python/pyarrow/tests/test_convert_pandas.py | 21 +++++++++++++++------ python/pyarrow/types.pxi | 4 +++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 4d283b3150606..41bcae83db516 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -20,9 +20,13 @@ import decimal import json import multiprocessing as mp + from collections import OrderedDict from datetime import date, datetime, time, timedelta +import hypothesis as h +import hypothesis.extra.pytz as tzst +import hypothesis.strategies as st import numpy as np import numpy.testing as npt import pandas as pd @@ -31,9 +35,6 @@ import pytz import pyarrow as pa -import pyarrow.types as patypes -from pyarrow.compat import PY2 - from .pandas_examples import dataframe_with_arrays, dataframe_with_lists @@ -94,7 +95,7 @@ def _check_series_roundtrip(s, type_=None, expected_pa_type=None): assert arr.type == expected_pa_type result = pd.Series(arr.to_pandas(), name=s.name) - if 
patypes.is_timestamp(arr.type) and arr.type.tz is not None: + if pa.types.is_timestamp(arr.type) and arr.type.tz is not None: result = (result.dt.tz_localize('utc') .dt.tz_convert(arr.type.tz)) @@ -255,12 +256,14 @@ def test_string_column_index(self): column_indexes, = js['column_indexes'] assert column_indexes['name'] == 'stringz' assert column_indexes['name'] == column_indexes['field_name'] - assert column_indexes['pandas_type'] == ('bytes' if PY2 else 'unicode') assert column_indexes['numpy_type'] == 'object' + assert column_indexes['pandas_type'] == ( + 'bytes' if six.PY2 else 'unicode' + ) md = column_indexes['metadata'] - if not PY2: + if not six.PY2: assert len(md) == 1 assert md['encoding'] == 'UTF-8' else: @@ -840,6 +843,12 @@ def test_python_datetime_with_pytz_tzinfo(self): df = pd.DataFrame({'datetime': values}) _check_pandas_roundtrip(df) + @h.given(st.none() | tzst.timezones()) + def test_python_datetime_with_pytz_timezone(self, tz): + values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)] + df = pd.DataFrame({'datetime': values}) + _check_pandas_roundtrip(df) + @pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since ' 'python version 3.2') def test_python_datetime_with_timezone_tzinfo(self): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index f69190c1c2eaa..9ec36bff3a6fe 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1002,7 +1002,9 @@ def tzinfo_to_string(tz): raise ValueError('Offset must represent whole number of minutes') return '{}{:02d}:{:02d}'.format(sign, hours, minutes) - if isinstance(tz, pytz.tzinfo.BaseTzInfo): + if tz is pytz.utc: + return tz.zone # ARROW-4055 + elif isinstance(tz, pytz.tzinfo.BaseTzInfo): return tz.zone elif isinstance(tz, pytz._FixedOffset): return fixed_offset_to_string(tz) From 758bd557584107cb336cbc3422744dacd93978af Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 12:23:55 -0600 Subject: [PATCH 28/80] ARROW-2919: [C++/Python] Improve HdfsFile error messages, fix Python unit test suite This also resolves ARROW-3957 and ARROW-4053. Summary: * Properly initialize NativeFile when opening from HDFS. 
This was broken when the "closed" property was added and some other refactoring, and wasn't caught because these tests aren't being run regularly * Slightly improves the handling of filesystem URIs -- there were some tests that failed without these changes because the docker-compose HDFS containers don't allow writes from $USER * Improve error message when calling "info" on a file that does not exist * Improve error message when calling `ls` on a directory that does not exist * Suggest checking whether you are connecting to the right HDFS port when getting errno 255 Author: Wes McKinney Closes #3209 from wesm/ARROW-2919 and squashes the following commits: b11e5b665 Restore arrow_dependencies to Gandiva dependencies 20e8784f6 Code review comments 4ba93bbb1 More helpful error messages when GetPathInfo or ListDirectory fails due to non-existent file or bad port 3c67ea6f0 Basic fixes to get Python unit tests passing again --- cpp/src/arrow/io/hdfs-test.cc | 19 ++++++++++ cpp/src/arrow/io/hdfs.cc | 51 +++++++++++++++++++-------- cpp/src/gandiva/CMakeLists.txt | 2 +- python/pyarrow/filesystem.py | 4 +-- python/pyarrow/io-hdfs.pxi | 3 ++ python/pyarrow/parquet.py | 58 ++++++++++++++++++++----------- python/pyarrow/tests/test_hdfs.py | 13 ++++--- 7 files changed, 108 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/io/hdfs-test.cc b/cpp/src/arrow/io/hdfs-test.cc index c853b2012666e..08a7e13a1f8a2 100644 --- a/cpp/src/arrow/io/hdfs-test.cc +++ b/cpp/src/arrow/io/hdfs-test.cc @@ -257,6 +257,23 @@ TYPED_TEST(TestHadoopFileSystem, GetPathInfo) { ASSERT_EQ(size, info.size); } +TYPED_TEST(TestHadoopFileSystem, GetPathInfoNotExist) { + // ARROW-2919: Test that the error message is reasonable + SKIP_IF_NO_DRIVER(); + + ASSERT_OK(this->MakeScratchDir()); + auto path = this->ScratchPath("path-does-not-exist"); + + HdfsPathInfo info; + Status s = this->client_->GetPathInfo(path, &info); + ASSERT_TRUE(s.IsIOError()); + + const std::string error_message = s.ToString(); + + // Check that the file path is found in the error message + ASSERT_LT(error_message.find(path), std::string::npos); +} + TYPED_TEST(TestHadoopFileSystem, AppendToFile) { SKIP_IF_NO_DRIVER(); @@ -377,6 +394,8 @@ TYPED_TEST(TestHadoopFileSystem, LargeFile) { std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(path, &file)); + ASSERT_FALSE(file->closed()); + std::shared_ptr buffer; ASSERT_OK(AllocateBuffer(nullptr, size, &buffer)); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 6f01f75eec3c1..030b84853da60 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -43,14 +43,27 @@ using std::size_t; namespace arrow { namespace io { -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - std::stringstream ss; \ - ss << "HDFS " << WHAT << " failed, errno: " << errno << " (" << strerror(errno) \ - << ")"; \ - return Status::IOError(ss.str()); \ - } \ +namespace { + +std::string TranslateErrno(int error_code) { + std::stringstream ss; + ss << error_code << " (" << strerror(error_code) << ")"; + if (error_code == 255) { + // Unknown error can occur if the host is correct but the port is not + ss << " Please check that you are connecting to the correct HDFS RPC port"; + } + return ss.str(); +} + +} // namespace + +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + std::stringstream ss; \ + ss << "HDFS " << WHAT << " failed, errno: " << TranslateErrno(errno); \ + return Status::IOError(ss.str()); \ + } \ } while (0) static constexpr 
int kDefaultHdfsBufferSize = 1 << 16; @@ -99,6 +112,16 @@ class HdfsAnyFileImpl { bool is_open_; }; +namespace { + +Status GetPathInfoFailed(const std::string& path) { + std::stringstream ss; + ss << "Calling GetPathInfo for " << path << " failed. errno: " << TranslateErrno(errno); + return Status::IOError(ss.str()); +} + +} // namespace + // Private implementation for read-only files class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { public: @@ -180,7 +203,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { Status GetSize(int64_t* size) { hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path_.c_str()); if (entry == nullptr) { - return Status::IOError("HDFS: GetPathInfo failed"); + return GetPathInfoFailed(path_); } *size = entry->mSize; @@ -204,7 +227,7 @@ HdfsReadableFile::HdfsReadableFile(MemoryPool* pool) { impl_.reset(new HdfsReadableFileImpl(pool)); } -HdfsReadableFile::~HdfsReadableFile() { DCHECK(impl_->Close().ok()); } +HdfsReadableFile::~HdfsReadableFile() { DCHECK_OK(impl_->Close()); } Status HdfsReadableFile::Close() { return impl_->Close(); } @@ -272,7 +295,7 @@ class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { HdfsOutputStream::HdfsOutputStream() { impl_.reset(new HdfsOutputStreamImpl()); } -HdfsOutputStream::~HdfsOutputStream() { DCHECK(impl_->Close().ok()); } +HdfsOutputStream::~HdfsOutputStream() { DCHECK_OK(impl_->Close()); } Status HdfsOutputStream::Close() { return impl_->Close(); } @@ -399,7 +422,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path.c_str()); if (entry == nullptr) { - return Status::IOError("HDFS: GetPathInfo failed"); + return GetPathInfoFailed(path); } SetPathInfo(entry, info); @@ -444,8 +467,8 @@ class HadoopFileSystem::HadoopFileSystemImpl { num_entries = 0; } else { std::stringstream ss; - ss << "HDFS list directory failed, errno: " << errno << " (" << strerror(errno) - << ")"; + ss << "HDFS list directory of " << path + << " failed, errno: " << TranslateErrno(errno); return Status::IOError(ss.str()); } } diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 8052db5e8545d..23ad93e201e71 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -83,7 +83,7 @@ endif() ADD_ARROW_LIB(gandiva SOURCES ${SRC_FILES} OUTPUTS GANDIVA_LIBRARIES - DEPENDENCIES precompiled + DEPENDENCIES arrow_dependencies precompiled EXTRA_INCLUDES $ SHARED_LINK_LIBS arrow_shared diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index 8188a2607e21a..98efb1e3ec374 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -390,7 +390,7 @@ def _ensure_filesystem(fs): return fs -def _get_fs_from_path(path): +def get_filesystem_from_uri(path): """ return filesystem from path which could be an HDFS URI """ @@ -411,4 +411,4 @@ def _get_fs_from_path(path): else: fs = LocalFileSystem.get_instance() - return fs + return fs, parsed_uri.path diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi index e7a322ea469bb..d93bd790eaa1e 100644 --- a/python/pyarrow/io-hdfs.pxi +++ b/python/pyarrow/io-hdfs.pxi @@ -433,6 +433,9 @@ cdef class HadoopFileSystem: out.set_random_access_file( rd_handle) + out.is_readable = True + + assert not out.closed if c_buffer_size == 0: c_buffer_size = 2 ** 16 diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index feaa890fc6cd9..a520acece972e 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ 
-18,6 +18,7 @@ from collections import defaultdict from concurrent import futures +from six.moves.urllib.parse import urlparse import json import numpy as np import os @@ -34,10 +35,24 @@ ParquetSchema, ColumnSchema) from pyarrow.compat import guid from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem, - _get_fs_from_path) + get_filesystem_from_uri) from pyarrow.util import _is_path_like, _stringify_path +def _parse_uri(path): + path = _stringify_path(path) + return urlparse(path).path + + +def _get_filesystem_and_path(passed_filesystem, path): + if passed_filesystem is None: + return get_filesystem_from_uri(path) + else: + passed_filesystem = _ensure_filesystem(passed_filesystem) + parsed_path = _parse_uri(path) + return passed_filesystem, parsed_path + + def _check_contains_null(val): if isinstance(val, six.binary_type): for byte in val: @@ -316,7 +331,8 @@ def __init__(self, where, schema, flavor=None, version='1.0', use_dictionary=True, compression='snappy', - use_deprecated_int96_timestamps=None, **options): + use_deprecated_int96_timestamps=None, + filesystem=None, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark if flavor is not None and 'spark' in flavor: @@ -338,8 +354,8 @@ def __init__(self, where, schema, flavor=None, self.file_handle = None if _is_path_like(where): - fs = _get_fs_from_path(where) - sink = self.file_handle = fs.open(where, 'wb') + fs, path = _get_filesystem_and_path(filesystem, where) + sink = self.file_handle = fs.open(path, 'wb') else: sink = where @@ -681,7 +697,8 @@ class ParquetManifest(object): """ def __init__(self, dirpath, filesystem=None, pathsep='/', partition_scheme='hive', metadata_nthreads=1): - self.filesystem = filesystem or _get_fs_from_path(dirpath) + filesystem, dirpath = _get_filesystem_and_path(filesystem, dirpath) + self.filesystem = filesystem self.pathsep = pathsep self.dirpath = _stringify_path(dirpath) self.partition_scheme = partition_scheme @@ -845,15 +862,15 @@ class ParquetDataset(object): def __init__(self, path_or_paths, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1): - if filesystem is None: - a_path = path_or_paths - if isinstance(a_path, list): - a_path = a_path[0] - self.fs = _get_fs_from_path(a_path) - else: - self.fs = _ensure_filesystem(filesystem) + a_path = path_or_paths + if isinstance(a_path, list): + a_path = a_path[0] - self.paths = path_or_paths + self.fs, _ = _get_filesystem_and_path(filesystem, a_path) + if isinstance(path_or_paths, list): + self.paths = [_parse_uri(path) for path in path_or_paths] + else: + self.paths = _parse_uri(path_or_paths) (self.pieces, self.partitions, @@ -1070,10 +1087,11 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1): def read_table(source, columns=None, use_threads=True, metadata=None, - use_pandas_metadata=False, memory_map=True): + use_pandas_metadata=False, memory_map=True, + filesystem=None): if _is_path_like(source): - fs = _get_fs_from_path(source) - return fs.read_parquet(source, columns=columns, + fs, path = _get_filesystem_and_path(filesystem, source) + return fs.read_parquet(path, columns=columns, use_threads=use_threads, metadata=metadata, use_pandas_metadata=use_pandas_metadata) @@ -1113,12 +1131,13 @@ def write_table(table, where, row_group_size=None, version='1.0', use_deprecated_int96_timestamps=None, coerce_timestamps=None, allow_truncated_timestamps=False, - flavor=None, **kwargs): + flavor=None, filesystem=None, 
**kwargs): row_group_size = kwargs.pop('chunk_size', row_group_size) use_int96 = use_deprecated_int96_timestamps try: with ParquetWriter( where, table.schema, + filesystem=filesystem, version=version, flavor=flavor, use_dictionary=use_dictionary, @@ -1192,10 +1211,7 @@ def write_to_dataset(table, root_path, partition_cols=None, Parameter for instantiating Table; preserve pandas index or not. **kwargs : dict, kwargs for write_table function. """ - if filesystem is None: - fs = _get_fs_from_path(root_path) - else: - fs = _ensure_filesystem(filesystem) + fs, root_path = _get_filesystem_and_path(filesystem, root_path) _mkdir_if_not_exists(fs, root_path) diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index f218a1604a9d9..1af841f2ecbb1 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -216,7 +216,7 @@ def test_ls(self): self.hdfs.mkdir(dir_path) f = self.hdfs.open(f1_path, 'wb') - f.write('a' * 10) + f.write(b'a' * 10) contents = sorted(self.hdfs.ls(base_path, False)) assert contents == [dir_path, f1_path] @@ -341,9 +341,9 @@ def test_read_write_parquet_files_with_uri(self): df['uint32'] = df['uint32'].astype(np.int64) table = pa.Table.from_pandas(df, preserve_index=False) - pq.write_table(table, path) + pq.write_table(table, path, filesystem=self.hdfs) - result = pq.read_table(path).to_pandas() + result = pq.read_table(path, filesystem=self.hdfs).to_pandas() pdt.assert_frame_equal(result, df) @@ -380,7 +380,7 @@ def check_driver(cls): def test_orphaned_file(self): hdfs = hdfs_test_client() file_path = self._make_test_file(hdfs, 'orphaned_file_test', 'fname', - 'foobarbaz') + b'foobarbaz') f = hdfs.open(file_path) hdfs = None @@ -413,6 +413,11 @@ def _get_hdfs_uri(path): @pytest.mark.fastparquet @pytest.mark.parametrize('client', ['libhdfs', 'libhdfs3']) def test_fastparquet_read_with_hdfs(client): + try: + import snappy # noqa + except ImportError: + pytest.skip('fastparquet test requires snappy') + import pyarrow.parquet as pq fastparquet = pytest.importorskip('fastparquet') From 1a5991c99ef9092f439bed7e0bcf707a7247b419 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 18:44:40 -0600 Subject: [PATCH 29/80] ARROW-4069: [Python] Add tests for casting binary -> string/utf8. Add pyarrow.utf8() type factory alias for readability This is the Python side of ARROW-3387 to make sure all is in order there Author: Wes McKinney Closes #3215 from wesm/ARROW-4069 and squashes the following commits: eaf0cf403 Add tests for casting binary -> string/utf8. 
Add pyarrow.utf8() alias for pyarrow.string() for readability --- docs/source/python/api.rst | 1 + python/pyarrow/__init__.py | 2 +- python/pyarrow/tests/test_array.py | 20 ++++++++++++++++++++ python/pyarrow/types.pxi | 7 +++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index 06863964978b3..064a3e9740543 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -50,6 +50,7 @@ Type and Schema Factory Functions date64 binary string + utf8 decimal128 list_ struct diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 63ed53e0ebab5..3121db68b9322 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -57,7 +57,7 @@ def parse_git(root, **kwargs): uint8, uint16, uint32, uint64, time32, time64, timestamp, date32, date64, float16, float32, float64, - binary, string, decimal128, + binary, string, utf8, decimal128, list_, struct, union, dictionary, field, type_for_alias, DataType, diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f9bd06ee04ef7..95a60435e3460 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -725,6 +725,26 @@ def test_cast_date32_to_int(): assert result2.equals(arr) +def test_cast_binary_to_utf8(): + binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary()) + utf8_arr = binary_arr.cast(pa.utf8()) + expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) + + assert utf8_arr.equals(expected) + + non_utf8_values = [(u'mañana').encode('utf-16-le')] + non_utf8_binary = pa.array(non_utf8_values) + assert non_utf8_binary.type == pa.binary() + with pytest.raises(ValueError): + non_utf8_binary.cast(pa.string()) + + non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]), + type=pa.binary()) + # No error + casted = non_utf8_all_null.cast(pa.string()) + assert casted.null_count == 1 + + def test_cast_date64_to_int(): arr = pa.array(np.array([0, 1, 2], dtype='int64'), type=pa.date64()) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 9ec36bff3a6fe..d367a8a85673f 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1237,6 +1237,13 @@ def string(): return primitive_type(_Type_STRING) +def utf8(): + """ + Alias for string() + """ + return string() + + def binary(int length=-1): """ Create variable-length binary type From bfa7f11cffa58dcf44f7e1278846e373e63d1dfe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 18:56:24 -0600 Subject: [PATCH 30/80] ARROW-4070: [C++] Enable use of ARROW_BOOST_VENDORED with ninja-build It seems that ninja-build is a lot stricter about the dependency graph -- it seeks the root dependency of the `boost_*_static` libraries and finds targets (the absolute paths to the static libraries) that it doesn't know how to build. Setting these as the BUILD_BYPRODUCTS of the ExternalProject fixes the issue. 
I need this fix in ARROW-3803 so I'm going to cherry pick it there, and I can rebase later Author: Wes McKinney Closes #3217 from wesm/ARROW-4070 and squashes the following commits: aac135daa Use static library paths as BOOST_BUILD_PRODUCTS so that ninja-build can understand the dependency graph --- cpp/CMakeLists.txt | 3 ++- cpp/cmake_modules/ThirdpartyToolchain.cmake | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 60cbe85d10b6d..1672245924fb5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -239,7 +239,8 @@ static|shared (default shared)") ON) option(ARROW_BOOST_VENDORED - "Use vendored Boost instead of existing Boost" + "Use vendored Boost instead of existing Boost. \ +Note that this requires linking Boost statically" OFF) option(ARROW_PROTOBUF_USE_SHARED diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index d493de75a55f5..db0b69be460ce 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -369,15 +369,16 @@ if (ARROW_BOOST_VENDORED) set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) set(BOOST_REGEX_LIBRARY boost_regex_static) + if (ARROW_BOOST_HEADER_ONLY) set(BOOST_BUILD_PRODUCTS) set(BOOST_CONFIGURE_COMMAND "") set(BOOST_BUILD_COMMAND "") else() set(BOOST_BUILD_PRODUCTS - ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_REGEX_LIBRARY}) + ${BOOST_STATIC_SYSTEM_LIBRARY} + ${BOOST_STATIC_FILESYSTEM_LIBRARY} + ${BOOST_STATIC_REGEX_LIBRARY}) set(BOOST_CONFIGURE_COMMAND "./bootstrap.sh" "--prefix=${BOOST_PREFIX}" From 25b6a6c2c85c6afde2453459fd13ae00aa692028 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Dec 2018 19:45:27 -0600 Subject: [PATCH 31/80] ARROW-4073: [Python] Fix URI parsing on Windows. Also fix test for get_library_dirs when using ARROW_HOME to develop Resolves ARROW-4074 Author: Wes McKinney Closes #3218 from wesm/ARROW-4073 and squashes the following commits: 683b68fda lint 5f2c3404b Fix URI parsing on Windows. 
Also fix ARROW-4074 where windows .lib files are installed in ARROW_HOME and not the usual conda/pip locations --- python/pyarrow/__init__.py | 4 ++++ python/pyarrow/parquet.py | 10 +++++++++- python/pyarrow/tests/test_misc.py | 6 ++---- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 3121db68b9322..7f0a371b4bfd2 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -231,4 +231,8 @@ def get_library_dirs(): if _os.path.exists(_os.path.join(library_lib, 'arrow.lib')): library_dirs.append(library_lib) + # ARROW-4074: Allow for ARROW_HOME to be set to some other directory + if 'ARROW_HOME' in _os.environ: + library_dirs.append(_os.path.join(_os.environ['ARROW_HOME'], 'lib')) + return library_dirs diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index a520acece972e..b8dae65a5de78 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -38,10 +38,18 @@ get_filesystem_from_uri) from pyarrow.util import _is_path_like, _stringify_path +_URI_STRIP_SCHEMES = ('hdfs',) + def _parse_uri(path): path = _stringify_path(path) - return urlparse(path).path + parsed_uri = urlparse(path) + if parsed_uri.scheme in _URI_STRIP_SCHEMES: + return parsed_uri.path + else: + # ARROW-4073: On Windows returning the path with the scheme + # stripped removes the drive letter, if any + return path def _get_filesystem_and_path(passed_filesystem, path): diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 1c384f35d72b0..f7c316a8bafcd 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -28,10 +28,8 @@ def test_get_include(): @pytest.mark.skipif('sys.platform != "win32"') def test_get_library_dirs_win32(): - library_dirs = pa.get_library_dirs() - - library_lib = library_dirs[-1] - assert os.path.exists(os.path.join(library_lib, 'arrow.lib')) + assert any(os.path.exists(os.path.join(directory, 'arrow.lib')) + for directory in pa.get_library_dirs()) def test_cpu_count(): From 944b9e319a5f208c0fc45953d1f10972b1433020 Mon Sep 17 00:00:00 2001 From: Yosuke Shiro Date: Wed, 19 Dec 2018 11:39:09 +0900 Subject: [PATCH 32/80] ARROW-4051: [Gandiva] [GLib] Add support for null literal - Add `#GGandivaNullLiteralNode`. - Remove `return_type` property in `#GGandivaFunctionNode` to use `ggandiva_node_get_return_type()`. 
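
For readers who have not used the GLib bindings before, a minimal C sketch of how the new null-literal API could be exercised is shown below. This sketch is not part of the patch: the <gandiva-glib/gandiva-glib.h> umbrella header, the garrow_boolean_data_type_new() constructor, and the build setup (pkg-config packages such as gandiva-glib) are assumptions based on the usual arrow-glib conventions.

/*
 * Minimal usage sketch (hypothetical, not part of this patch).
 * Assumes the <gandiva-glib/gandiva-glib.h> umbrella header and the
 * arrow-glib garrow_boolean_data_type_new() constructor.
 */
#include <gandiva-glib/gandiva-glib.h>

int
main(void)
{
  GError *error = NULL;

  /* A null literal needs an explicit return type, here boolean. */
  GArrowBooleanDataType *boolean_type = garrow_boolean_data_type_new();
  GGandivaNullLiteralNode *node =
    ggandiva_null_literal_node_new(GARROW_DATA_TYPE(boolean_type), &error);
  if (!node) {
    g_print("failed to create null literal node: %s\n", error->message);
    g_error_free(error);
    g_object_unref(boolean_type);
    return 1;
  }

  /* The return type is exposed through the "return-type" property that this
   * patch moves from GGandivaFunctionNode onto GGandivaNode itself. */
  GArrowDataType *return_type = NULL;
  g_object_get(node, "return-type", &return_type, NULL);
  g_print("return type: %s\n", G_OBJECT_TYPE_NAME(return_type));

  g_object_unref(return_type);
  g_object_unref(node);
  g_object_unref(boolean_type);
  return 0;
}

Reading the type back through g_object_get() reflects the design choice described above: the "return-type" property now lives on every GGandivaNode rather than only on GGandivaFunctionNode, so field nodes and literal nodes report their Arrow data type the same way.
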
Author: Yosuke Shiro Author: Kouhei Sutou Closes #3197 from shiro615/glib-support-null-literal-node and squashes the following commits: 4f0a39f7 Fix a typo e93cd085 Simplify be30df17 Add tests for return-type of literal nodes 8b3244e7 Use data type from GArrowField as return type fcc0d8f8 Raise error for invalid input 41aada5e Fix variable names 415df817 Add return-type property in ggandiva_field_node_new_raw() c337f122 Call g_object_unref(data_type) to prevent a memory leak 64cef1d8 Use the given return_type to create GGandivaNullLiteralNode b17f5e25 Reuse return-type property instead of defining ggandiva_node_get_return_type() 3e25b0d5 Refactor null check 0ac03b4d Add missing null check 285f64b8 Fix orders of class 49d1044a Remove return_type property for using ggandiva_node_get_return_type() f78881cf Add ggandiva_node_get_return_type() 5896f0bb Add GGandivaNullLiteralNode --- c_glib/gandiva-glib/node.cpp | 292 ++++++++++++------ c_glib/gandiva-glib/node.h | 17 + c_glib/gandiva-glib/node.hpp | 3 +- .../test/gandiva/test-binary-literal-node.rb | 27 +- .../test/gandiva/test-boolean-literal-node.rb | 10 +- .../test/gandiva/test-double-literal-node.rb | 10 +- c_glib/test/gandiva/test-field-node.rb | 10 +- .../test/gandiva/test-float-literal-node.rb | 14 +- .../test/gandiva/test-int16-literal-node.rb | 10 +- .../test/gandiva/test-int32-literal-node.rb | 10 +- .../test/gandiva/test-int64-literal-node.rb | 10 +- c_glib/test/gandiva/test-int8-literal-node.rb | 10 +- c_glib/test/gandiva/test-null-literal-node.rb | 38 +++ .../test/gandiva/test-string-literal-node.rb | 10 +- .../test/gandiva/test-uint16-literal-node.rb | 10 +- .../test/gandiva/test-uint32-literal-node.rb | 10 +- .../test/gandiva/test-uint64-literal-node.rb | 10 +- .../test/gandiva/test-uint8-literal-node.rb | 10 +- 18 files changed, 372 insertions(+), 139 deletions(-) create mode 100644 c_glib/test/gandiva/test-null-literal-node.rb diff --git a/c_glib/gandiva-glib/node.cpp b/c_glib/gandiva-glib/node.cpp index cdb9724d7ebbf..709836524d848 100644 --- a/c_glib/gandiva-glib/node.cpp +++ b/c_glib/gandiva-glib/node.cpp @@ -22,6 +22,7 @@ #endif #include +#include #include #include @@ -52,6 +53,9 @@ G_BEGIN_DECLS * #GGandivaLiteralNode is a base class for a node in the expression tree, * representing a literal. * + * #GGandivaNullLiteralNode is a class for a node in the expression tree, + * representing a null literal. + * * #GGandivaBooleanLiteralNode is a class for a node in the expression tree, * representing a boolean literal. 
* @@ -96,10 +100,12 @@ G_BEGIN_DECLS typedef struct GGandivaNodePrivate_ { std::shared_ptr node; + GArrowDataType *return_type; } GGandivaNodePrivate; enum { - PROP_NODE = 1 + PROP_NODE = 1, + PROP_RETURN_TYPE }; G_DEFINE_TYPE_WITH_PRIVATE(GGandivaNode, @@ -111,6 +117,19 @@ G_DEFINE_TYPE_WITH_PRIVATE(GGandivaNode, ggandiva_node_get_instance_private( \ GGANDIVA_NODE(object))) +static void +ggandiva_node_dispose(GObject *object) +{ + auto priv = GGANDIVA_NODE_GET_PRIVATE(object); + + if (priv->return_type) { + g_object_unref(priv->return_type); + priv->return_type = nullptr; + } + + G_OBJECT_CLASS(ggandiva_node_parent_class)->dispose(object); +} + static void ggandiva_node_finalize(GObject *object) { @@ -134,6 +153,27 @@ ggandiva_node_set_property(GObject *object, priv->node = *static_cast *>(g_value_get_pointer(value)); break; + case PROP_RETURN_TYPE: + priv->return_type = GARROW_DATA_TYPE(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +ggandiva_node_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GGANDIVA_NODE_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_RETURN_TYPE: + g_value_set_object(value, priv->return_type); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -148,19 +188,28 @@ ggandiva_node_init(GGandivaNode *object) static void ggandiva_node_class_init(GGandivaNodeClass *klass) { - GParamSpec *spec; - auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = ggandiva_node_dispose; gobject_class->finalize = ggandiva_node_finalize; gobject_class->set_property = ggandiva_node_set_property; + gobject_class->get_property = ggandiva_node_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("node", "Node", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_NODE, spec); + + spec = g_param_spec_object("return-type", + "Return type", + "The return type", + GARROW_TYPE_DATA_TYPE, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_RETURN_TYPE, spec); } @@ -274,12 +323,10 @@ ggandiva_field_node_new(GArrowField *field) typedef struct GGandivaFunctionNodePrivate_ { gchar *name; GList *parameters; - GArrowDataType *return_type; } GGandivaFunctionNodePrivate; enum { - PROP_NAME = 1, - PROP_RETURN_TYPE + PROP_NAME = 1 }; G_DEFINE_TYPE_WITH_PRIVATE(GGandivaFunctionNode, @@ -305,11 +352,6 @@ ggandiva_function_node_dispose(GObject *object) priv->parameters = nullptr; } - if (priv->return_type) { - g_object_unref(priv->return_type); - priv->return_type = nullptr; - } - G_OBJECT_CLASS(ggandiva_function_node_parent_class)->dispose(object); } @@ -335,9 +377,6 @@ ggandiva_function_node_set_property(GObject *object, case PROP_NAME: priv->name = g_value_dup_string(value); break; - case PROP_RETURN_TYPE: - priv->return_type = GARROW_DATA_TYPE(g_value_dup_object(value)); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -356,9 +395,6 @@ ggandiva_function_node_get_property(GObject *object, case PROP_NAME: g_value_set_string(value, priv->name); break; - case PROP_RETURN_TYPE: - g_value_set_object(value, priv->return_type); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -390,14 +426,6 @@ ggandiva_function_node_class_init(GGandivaFunctionNodeClass *klass) 
static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_NAME, spec); - - spec = g_param_spec_object("return-type", - "Return type", - "The return type of the function", - GARROW_TYPE_DATA_TYPE, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RETURN_TYPE, spec); } /** @@ -462,6 +490,50 @@ ggandiva_literal_node_class_init(GGandivaLiteralNodeClass *klass) } +G_DEFINE_TYPE(GGandivaNullLiteralNode, + ggandiva_null_literal_node, + GGANDIVA_TYPE_LITERAL_NODE) + +static void +ggandiva_null_literal_node_init(GGandivaNullLiteralNode *null_literal_node) +{ +} + +static void +ggandiva_null_literal_node_class_init(GGandivaNullLiteralNodeClass *klass) +{ +} + +/** + * ggandiva_null_literal_node_new: + * @return_type: A #GArrowDataType. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GGandivaNullLiteralNode for + * the type or %NULL on error. + * + * Since: 0.12.0 + */ +GGandivaNullLiteralNode * +ggandiva_null_literal_node_new(GArrowDataType *return_type, + GError **error) +{ + auto arrow_return_type = garrow_data_type_get_raw(return_type); + auto gandiva_node = gandiva::TreeExprBuilder::MakeNull(arrow_return_type); + if (!gandiva_node) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "[gandiva][null-literal-node][new] " + "failed to create: <%s>", + arrow_return_type->ToString().c_str()); + return NULL; + } + return GGANDIVA_NULL_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + return_type)); +} + + G_DEFINE_TYPE(GGandivaBooleanLiteralNode, ggandiva_boolean_literal_node, GGANDIVA_TYPE_LITERAL_NODE) @@ -488,7 +560,8 @@ GGandivaBooleanLiteralNode * ggandiva_boolean_literal_node_new(gboolean value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(static_cast(value)); - return GGANDIVA_BOOLEAN_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_BOOLEAN_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -533,7 +606,8 @@ GGandivaInt8LiteralNode * ggandiva_int8_literal_node_new(gint8 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_INT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_INT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -577,7 +651,8 @@ GGandivaUInt8LiteralNode * ggandiva_uint8_literal_node_new(guint8 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_UINT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_UINT8_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -621,7 +696,8 @@ GGandivaInt16LiteralNode * ggandiva_int16_literal_node_new(gint16 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_INT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_INT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -665,7 +741,8 @@ GGandivaUInt16LiteralNode * ggandiva_uint16_literal_node_new(guint16 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_UINT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_UINT16_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -709,7 +786,8 @@ GGandivaInt32LiteralNode * ggandiva_int32_literal_node_new(gint32 value) { 
auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_INT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_INT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -753,7 +831,8 @@ GGandivaUInt32LiteralNode * ggandiva_uint32_literal_node_new(guint32 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_UINT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_UINT32_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -797,7 +876,8 @@ GGandivaInt64LiteralNode * ggandiva_int64_literal_node_new(gint64 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_INT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -841,7 +921,8 @@ GGandivaUInt64LiteralNode * ggandiva_uint64_literal_node_new(guint64 value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_UINT64_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -885,7 +966,8 @@ GGandivaFloatLiteralNode * ggandiva_float_literal_node_new(gfloat value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_FLOAT_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_FLOAT_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -929,7 +1011,8 @@ GGandivaDoubleLiteralNode * ggandiva_double_literal_node_new(gdouble value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeLiteral(value); - return GGANDIVA_DOUBLE_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_DOUBLE_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -1002,7 +1085,8 @@ ggandiva_binary_literal_node_new(const guint8 *value, auto gandiva_node = gandiva::TreeExprBuilder::MakeBinaryLiteral(std::string(reinterpret_cast(value), size)); - return GGANDIVA_BINARY_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_BINARY_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -1022,7 +1106,8 @@ ggandiva_binary_literal_node_new_bytes(GBytes *value) gandiva::TreeExprBuilder::MakeBinaryLiteral( std::string(reinterpret_cast(raw_value), value_size)); - auto literal_node = ggandiva_literal_node_new_raw(&gandiva_node); + auto literal_node = ggandiva_literal_node_new_raw(&gandiva_node, + NULL); auto priv = GGANDIVA_BINARY_LITERAL_NODE_GET_PRIVATE(literal_node); priv->value = value; g_bytes_ref(priv->value); @@ -1076,7 +1161,8 @@ GGandivaStringLiteralNode * ggandiva_string_literal_node_new(const gchar *value) { auto gandiva_node = gandiva::TreeExprBuilder::MakeStringLiteral(value); - return GGANDIVA_STRING_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node)); + return GGANDIVA_STRING_LITERAL_NODE(ggandiva_literal_node_new_raw(&gandiva_node, + NULL)); } /** @@ -1107,10 +1193,14 @@ GGandivaFieldNode * ggandiva_field_node_new_raw(std::shared_ptr *gandiva_node, GArrowField *field) { + auto arrow_return_type = (*gandiva_node)->return_type(); + auto return_type = garrow_field_get_data_type(field); auto field_node = g_object_new(GGANDIVA_TYPE_FIELD_NODE, "node", gandiva_node, "field", field, + "return-type", return_type, NULL); + g_object_unref(return_type); return 
GGANDIVA_FIELD_NODE(field_node); } @@ -1135,56 +1225,84 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, } GGandivaLiteralNode * -ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node) +ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node, + GArrowDataType *return_type) { - GType type; + auto gandiva_literal_node = + std::static_pointer_cast(*gandiva_node); + + GGandivaLiteralNode *literal_node; + if (gandiva_literal_node->is_null()) { + literal_node = + GGANDIVA_LITERAL_NODE(g_object_new(GGANDIVA_TYPE_NULL_LITERAL_NODE, + "node", gandiva_node, + "return-type", return_type, + NULL)); + } else { + GType type; + + auto arrow_return_type = gandiva_literal_node->return_type(); + switch (arrow_return_type->id()) { + case arrow::Type::BOOL: + type = GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE; + break; + case arrow::Type::type::UINT8: + type = GGANDIVA_TYPE_UINT8_LITERAL_NODE; + break; + case arrow::Type::type::UINT16: + type = GGANDIVA_TYPE_UINT16_LITERAL_NODE; + break; + case arrow::Type::type::UINT32: + type = GGANDIVA_TYPE_UINT32_LITERAL_NODE; + break; + case arrow::Type::type::UINT64: + type = GGANDIVA_TYPE_UINT64_LITERAL_NODE; + break; + case arrow::Type::type::INT8: + type = GGANDIVA_TYPE_INT8_LITERAL_NODE; + break; + case arrow::Type::type::INT16: + type = GGANDIVA_TYPE_INT16_LITERAL_NODE; + break; + case arrow::Type::type::INT32: + type = GGANDIVA_TYPE_INT32_LITERAL_NODE; + break; + case arrow::Type::type::INT64: + type = GGANDIVA_TYPE_INT64_LITERAL_NODE; + break; + case arrow::Type::type::FLOAT: + type = GGANDIVA_TYPE_FLOAT_LITERAL_NODE; + break; + case arrow::Type::type::DOUBLE: + type = GGANDIVA_TYPE_DOUBLE_LITERAL_NODE; + break; + case arrow::Type::type::STRING: + type = GGANDIVA_TYPE_STRING_LITERAL_NODE; + break; + case arrow::Type::type::BINARY: + type = GGANDIVA_TYPE_BINARY_LITERAL_NODE; + break; + default: + type = GGANDIVA_TYPE_LITERAL_NODE; + break; + } - switch ((*gandiva_node)->return_type()->id()) { - case arrow::Type::BOOL: - type = GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE; - break; - case arrow::Type::type::UINT8: - type = GGANDIVA_TYPE_UINT8_LITERAL_NODE; - break; - case arrow::Type::type::UINT16: - type = GGANDIVA_TYPE_UINT16_LITERAL_NODE; - break; - case arrow::Type::type::UINT32: - type = GGANDIVA_TYPE_UINT32_LITERAL_NODE; - break; - case arrow::Type::type::UINT64: - type = GGANDIVA_TYPE_UINT64_LITERAL_NODE; - break; - case arrow::Type::type::INT8: - type = GGANDIVA_TYPE_INT8_LITERAL_NODE; - break; - case arrow::Type::type::INT16: - type = GGANDIVA_TYPE_INT16_LITERAL_NODE; - break; - case arrow::Type::type::INT32: - type = GGANDIVA_TYPE_INT32_LITERAL_NODE; - break; - case arrow::Type::type::INT64: - type = GGANDIVA_TYPE_INT64_LITERAL_NODE; - break; - case arrow::Type::type::FLOAT: - type = GGANDIVA_TYPE_FLOAT_LITERAL_NODE; - break; - case arrow::Type::type::DOUBLE: - type = GGANDIVA_TYPE_DOUBLE_LITERAL_NODE; - break; - case arrow::Type::type::STRING: - type = GGANDIVA_TYPE_STRING_LITERAL_NODE; - break; - case arrow::Type::type::BINARY: - type = GGANDIVA_TYPE_BINARY_LITERAL_NODE; - break; - default: - type = GGANDIVA_TYPE_LITERAL_NODE; - break; + if (return_type) { + literal_node = + GGANDIVA_LITERAL_NODE(g_object_new(type, + "node", gandiva_node, + "return-type", return_type, + NULL)); + } else { + return_type = garrow_data_type_new_raw(&arrow_return_type); + literal_node = + GGANDIVA_LITERAL_NODE(g_object_new(type, + "node", gandiva_node, + "return-type", return_type, + NULL)); + g_object_unref(return_type); + } } - auto literal_node = 
GGANDIVA_LITERAL_NODE(g_object_new(type, - "node", gandiva_node, - NULL)); + return literal_node; } diff --git a/c_glib/gandiva-glib/node.h b/c_glib/gandiva-glib/node.h index 183003fd9f68a..d9e67e27b7eea 100644 --- a/c_glib/gandiva-glib/node.h +++ b/c_glib/gandiva-glib/node.h @@ -35,6 +35,7 @@ struct _GGandivaNodeClass GObjectClass parent_class; }; + #define GGANDIVA_TYPE_FIELD_NODE (ggandiva_field_node_get_type()) G_DECLARE_DERIVABLE_TYPE(GGandivaFieldNode, ggandiva_field_node, @@ -80,6 +81,22 @@ struct _GGandivaLiteralNodeClass }; +#define GGANDIVA_TYPE_NULL_LITERAL_NODE (ggandiva_null_literal_node_get_type()) +G_DECLARE_DERIVABLE_TYPE(GGandivaNullLiteralNode, + ggandiva_null_literal_node, + GGANDIVA, + NULL_LITERAL_NODE, + GGandivaLiteralNode) +struct _GGandivaNullLiteralNodeClass +{ + GGandivaLiteralNodeClass parent_class; +}; + +GGandivaNullLiteralNode * +ggandiva_null_literal_node_new(GArrowDataType *return_type, + GError **error); + + #define GGANDIVA_TYPE_BOOLEAN_LITERAL_NODE (ggandiva_boolean_literal_node_get_type()) G_DECLARE_DERIVABLE_TYPE(GGandivaBooleanLiteralNode, ggandiva_boolean_literal_node, diff --git a/c_glib/gandiva-glib/node.hpp b/c_glib/gandiva-glib/node.hpp index 7ff136003f174..40f9d1b465591 100644 --- a/c_glib/gandiva-glib/node.hpp +++ b/c_glib/gandiva-glib/node.hpp @@ -36,4 +36,5 @@ ggandiva_function_node_new_raw(std::shared_ptr *gandiva_node, GList *parameters, GArrowDataType *return_type); GGandivaLiteralNode * -ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node); +ggandiva_literal_node_new_raw(std::shared_ptr *gandiva_node, + GArrowDataType *return_type); diff --git a/c_glib/test/gandiva/test-binary-literal-node.rb b/c_glib/test/gandiva/test-binary-literal-node.rb index 93a54a361cc82..fddf74830d4ab 100644 --- a/c_glib/test/gandiva/test-binary-literal-node.rb +++ b/c_glib/test/gandiva/test-binary-literal-node.rb @@ -21,14 +21,27 @@ def setup @value = "\x00\x01\x02\x03\x04" end - def test_new - literal_node = Gandiva::BinaryLiteralNode.new(@value) - assert_equal(@value, literal_node.value.to_s) + sub_test_case(".new") do + def test_string + node = Gandiva::BinaryLiteralNode.new(@value) + assert_equal(@value, node.value.to_s) + end + + def test_bytes + bytes_value = GLib::Bytes.new(@value) + node = Gandiva::BinaryLiteralNode.new(bytes_value) + assert_equal(@value, node.value.to_s) + end end - def test_new_bytes - bytes_value = GLib::Bytes.new(@value) - literal_node = Gandiva::BinaryLiteralNode.new(bytes_value) - assert_equal(@value, literal_node.value.to_s) + sub_test_case("instance methods") do + def setup + super + @node = Gandiva::BinaryLiteralNode.new(@value) + end + + def test_return_type + assert_equal(Arrow::BinaryDataType.new, @node.return_type) + end end end diff --git a/c_glib/test/gandiva/test-boolean-literal-node.rb b/c_glib/test/gandiva/test-boolean-literal-node.rb index 3d1f10c5e81c1..6e18a76218595 100644 --- a/c_glib/test/gandiva/test-boolean-literal-node.rb +++ b/c_glib/test/gandiva/test-boolean-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaBooleanLiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = true + @node = Gandiva::BooleanLiteralNode.new(@value) end def test_value - value = true - literal_node = Gandiva::BooleanLiteralNode.new(value) - assert_equal(value, literal_node.value?) + assert_equal(@value, @node.value?) 
+ end + + def test_return_type + assert_equal(Arrow::BooleanDataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-double-literal-node.rb b/c_glib/test/gandiva/test-double-literal-node.rb index fd4bd08e4c254..27cc3aea23b32 100644 --- a/c_glib/test/gandiva/test-double-literal-node.rb +++ b/c_glib/test/gandiva/test-double-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaDoubleLiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 1.5 + @node = Gandiva::DoubleLiteralNode.new(@value) end def test_value - value = 1.5 - literal_node = Gandiva::DoubleLiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::DoubleDataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-field-node.rb b/c_glib/test/gandiva/test-field-node.rb index c5bfe6cfc9743..51db285bcc0bf 100644 --- a/c_glib/test/gandiva/test-field-node.rb +++ b/c_glib/test/gandiva/test-field-node.rb @@ -18,11 +18,15 @@ class TestGandivaFieldNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @field = Arrow::Field.new("valid", Arrow::BooleanDataType.new) + @node = Gandiva::FieldNode.new(@field) end def test_field - field = Arrow::Field.new("valid", Arrow::BooleanDataType.new) - field_node = Gandiva::FieldNode.new(field) - assert_equal(field, field_node.field) + assert_equal(@field, @node.field) + end + + def test_return_type + assert_equal(@field.data_type, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-float-literal-node.rb b/c_glib/test/gandiva/test-float-literal-node.rb index 202ec38fc5907..4a49eb37441d1 100644 --- a/c_glib/test/gandiva/test-float-literal-node.rb +++ b/c_glib/test/gandiva/test-float-literal-node.rb @@ -18,17 +18,15 @@ class TestGandivaFloatLiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 1.5 + @node = Gandiva::FloatLiteralNode.new(@value) end - def test_new - assert_nothing_raised do - Gandiva::FloatLiteralNode.new(1.5) - end + def test_value + assert_equal(@value, @node.value) end - def test_value - value = 1.5 - literal_node = Gandiva::FloatLiteralNode.new(value) - assert_equal(value, literal_node.value) + def test_return_type + assert_equal(Arrow::FloatDataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-int16-literal-node.rb b/c_glib/test/gandiva/test-int16-literal-node.rb index 9b5bb6822ebba..f8e6b26849496 100644 --- a/c_glib/test/gandiva/test-int16-literal-node.rb +++ b/c_glib/test/gandiva/test-int16-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaInt16LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 15) + @node = Gandiva::Int16LiteralNode.new(@value) end def test_value - value = -3 - literal_node = Gandiva::Int16LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int16DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-int32-literal-node.rb b/c_glib/test/gandiva/test-int32-literal-node.rb index 9c94cdef4b125..3d1bf588cf7dc 100644 --- a/c_glib/test/gandiva/test-int32-literal-node.rb +++ b/c_glib/test/gandiva/test-int32-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaInt32LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + 
@value = -(2 ** 31) + @node = Gandiva::Int32LiteralNode.new(@value) end def test_value - value = -3 - literal_node = Gandiva::Int32LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int32DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-int64-literal-node.rb b/c_glib/test/gandiva/test-int64-literal-node.rb index e1b4b91d8c32c..b2ca3bf630b43 100644 --- a/c_glib/test/gandiva/test-int64-literal-node.rb +++ b/c_glib/test/gandiva/test-int64-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaInt64LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 63) + @node = Gandiva::Int64LiteralNode.new(@value) end def test_value - value = -3 - literal_node = Gandiva::Int64LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int64DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-int8-literal-node.rb b/c_glib/test/gandiva/test-int8-literal-node.rb index 30f11fc81a60d..8d917bd1b4dfe 100644 --- a/c_glib/test/gandiva/test-int8-literal-node.rb +++ b/c_glib/test/gandiva/test-int8-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaInt8LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = -(2 ** 7) + @node = Gandiva::Int8LiteralNode.new(@value) end def test_value - value = -3 - literal_node = Gandiva::Int8LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::Int8DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-null-literal-node.rb b/c_glib/test/gandiva/test-null-literal-node.rb new file mode 100644 index 0000000000000..ae14f3c15e411 --- /dev/null +++ b/c_glib/test/gandiva/test-null-literal-node.rb @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestGandivaNullLiteralNode < Test::Unit::TestCase + def setup + omit("Gandiva is required") unless defined?(::Gandiva) + end + + def test_invalid_type + return_type = Arrow::NullDataType.new + message = + "[gandiva][null-literal-node][new] " + + "failed to create: <#{return_type}>" + assert_raise(Arrow::Error::Invalid.new(message)) do + Gandiva::NullLiteralNode.new(return_type) + end + end + + def test_return_type + return_type = Arrow::BooleanDataType.new + literal_node = Gandiva::NullLiteralNode.new(return_type) + assert_equal(return_type, literal_node.return_type) + end +end diff --git a/c_glib/test/gandiva/test-string-literal-node.rb b/c_glib/test/gandiva/test-string-literal-node.rb index a231f6111f40f..8a397ab4d1a9b 100644 --- a/c_glib/test/gandiva/test-string-literal-node.rb +++ b/c_glib/test/gandiva/test-string-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaStringLiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = "Hello" + @node = Gandiva::StringLiteralNode.new(@value) end def test_value - value = "Hello" - literal_node = Gandiva::StringLiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::StringDataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-uint16-literal-node.rb b/c_glib/test/gandiva/test-uint16-literal-node.rb index e8bdd308969bb..971da38881df6 100644 --- a/c_glib/test/gandiva/test-uint16-literal-node.rb +++ b/c_glib/test/gandiva/test-uint16-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaUInt16LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 16 - 1 + @node = Gandiva::UInt16LiteralNode.new(@value) end def test_value - value = 3 - literal_node = Gandiva::UInt16LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt16DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-uint32-literal-node.rb b/c_glib/test/gandiva/test-uint32-literal-node.rb index 9d5995774dd97..8fcab7fefad87 100644 --- a/c_glib/test/gandiva/test-uint32-literal-node.rb +++ b/c_glib/test/gandiva/test-uint32-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaUInt32LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 32 - 1 + @node = Gandiva::UInt32LiteralNode.new(@value) end def test_value - value = 3 - literal_node = Gandiva::UInt32LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt32DataType.new, @node.return_type) end end diff --git a/c_glib/test/gandiva/test-uint64-literal-node.rb b/c_glib/test/gandiva/test-uint64-literal-node.rb index 56c46db81bd24..d5afddcd75f44 100644 --- a/c_glib/test/gandiva/test-uint64-literal-node.rb +++ b/c_glib/test/gandiva/test-uint64-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaUInt64LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 3 + @node = Gandiva::UInt64LiteralNode.new(@value) end def test_value - value = 3 - literal_node = Gandiva::UInt64LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt64DataType.new, @node.return_type) end end diff 
--git a/c_glib/test/gandiva/test-uint8-literal-node.rb b/c_glib/test/gandiva/test-uint8-literal-node.rb index 04f76cd76326f..8ce91d599f435 100644 --- a/c_glib/test/gandiva/test-uint8-literal-node.rb +++ b/c_glib/test/gandiva/test-uint8-literal-node.rb @@ -18,11 +18,15 @@ class TestGandivaUInt8LiteralNode < Test::Unit::TestCase def setup omit("Gandiva is required") unless defined?(::Gandiva) + @value = 2 ** 8 - 1 + @node = Gandiva::UInt8LiteralNode.new(@value) end def test_value - value = 3 - literal_node = Gandiva::UInt8LiteralNode.new(value) - assert_equal(value, literal_node.value) + assert_equal(@value, @node.value) + end + + def test_return_type + assert_equal(Arrow::UInt8DataType.new, @node.return_type) end end From b8d4477ffbe5a569521828964277e7d6ea115671 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Wed, 19 Dec 2018 13:57:13 +0100 Subject: [PATCH 33/80] ARROW-3989: [Rust] [CSV] Cast bool string to lower case in reader The csv reader currently only handles boolean types if the string is explicitly `true|false`. Excel saves bools as `TRUE|FALSE`, and Python/Pandas as `True|False`. This PR adds a condition that lowercases booleans when casting them to Arrow types. @andygrove @sunchao I believe it's ready for review. Author: Neville Dipale Closes #3214 from nevi-me/rust/boolean-case and squashes the following commits: 38d99426 move primitive array builder into Reader 9fae4428 move is_boolean_type check out of loop, remove duplicate impl Reader 2a86b527 : Cast timestamp string to lower case to handle True, TRUE ... --- rust/src/csv/reader.rs | 79 ++++++++++++++++++++---------------- rust/test/data/null_test.csv | 8 ++-- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs index 632aa7ae7936d..b9c46fc3217cc 100644 --- a/rust/src/csv/reader.rs +++ b/rust/src/csv/reader.rs @@ -87,32 +87,7 @@ impl Reader { batch_size, } } -} -fn build_primitive_array( - rows: &[StringRecord], - col_idx: &usize, -) -> Result { - let mut builder = PrimitiveArrayBuilder::::new(rows.len()); - for row_index in 0..rows.len() { - match rows[row_index].get(*col_idx) { - Some(s) if s.len() > 0 => match s.parse::() { - Ok(v) => builder.push(v)?, - Err(_) => { - // TODO: we should surface the underlying error here. 
- return Err(ArrowError::ParseError(format!( - "Error while parsing value {}", - s - ))); - } - }, - _ => builder.push_null().unwrap(), - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) -} - -impl Reader { /// Read the next batch of rows pub fn next(&mut self) -> Result> { // read a batch of rows into memory @@ -151,17 +126,17 @@ impl Reader { .map(|i| { let field = self.schema.field(*i); match field.data_type() { - &DataType::Boolean => build_primitive_array::(rows, i), - &DataType::Int8 => build_primitive_array::(rows, i), - &DataType::Int16 => build_primitive_array::(rows, i), - &DataType::Int32 => build_primitive_array::(rows, i), - &DataType::Int64 => build_primitive_array::(rows, i), - &DataType::UInt8 => build_primitive_array::(rows, i), - &DataType::UInt16 => build_primitive_array::(rows, i), - &DataType::UInt32 => build_primitive_array::(rows, i), - &DataType::UInt64 => build_primitive_array::(rows, i), - &DataType::Float32 => build_primitive_array::(rows, i), - &DataType::Float64 => build_primitive_array::(rows, i), + &DataType::Boolean => self.build_primitive_array::(rows, i), + &DataType::Int8 => self.build_primitive_array::(rows, i), + &DataType::Int16 => self.build_primitive_array::(rows, i), + &DataType::Int32 => self.build_primitive_array::(rows, i), + &DataType::Int64 => self.build_primitive_array::(rows, i), + &DataType::UInt8 => self.build_primitive_array::(rows, i), + &DataType::UInt16 => self.build_primitive_array::(rows, i), + &DataType::UInt32 => self.build_primitive_array::(rows, i), + &DataType::UInt64 => self.build_primitive_array::(rows, i), + &DataType::Float32 => self.build_primitive_array::(rows, i), + &DataType::Float64 => self.build_primitive_array::(rows, i), &DataType::Utf8 => { let values_builder: UInt8Builder = UInt8Builder::new(rows.len()); let mut list_builder = ListArrayBuilder::new(values_builder); @@ -191,6 +166,38 @@ impl Reader { Err(e) => Err(e), } } + + fn build_primitive_array( + &self, + rows: &[StringRecord], + col_idx: &usize, + ) -> Result { + let mut builder = PrimitiveArrayBuilder::::new(rows.len()); + let is_boolean_type = *self.schema.field(*col_idx).data_type() == DataType::Boolean; + for row_index in 0..rows.len() { + match rows[row_index].get(*col_idx) { + Some(s) if s.len() > 0 => { + let t = if is_boolean_type { + s.to_lowercase().parse::() + } else { + s.parse::() + }; + match t { + Ok(v) => builder.push(v)?, + Err(_) => { + // TODO: we should surface the underlying error here. + return Err(ArrowError::ParseError(format!( + "Error while parsing value {}", + s + ))); + } + } + } + _ => builder.push_null()?, + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + } } #[cfg(test)] diff --git a/rust/test/data/null_test.csv b/rust/test/data/null_test.csv index 80830606563b3..7e0dde5371429 100644 --- a/rust/test/data/null_test.csv +++ b/rust/test/data/null_test.csv @@ -1,6 +1,6 @@ c_int,c_float,c_string,c_bool -1,1.1,"1.11",true -2,2.2,"2.22",true +1,1.1,"1.11",True +2,2.2,"2.22",TRUE 3,,"3.33",true -4,4.4,,false -5,6.6,"",false \ No newline at end of file +4,4.4,,False +5,6.6,"",FALSE \ No newline at end of file From cec8d23dd48e764064adcfdfb33b13989fd3b667 Mon Sep 17 00:00:00 2001 From: cav71 Date: Wed, 19 Dec 2018 15:52:23 +0100 Subject: [PATCH 34/80] ARROW-4066: [Doc] Instructions to create Sphinx documentation Document how to build the documentation in the Python docs. 
Author: cav71 Author: Antoine Pitrou Closes #3198 from cav71/documentation and squashes the following commits: 9af13754 Missing word 1389e19d Remove spurious newlines 61b32356 Some improvements e21fdd7a update documentation 5ce1cf45 update documentation e5e6c4de Merge remote-tracking branch 'upstream/master' into documentation c132dffe update doc following comments from: https://github.com/apache/arrow/pull/3198 f3620520 doc doc --- docs/source/python/development.rst | 59 +++++++++++++++++++++++++++++- docs/source/python/index.rst | 17 +++++---- docs/source/python/install.rst | 4 +- 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index 4258feef79f44..1dcfda862817f 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -86,6 +86,8 @@ On Linux and OSX: --file arrow/ci/conda_env_python.yml \ python=3.6 + source activate pyarrow-dev + On Windows: .. code-block:: shell @@ -95,16 +97,18 @@ On Windows: --file arrow\ci\conda_env_python.yml ^ python=3.6 + activate pyarrow-dev + We need to set some environment variables to let Arrow's build system know about our build toolchain: .. code-block:: shell export ARROW_BUILD_TYPE=release - export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX + export BOOST_HOME=$CONDA_PREFIX Using pip ~~~~~~~~~ @@ -207,9 +211,10 @@ Now, build pyarrow: .. code-block:: shell - cd arrow/python + pushd arrow/python python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ --with-parquet --with-plasma --inplace + popd If you did not build with plasma, you can omit ``--with-plasma``. @@ -352,3 +357,53 @@ Getting ``python-test.exe`` to run is a bit tricky because your set PYTHONHOME=%CONDA_PREFIX% Now ``python-test.exe`` or simply ``ctest`` (to run all tests) should work. + +Building the Documentation +========================== + +Prerequisites +------------- + +The documentation build process uses `Doxygen `_ and +`Sphinx `_ along with a few extensions. + +If you're using Conda, the required software can be installed in a single line: + +.. code-block:: shell + + conda install -c conda-forge --file ci/conda_env_sphinx.yml + +Otherwise, you'll first need to install `Doxygen `_ +yourself (for example from your distribution's official repositories, if +using Linux). Then you can install the Python-based requirements with the +following command: + +.. code-block:: shell + + pip install -r docs/requirements.txt + +Building +-------- + +These two steps are mandatory and must be executed in order. + +#. Process the C++ API using Doxygen + + .. code-block:: shell + + pushd cpp/apidoc + doxygen + popd + +#. Build the complete documentation using Sphinx + + .. code-block:: shell + + pushd docs + make html + popd + +After these steps are completed, the documentation is rendered in HTML +format in ``docs/_build/html``. In particular, you can point your browser +at ``docs/_build/html/index.html`` to read the docs and review any changes +you made. diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index 56282192b170b..cf691e37eaa25 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -18,21 +18,22 @@ Python bindings =============== -The Arrow Python bindings have first-class integration with NumPy, pandas, and -built-in Python objects. They are based on the C++ implementation of Arrow. - This is the documentation of the Python API of Apache Arrow. 
For more details -on the format and other language bindings see the parent documentation. -Here will we only detail the usage of the Python API for Arrow and the leaf +on the Arrow format and other language bindings see the +:doc:`parent documentation <../index>`. + +The Arrow Python bindings (also named "PyArrow") have first-class integration +with NumPy, pandas, and built-in Python objects. They are based on the C++ +implementation of Arrow. + +Here will we detail the usage of the Python API for Arrow and the leaf libraries that add additional functionality such as reading Apache Parquet files into Arrow structures. .. toctree:: :maxdepth: 2 - :caption: Getting Started install - development memory data ipc @@ -44,5 +45,5 @@ files into Arrow structures. parquet extending api + development getting_involved - diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index d07d9004d2632..8092b6ce6a0ef 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -15,8 +15,8 @@ .. specific language governing permissions and limitations .. under the License. -Install PyArrow -=============== +Installing PyArrow +================== Conda ----- From 6bfac93ab1c133190d782683df4054f98b2007e5 Mon Sep 17 00:00:00 2001 From: shyam Date: Wed, 19 Dec 2018 09:10:16 -0600 Subject: [PATCH 35/80] ARROW-3979 : [Gandiva] fix all valgrind reported errors Fix all the issues reported by valgrind and also enable option ARROW_TRAVIS_VALGRIND. Author: shyam Closes #3201 from shyambits2004/master and squashes the following commits: 81d5b7669 ARROW-3979 : fix all valgrind reported errors --- .travis.yml | 5 +-- cpp/src/gandiva/bitmap_accumulator_test.cc | 7 ++- cpp/src/gandiva/eval_batch.h | 2 +- cpp/src/gandiva/exported_funcs_registry.h | 8 ++-- cpp/src/gandiva/local_bitmaps_holder.h | 6 +-- cpp/src/gandiva/precompiled/CMakeLists.txt | 2 +- cpp/src/gandiva/projector.cc | 6 +++ cpp/src/gandiva/selection_vector_test.cc | 51 ++++++++++------------ cpp/src/gandiva/tests/projector_test.cc | 9 ++-- cpp/valgrind.supp | 13 +++++- 10 files changed, 62 insertions(+), 47 deletions(-) diff --git a/.travis.yml b/.travis.yml index bf0261b3fa1ea..64408128fe146 100644 --- a/.travis.yml +++ b/.travis.yml @@ -114,8 +114,7 @@ matrix: - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - ARROW_CPP_BUILD_TARGETS="gandiva-all" - ARROW_TRAVIS_USE_TOOLCHAIN=1 - # ARROW-3979 temporarily disabled. 
- - ARROW_TRAVIS_VALGRIND=0 + - ARROW_TRAVIS_VALGRIND=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: @@ -123,7 +122,7 @@ matrix: - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh diff --git a/cpp/src/gandiva/bitmap_accumulator_test.cc b/cpp/src/gandiva/bitmap_accumulator_test.cc index fc89421344e83..53e8aaca21ff1 100644 --- a/cpp/src/gandiva/bitmap_accumulator_test.cc +++ b/cpp/src/gandiva/bitmap_accumulator_test.cc @@ -32,9 +32,8 @@ class TestBitMapAccumulator : public ::testing::Test { int nrecords); }; -void TestBitMapAccumulator::FillBitMap(uint8_t* bmap, int nrecords) { - int nbytes = nrecords / 8; - unsigned int cur; +void TestBitMapAccumulator::FillBitMap(uint8_t* bmap, int nbytes) { + unsigned int cur = 0; for (int i = 0; i < nbytes; ++i) { rand_r(&cur); @@ -62,7 +61,7 @@ TEST_F(TestBitMapAccumulator, TestIntersectBitMaps) { uint8_t expected_bitmap[length]; for (int i = 0; i < 4; i++) { - FillBitMap(src_bitmaps[i], nrecords); + FillBitMap(src_bitmaps[i], length); } for (int i = 0; i < 4; i++) { diff --git a/cpp/src/gandiva/eval_batch.h b/cpp/src/gandiva/eval_batch.h index 608f4200ce415..093968f232afb 100644 --- a/cpp/src/gandiva/eval_batch.h +++ b/cpp/src/gandiva/eval_batch.h @@ -85,7 +85,7 @@ class EvalBatch { /// An array of 'num_buffers_', each containing a buffer. The buffer /// sizes depends on the data type, but all of them have the same /// number of slots (equal to num_records_). - std::unique_ptr buffers_array_; + std::unique_ptr buffers_array_; std::unique_ptr local_bitmaps_holder_; diff --git a/cpp/src/gandiva/exported_funcs_registry.h b/cpp/src/gandiva/exported_funcs_registry.h index 511ec9c212468..35ad5c0fae516 100644 --- a/cpp/src/gandiva/exported_funcs_registry.h +++ b/cpp/src/gandiva/exported_funcs_registry.h @@ -18,6 +18,7 @@ #ifndef GANDIVA_EXPORTED_FUNCS_REGISTRY_H #define GANDIVA_EXPORTED_FUNCS_REGISTRY_H +#include #include #include @@ -30,12 +31,12 @@ class ExportedFuncsBase; /// LLVM/IR code. class ExportedFuncsRegistry { public: - using list_type = std::vector; + using list_type = std::vector>; // Add functions from all the registered classes to the engine. static void AddMappings(Engine* engine); - static bool Register(ExportedFuncsBase* entry) { + static bool Register(std::shared_ptr entry) { registered().push_back(entry); return true; } @@ -48,7 +49,8 @@ class ExportedFuncsRegistry { }; #define REGISTER_EXPORTED_FUNCS(classname) \ - static bool _registered_##classname = ExportedFuncsRegistry::Register(new classname) + static bool _registered_##classname = \ + ExportedFuncsRegistry::Register(std::make_shared()) } // namespace gandiva diff --git a/cpp/src/gandiva/local_bitmaps_holder.h b/cpp/src/gandiva/local_bitmaps_holder.h index 1dc82562e3110..ae0ba53e99003 100644 --- a/cpp/src/gandiva/local_bitmaps_holder.h +++ b/cpp/src/gandiva/local_bitmaps_holder.h @@ -50,10 +50,10 @@ class LocalBitMapsHolder { int64_t num_records_; /// A container of 'local_bitmaps_', each sized to accomodate 'num_records'. - std::vector> local_bitmaps_vec_; + std::vector> local_bitmaps_vec_; /// An array of the local bitmaps. 
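
Aside: the recurring fix in this patch is to back scratch buffers with value-initialized std::vector<uint8_t> storage instead of raw new uint8_t[] allocations, and to zero the trailing byte of bit-packed boolean output buffers. A minimal standalone sketch of the distinction (plain C++, not Arrow API; names are illustrative only):

    #include <cstdint>
    #include <memory>
    #include <vector>

    int main() {
      // Heap array: the bytes are indeterminate until written, so any later
      // read of an untouched byte is reported by valgrind/Memcheck.
      std::unique_ptr<uint8_t[]> raw(new uint8_t[16]);
      (void)raw;

      // std::vector value-initializes its elements: every byte starts out as
      // a defined zero, with no separate memset required.
      std::vector<uint8_t> bitmap(16);

      // Bit-packed buffers only set the first num_records bits; explicitly
      // zeroing the trailing byte (as projector.cc now does for BOOL output)
      // keeps the unused padding bits defined as well.
      bitmap.back() = 0;
      return 0;
    }
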
- std::unique_ptr local_bitmaps_array_; + std::unique_ptr local_bitmaps_array_; int64_t local_bitmap_size_; }; @@ -72,7 +72,7 @@ inline LocalBitMapsHolder::LocalBitMapsHolder(int64_t num_records, int num_local // Alloc 'num_local_bitmaps_' number of bitmaps, each of capacity 'num_records_'. for (int i = 0; i < num_local_bitmaps; ++i) { // TODO : round-up to a slab friendly multiple. - std::unique_ptr bitmap(new uint8_t[local_bitmap_size_]); + std::unique_ptr bitmap(new uint8_t[local_bitmap_size_]); // keep pointer to the bitmap in the array. (local_bitmaps_array_.get())[i] = bitmap.get(); diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 2af49084bf310..21a74bd4916ee 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -65,7 +65,7 @@ function(add_precompiled_unit_test REL_TEST_NAME) ) target_compile_definitions(${TEST_NAME} PRIVATE GANDIVA_UNIT_TEST=1) add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) - set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva;unittest ${TEST_NAME}) + set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva-tests {TEST_NAME}) endfunction(add_precompiled_unit_test REL_TEST_NAME) # testing diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 8020a45b3d302..40fdc201133a4 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -175,6 +175,12 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, astatus = arrow::AllocateBuffer(pool, data_len, &data); ARROW_RETURN_NOT_OK(astatus); + // Valgrind detects unitialized memory at byte level. Boolean types use bits + // and can leave buffer memory uninitialized in the last byte. + if (type->id() == arrow::Type::BOOL) { + data->mutable_data()[data_len - 1] = 0; + } + *array_data = arrow::ArrayData::Make(type, num_records, {null_bitmap, data}); return Status::OK(); } diff --git a/cpp/src/gandiva/selection_vector_test.cc b/cpp/src/gandiva/selection_vector_test.cc index acb0f338cd6ae..67389273c82f2 100644 --- a/cpp/src/gandiva/selection_vector_test.cc +++ b/cpp/src/gandiva/selection_vector_test.cc @@ -18,6 +18,7 @@ #include "gandiva/selection_vector.h" #include +#include #include @@ -102,15 +103,14 @@ TEST_F(TestSelectionVector, TestInt16PopulateFromBitMap) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = RoundUpNumi64(max_slots) * 8; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 5); - arrow::BitUtil::SetBit(bitmap.get(), 121); - arrow::BitUtil::SetBit(bitmap.get(), 220); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 5); + arrow::BitUtil::SetBit(&bitmap[0], 121); + arrow::BitUtil::SetBit(&bitmap[0], 220); - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1); EXPECT_EQ(status.ok(), true) << status.message(); EXPECT_EQ(selection->GetNumSlots(), 3); @@ -127,15 +127,14 @@ TEST_F(TestSelectionVector, TestInt16PopulateFromBitMapNegative) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = 16; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 1); - 
arrow::BitUtil::SetBit(bitmap.get(), 2); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 1); + arrow::BitUtil::SetBit(&bitmap[0], 2); // The bitmap has three set bits, whereas the selection vector has capacity for only 2. - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, 2); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, 2); EXPECT_EQ(status.IsInvalid(), true); } @@ -175,15 +174,14 @@ TEST_F(TestSelectionVector, TestInt32PopulateFromBitMap) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = RoundUpNumi64(max_slots) * 8; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 5); - arrow::BitUtil::SetBit(bitmap.get(), 121); - arrow::BitUtil::SetBit(bitmap.get(), 220); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 5); + arrow::BitUtil::SetBit(&bitmap[0], 121); + arrow::BitUtil::SetBit(&bitmap[0], 220); - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1); EXPECT_EQ(status.ok(), true) << status.message(); EXPECT_EQ(selection->GetNumSlots(), 3); @@ -243,15 +241,14 @@ TEST_F(TestSelectionVector, TestInt64PopulateFromBitMap) { EXPECT_EQ(status.ok(), true) << status.message(); int bitmap_size = RoundUpNumi64(max_slots) * 8; - std::unique_ptr bitmap(new uint8_t[bitmap_size]); - memset(bitmap.get(), 0, bitmap_size); + std::vector bitmap(bitmap_size); - arrow::BitUtil::SetBit(bitmap.get(), 0); - arrow::BitUtil::SetBit(bitmap.get(), 5); - arrow::BitUtil::SetBit(bitmap.get(), 121); - arrow::BitUtil::SetBit(bitmap.get(), 220); + arrow::BitUtil::SetBit(&bitmap[0], 0); + arrow::BitUtil::SetBit(&bitmap[0], 5); + arrow::BitUtil::SetBit(&bitmap[0], 121); + arrow::BitUtil::SetBit(&bitmap[0], 220); - status = selection->PopulateFromBitMap(bitmap.get(), bitmap_size, max_slots - 1); + status = selection->PopulateFromBitMap(&bitmap[0], bitmap_size, max_slots - 1); EXPECT_EQ(status.ok(), true) << status.message(); EXPECT_EQ(selection->GetNumSlots(), 3); diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index becaf8f1ba3d7..61d9dc3ad1629 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -493,14 +493,15 @@ TEST_F(TestProjector, TestZeroCopy) { // allocate output buffers int64_t bitmap_sz = arrow::BitUtil::BytesForBits(num_records); - std::unique_ptr bitmap(new uint8_t[bitmap_sz]); + int64_t bitmap_capacity = arrow::BitUtil::RoundUpToMultipleOf64(bitmap_sz); + std::vector bitmap(bitmap_capacity); std::shared_ptr bitmap_buf = - std::make_shared(bitmap.get(), bitmap_sz); + std::make_shared(&bitmap[0], bitmap_capacity); int64_t data_sz = sizeof(float) * num_records; - std::unique_ptr data(new uint8_t[data_sz]); + std::vector data(bitmap_capacity); std::shared_ptr data_buf = - std::make_shared(data.get(), data_sz); + std::make_shared(&data[0], data_sz); auto array_data = arrow::ArrayData::Make(float32(), num_records, {bitmap_buf, data_buf}); diff --git a/cpp/valgrind.supp b/cpp/valgrind.supp index 8e707e39e7cd8..d8bc8fb28f2d5 100644 --- a/cpp/valgrind.supp +++ b/cpp/valgrind.supp @@ -21,4 +21,15 @@ Memcheck:Cond fun:*CastFunctor*BooleanType* } - +{ + :Conditional jump or move depends on uninitialised value(s) + Memcheck:Cond + ... 
+ fun:_ZN3re23RE2C1E* +} +{ + :Use of uninitialised value of size 8 + Memcheck:Value8 + ... + fun:_ZN3re23RE2C1E* +} From d08964334082e87010b37933623f021c98e8733d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Dec 2018 10:07:13 -0600 Subject: [PATCH 36/80] ARROW-3803: [C++/Python] Merge C++ builds and tests, run Python tests in separate CI entries I found unfortunately that the conda-forge boost-cpp package is not fully compatible with Xcode 8.3, see https://issues.apache.org/jira/browse/ARROW-4056 We might have to build a vendored Boost in this CI entry to work around the problem (this is what the Ray project did when they also hit this issue) Author: Wes McKinney Closes #3208 from wesm/ARROW-3803 and squashes the following commits: 7c47776a9 Remove now unneeded travis_script_gandiva_cpp.sh 9c8d6aa27 * Combine C++ CI jobs, split Python CI jobs into separate build entries * Use gcc 4.8 * Pin boost-cpp 1.68.0 due to crashes caused by 1.69.0 --- .travis.yml | 103 +++++++++--------- ci/conda_env_cpp.yml | 4 +- ci/travis_before_script_cpp.sh | 19 +++- ci/travis_env_common.sh | 2 + ci/travis_script_gandiva_cpp.sh | 30 ----- ci/travis_script_python.sh | 6 +- cpp/cmake_modules/ThirdpartyToolchain.cmake | 2 + cpp/src/plasma/CMakeLists.txt | 2 + .../gandiva/evaluator/MicroBenchmarkTest.java | 2 + python/CMakeLists.txt | 1 + 10 files changed, 77 insertions(+), 94 deletions(-) delete mode 100755 ci/travis_script_gandiva_cpp.sh diff --git a/.travis.yml b/.travis.yml index 64408128fe146..f7094fc56d641 100644 --- a/.travis.yml +++ b/.travis.yml @@ -62,74 +62,67 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh script: - $TRAVIS_BUILD_DIR/ci/travis_lint.sh - - name: "C++ & Python w/ gcc 4.9" + - name: "C++ unit tests, code coverage with gcc 4.8" compiler: gcc language: cpp os: linux jdk: openjdk8 env: - - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_TRAVIS_VALGRIND=1 + - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_COVERAGE=1 - ARROW_TRAVIS_PARQUET=1 - - ARROW_TRAVIS_PYTHON_DOCS=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - - ARROW_TRAVIS_PYTHON_JVM=1 - - ARROW_TRAVIS_JAVA_BUILD_ONLY=1 - - ARROW_TRAVIS_PYTHON_GANDIVA=1 - # ARROW-2999 Benchmarks are disabled in Travis CI for the time being - # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" before_script: - # (ARROW_CI_CPP_AFFECTED implies ARROW_CI_PYTHON_AFFECTED) - - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh # If either C++ or Python changed, we must install the C++ libraries - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - # All test steps are required for accurate C++ coverage info - - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - # Build Arrow Java to test the pyarrow<->JVM in-process bridge - - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh - # Only run Plasma tests with valgrind in one of the Python builds because - # they are slow - - export PLASMA_VALGRIND=0 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - - export PLASMA_VALGRIND=1 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh - - name: "Gandiva C++ w/ gcc 4.9 and Java" + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 + - 
$TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh || travis_terminate 1 + - name: "Python 2.7 and 3.6 unit tests, coverage with gcc 4.8" compiler: gcc language: cpp os: linux jdk: openjdk8 env: - - ARROW_TRAVIS_GANDIVA=1 - - ARROW_TRAVIS_GANDIVA_JAVA=1 - - ARROW_TRAVIS_GANDIVA_TESTS=1 - - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - - ARROW_CPP_BUILD_TARGETS="gandiva-all" - - ARROW_TRAVIS_USE_TOOLCHAIN=1 + # Valgrind is needed for the Plasma store tests - ARROW_TRAVIS_VALGRIND=1 + - ARROW_TRAVIS_USE_TOOLCHAIN=1 + - ARROW_TRAVIS_COVERAGE=1 + - ARROW_TRAVIS_PYTHON_DOCS=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" + - ARROW_TRAVIS_PYTHON_JVM=1 + - ARROW_TRAVIS_PYTHON_GANDIVA=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 + # TODO(wesm): Run the benchmarks outside of Travis + # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 before_script: - # Run if something changed in CPP or Java. - - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh script: - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - - name: "[OS X] C++ & Python w/ XCode 6.4" + - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh + # Only run Plasma tests with valgrind in one of the Python builds because + # they are slow + - export PLASMA_VALGRIND=0 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 + - export PLASMA_VALGRIND=1 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 + - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh + - name: "[OS X] C++ w/ XCode 8.3" compiler: clang language: cpp - osx_image: xcode6.4 + osx_image: xcode8.3 os: osx cache: addons: @@ -138,39 +131,41 @@ matrix: - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_PARQUET=1 + - ARROW_TRAVIS_GANDIVA=1 + - ARROW_TRAVIS_GANDIVA_JAVA=1 + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN + # ARROW-3803: The Xcode 8.3 image has Boost libraries in /usr/local/lib + # which can get loaded before the toolchain Boost libraries. These seem to + # get loaded even though we are modifying LD_LIBRARY_PATH. We build our own + # Boost and statically link to get around the issue until this can be + # investigated further + - ARROW_TRAVIS_VENDORED_BOOST=1 before_script: - - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi # If either C++ or Python changed, we must install the C++ libraries - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - - if [ $ARROW_CI_CPP_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh; fi - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - - name: "[OS X] Gandiva C++ w/ XCode 8.3 & Java" + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh + - name: "[OS X] Python w/ XCode 6.4" compiler: clang language: cpp - # xcode 7.3 has a bug in strptime. 
- osx_image: xcode8.3 + osx_image: xcode6.4 os: osx cache: addons: env: - - ARROW_TRAVIS_GANDIVA=1 - - ARROW_TRAVIS_GANDIVA_JAVA=1 - - ARROW_TRAVIS_GANDIVA_TESTS=1 - - ARROW_TRAVIS_OPTIONAL_INSTALL=1 - - ARROW_CPP_BUILD_TARGETS="gandiva-all" - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN + - ARROW_TRAVIS_OPTIONAL_INSTALL=1 before_script: - # Run if something changed in CPP or Java. - - if [ $ARROW_CI_CPP_AFFECTED != "1" ] && [ $ARROW_CI_JAVA_AFFECTED != "1" ]; then exit; fi - - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library script: - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_cpp.sh - - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh + - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - name: "[manylinux1] Python" language: cpp before_script: diff --git a/ci/conda_env_cpp.yml b/ci/conda_env_cpp.yml index 1e22e9017fc62..87523b3fdd611 100644 --- a/ci/conda_env_cpp.yml +++ b/ci/conda_env_cpp.yml @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. -boost-cpp +# ARROW-4056: The conda-forge boost 1.69.0 seems to break the Parquet unit +# tests with Xcode 8.3. Root cause not yet determined +boost-cpp=1.68.0 brotli bzip2 cmake diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index aa5b2a6ab084c..8ddc98691015f 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -40,6 +40,14 @@ if [ "$only_library_mode" == "no" ]; then source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh fi +if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then + # Set up C++ toolchain from conda-forge packages for faster builds + source $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh +fi + +mkdir -p $ARROW_CPP_BUILD_DIR +pushd $ARROW_CPP_BUILD_DIR + CMAKE_COMMON_FLAGS="\ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL \ -DARROW_NO_DEPRECATED_API=ON \ @@ -48,15 +56,10 @@ CMAKE_LINUX_FLAGS="" CMAKE_OSX_FLAGS="" if [ "$ARROW_TRAVIS_USE_TOOLCHAIN" == "1" ]; then - # Set up C++ toolchain from conda-forge packages for faster builds - source $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh CMAKE_COMMON_FLAGS="${CMAKE_COMMON_FLAGS} -DARROW_JEMALLOC=ON" CMAKE_COMMON_FLAGS="${CMAKE_COMMON_FLAGS} -DARROW_WITH_BZ2=ON" fi -mkdir -p $ARROW_CPP_BUILD_DIR -pushd $ARROW_CPP_BUILD_DIR - if [ $only_library_mode == "yes" ]; then CMAKE_COMMON_FLAGS="\ $CMAKE_COMMON_FLAGS \ @@ -115,10 +118,14 @@ if [ $ARROW_TRAVIS_VERBOSE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_VERBOSE_THIRDPARTY_BUILD=ON" fi -if [ $ARROW_TRAVIS_USE_VENDORED_BOOST == "1" ]; then +if [ $ARROW_TRAVIS_VENDORED_BOOST == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BOOST_VENDORED=ON" fi +if [ $ARROW_TRAVIS_STATIC_BOOST == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_BOOST_USE_SHARED=OFF" +fi + if [ $ARROW_TRAVIS_OPTIONAL_INSTALL == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_OPTIONAL_INSTALL=ON" fi diff --git a/ci/travis_env_common.sh b/ci/travis_env_common.sh index f5748b2a0452a..636a25fcd7486 100755 --- a/ci/travis_env_common.sh +++ b/ci/travis_env_common.sh @@ -33,6 +33,8 @@ export ARROW_RUBY_DIR=$TRAVIS_BUILD_DIR/ruby export ARROW_RUST_DIR=${TRAVIS_BUILD_DIR}/rust export ARROW_R_DIR=${TRAVIS_BUILD_DIR}/r +export ARROW_TRAVIS_COVERAGE=${ARROW_TRAVIS_COVERAGE:=0} + if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then export 
ARROW_CPP_COVERAGE_FILE=${TRAVIS_BUILD_DIR}/coverage.info export ARROW_PYTHON_COVERAGE_FILE=${TRAVIS_BUILD_DIR}/.coverage diff --git a/ci/travis_script_gandiva_cpp.sh b/ci/travis_script_gandiva_cpp.sh deleted file mode 100755 index bc4a7a9a8f03b..0000000000000 --- a/ci/travis_script_gandiva_cpp.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh - -pushd $CPP_BUILD_DIR - -PATH=$ARROW_BUILD_TYPE:$PATH ctest -j2 --output-on-failure -L gandiva-tests - -popd - -# TODO : Capture C++ coverage info diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 20ec57efc39e4..69e115a9dcce7 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -87,7 +87,7 @@ rm -rf * # XXX Can we simply reuse CMAKE_COMMON_FLAGS from travis_before_script_cpp.sh? CMAKE_COMMON_FLAGS="-DARROW_EXTRA_ERROR_CONTEXT=ON" -PYTHON_CPP_BUILD_TARGETS="arrow_python-all plasma" +PYTHON_CPP_BUILD_TARGETS="arrow_python-all plasma parquet" if [ $ARROW_TRAVIS_COVERAGE == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON" @@ -103,6 +103,7 @@ cmake -GNinja \ -DARROW_BUILD_TESTS=ON \ -DARROW_BUILD_UTILITIES=OFF \ -DARROW_OPTIONAL_INSTALL=ON \ + -DARROW_PARQUET=on \ -DARROW_PLASMA=on \ -DARROW_TENSORFLOW=on \ -DARROW_PYTHON=on \ @@ -176,12 +177,11 @@ if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then coverage report -i --include="*/_parquet.pyx" # Generate XML file for CodeCov coverage xml -i -o $TRAVIS_BUILD_DIR/coverage.xml - # Capture C++ coverage info and combine with previous coverage file + # Capture C++ coverage info pushd $TRAVIS_BUILD_DIR lcov --quiet --directory . 
--capture --no-external --output-file coverage-python-tests.info \ 2>&1 | grep -v "WARNING: no data found for /usr/include" lcov --add-tracefile coverage-python-tests.info \ - --add-tracefile $ARROW_CPP_COVERAGE_FILE \ --output-file $ARROW_CPP_COVERAGE_FILE rm coverage-python-tests.info popd # $TRAVIS_BUILD_DIR diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index db0b69be460ce..3381b5cda16b4 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -346,6 +346,8 @@ if (MSVC AND ARROW_USE_STATIC_CRT) set(Boost_USE_STATIC_RUNTIME ON) endif() set(Boost_ADDITIONAL_VERSIONS + "1.70.0" "1.70" + "1.69.0" "1.69" "1.68.0" "1.68" "1.67.0" "1.67" "1.66.0" "1.66" diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index d9c7dcaedeac3..a71acf8ae43d8 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -96,6 +96,8 @@ ADD_ARROW_LIB(plasma SHARED_LINK_LIBS ${FLATBUFFERS_STATIC_LIB} ${CMAKE_THREAD_LIBS_INIT} ${PLASMA_LINK_LIBS} STATIC_LINK_LIBS ${FLATBUFFERS_STATIC_LIB} ${CMAKE_THREAD_LIBS_INIT} ${PLASMA_STATIC_LINK_LIBS}) +add_dependencies(plasma ${PLASMA_LIBRARIES}) + foreach(LIB_TARGET ${PLASMA_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java index c4d6bd9070613..6934c3f9e7d1a 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/MicroBenchmarkTest.java @@ -26,10 +26,12 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import com.google.common.collect.Lists; +@Ignore public class MicroBenchmarkTest extends BaseEvaluatorTest { private double toleranceRatio = 4.0; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 1a874542c8f9d..a6e4123082532 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -138,6 +138,7 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constant-logical-operand") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sometimes-uninitialized") # We have public Cython APIs which return C++ types, which are in an extern # "C" blog (no symbol mangling) and clang doesn't like this From 320621dae6704dab000dddbf400a87a6f4a79914 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Dec 2018 10:54:35 -0600 Subject: [PATCH 37/80] ARROW-4030: [CI] Use travis_terminate in more script commands to fail faster I had done this partially in ARROW-3803, but I reviewed again and tried to apply this more consistently. 
Note it is not necessary to use this in the last command in the script: block Author: Wes McKinney Closes #3226 from wesm/ARROW-4030 and squashes the following commits: a04c11e11 Use travis_terminate in more builds --- .travis.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index f7094fc56d641..10300c9b6e287 100644 --- a/.travis.yml +++ b/.travis.yml @@ -111,13 +111,13 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh script: - - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh || travis_terminate 1 # Only run Plasma tests with valgrind in one of the Python builds because # they are slow - export PLASMA_VALGRIND=0 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 || travis_terminate 1 - export PLASMA_VALGRIND=1 - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_upload_cpp_coverage.sh - name: "[OS X] C++ w/ XCode 8.3" compiler: clang @@ -147,7 +147,7 @@ matrix: - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_script_gandiva_java.sh - name: "[OS X] Python w/ XCode 6.4" compiler: clang @@ -163,8 +163,8 @@ matrix: before_script: script: - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi - - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh - - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 + - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh || travis_terminate 1 + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 || travis_terminate 1 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 - name: "[manylinux1] Python" language: cpp From f66fa805e89aef948581876ac802b1ffc6430f5c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 19 Dec 2018 12:53:09 -0600 Subject: [PATCH 38/80] ARROW-554: [C++] Add functions to unify dictionary types and arrays Author: Antoine Pitrou Closes #3165 from pitrou/ARROW-554-conform-dicts and squashes the following commits: 7d2579b30 ARROW-554: Add functions to conform dictionaries --- cpp/src/arrow/array-dict-test.cc | 62 ++++++++++ cpp/src/arrow/array.cc | 61 ++++++++++ cpp/src/arrow/array.h | 24 +++- cpp/src/arrow/array/builder_dict.cc | 172 +++++++++++++++++++++------- cpp/src/arrow/type-test.cc | 128 +++++++++++++++++++++ cpp/src/arrow/type.cc | 7 +- cpp/src/arrow/type.h | 18 +++ cpp/src/arrow/util/hashing.h | 38 ------ cpp/src/arrow/util/int-util-test.cc | 9 ++ cpp/src/arrow/util/int-util.cc | 40 +++++++ cpp/src/arrow/util/int-util.h | 4 + cpp/src/arrow/visitor_inline.h | 2 +- python/pyarrow/tests/test_types.py | 4 +- 13 files changed, 484 insertions(+), 85 deletions(-) diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index 87cb2290a7bf9..730b891cf57f4 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -31,6 +31,7 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" namespace arrow { @@ -38,6 +39,8 @@ namespace arrow { using std::string; using std::vector; +using internal::checked_cast; + // ---------------------------------------------------------------------- // Dictionary 
tests @@ -740,4 +743,63 @@ TEST(TestDictionary, FromArray) { ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4)); } +TEST(TestDictionary, TransposeBasic) { + std::shared_ptr arr, out, expected; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), dict); + auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]"); + // ["B", "C", "A", "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); + + // Transpose to same index type + { + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int16(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*out, *expected); + } + + // Transpose to other type + { + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int8(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*expected, *out); + } +} + +TEST(TestDictionary, TransposeNulls) { + std::shared_ptr arr, out, expected; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), dict); + auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]"); + // ["B", "C", null, "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr)); + + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto out_dict_type = dictionary(int16(), out_dict); + + const std::vector transpose_map{1, 3, 2}; + ASSERT_OK(internal::checked_cast(*arr).Transpose( + default_memory_pool(), out_dict_type, transpose_map, &out)); + + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]"); + ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected)); + AssertArraysEqual(*expected, *out); +} + } // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index ff94aa2a1e6fe..7e45e90d9c8f7 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -33,6 +33,7 @@ #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/visitor.h" @@ -663,6 +664,66 @@ std::shared_ptr DictionaryArray::dictionary() const { return dict_type_->dictionary(); } +template +static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data, + const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) { + using in_c_type = typename InType::c_type; + using out_c_type = typename OutType::c_type; + + std::shared_ptr out_buffer; + RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer)); + // Null bitmap is unchanged + auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer}, + in_data.null_count); + internal::TransposeInts(in_data.GetValues(1), + out_data->GetMutableValues(1), in_data.length, + 
transpose_map.data()); + *out = MakeArray(out_data); + return Status::OK(); +} + +Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) const { + DCHECK_EQ(type->id(), Type::DICTIONARY); + const auto& out_dict_type = checked_cast(*type); + + // XXX We'll probably want to make this operation a kernel when we + // implement dictionary-to-dictionary casting. + auto in_type_id = dict_type_->index_type()->id(); + auto out_type_id = out_dict_type.index_type()->id(); + +#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \ + case OUT_INDEX_TYPE::type_id: \ + return TransposeDictIndices(pool, *data(), type, \ + transpose_map, out); + +#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \ + case IN_INDEX_TYPE::type_id: \ + switch (out_type_id) { \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \ + TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \ + default: \ + return Status::NotImplemented("unexpected index type"); \ + } + + switch (in_type_id) { + TRANSPOSE_IN_CASE(Int8Type) + TRANSPOSE_IN_CASE(Int16Type) + TRANSPOSE_IN_CASE(Int32Type) + TRANSPOSE_IN_CASE(Int64Type) + default: + return Status::NotImplemented("unexpected index type"); + } + +#undef TRANSPOSE_IN_OUT_CASE +#undef TRANSPOSE_IN_CASE +} + // ---------------------------------------------------------------------- // Implement Array::Accept as inline visitor diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 52c5207d8dddc..aead17f133d74 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -422,6 +422,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { value_type Value(int64_t i) const { return raw_values()[i]; } + // For API compatibility with BinaryArray etc. + value_type GetView(int64_t i) const { return Value(i); } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -442,6 +445,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { i + data_->offset); } + bool GetView(int64_t i) const { return Value(i); } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -802,7 +807,7 @@ class ARROW_EXPORT DictionaryArray : public Array { /// This function does the validation of the indices and input type. It checks if /// all indices are non-negative and smaller than the size of the dictionary /// - /// \param[in] type a data type containing a dictionary + /// \param[in] type a dictionary type /// \param[in] indices an array of non-negative signed /// integers smaller than the size of the dictionary /// \param[out] out the resulting DictionaryArray instance @@ -810,6 +815,23 @@ class ARROW_EXPORT DictionaryArray : public Array { const std::shared_ptr& indices, std::shared_ptr* out); + /// \brief Transpose this DictionaryArray + /// + /// This method constructs a new dictionary array with the given dictionary type, + /// transposing indices using the transpose map. + /// The type and the transpose map are typically computed using + /// DictionaryType::Unify. 
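
A hedged sketch of how the two new APIs added by this patch are meant to compose (t1 and t2 stand for existing dictionary types, arr1 for a DictionaryArray of type t1; these are placeholder names, and error handling is written out explicitly rather than via macros):

    #include <cstdint>
    #include <memory>
    #include <vector>
    #include <arrow/api.h>

    arrow::Status UnifyAndRemap(const std::shared_ptr<arrow::DataType>& t1,
                                const std::shared_ptr<arrow::DataType>& t2,
                                const std::shared_ptr<arrow::Array>& arr1,
                                std::shared_ptr<arrow::Array>* out) {
      // Compute the unified dictionary type plus one transpose map per input.
      std::shared_ptr<arrow::DataType> unified_type;
      std::vector<std::vector<int32_t>> transpose_maps;
      arrow::Status st = arrow::DictionaryType::Unify(
          arrow::default_memory_pool(), {t1.get(), t2.get()},
          &unified_type, &transpose_maps);
      if (!st.ok()) {
        return st;
      }
      // Re-encode arr1's indices against the unified dictionary. The tests in
      // this patch use internal::checked_cast; a plain static_cast suffices in
      // a sketch where arr1 is already known to be dictionary-encoded.
      const auto& dict_arr = static_cast<const arrow::DictionaryArray&>(*arr1);
      return dict_arr.Transpose(arrow::default_memory_pool(), unified_type,
                                transpose_maps[0], out);
    }
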
+ /// + /// \param[in] pool a pool to allocate the array data from + /// \param[in] type a dictionary type + /// \param[in] transpose_map a vector transposing this array's indices + /// into the target array's indices + /// \param[out] out the resulting DictionaryArray instance + Status Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::vector& transpose_map, + std::shared_ptr* out) const; + // XXX Do we also want an unsafe in-place Transpose? + std::shared_ptr indices() const; std::shared_ptr dictionary() const; diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 0891e4c0829f4..e534c3cadb14b 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -19,6 +19,9 @@ #include #include +#include +#include +#include #include #include @@ -30,11 +33,117 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// DictionaryType unification + +struct UnifyDictionaryValues { + MemoryPool* pool_; + std::shared_ptr value_type_; + const std::vector& types_; + std::shared_ptr* out_values_; + std::vector>* out_transpose_maps_; + + Status Visit(const DataType&, void* = nullptr) { + // Default implementation for non-dictionary-supported datatypes + std::stringstream ss; + ss << "Unification of " << value_type_->ToString() + << " dictionaries is not implemented"; + return Status::NotImplemented(ss.str()); + } + + template + Status Visit(const T&, + typename internal::DictionaryTraits::MemoTableType* = nullptr) { + using ArrayType = typename TypeTraits::ArrayType; + using DictTraits = typename internal::DictionaryTraits; + using MemoTableType = typename DictTraits::MemoTableType; + + MemoTableType memo_table; + if (out_transpose_maps_ != nullptr) { + out_transpose_maps_->clear(); + out_transpose_maps_->reserve(types_.size()); + } + // Build up the unified dictionary values and the transpose maps + for (const auto& type : types_) { + const ArrayType& values = checked_cast(*type->dictionary()); + if (out_transpose_maps_ != nullptr) { + std::vector transpose_map; + transpose_map.reserve(values.length()); + for (int64_t i = 0; i < values.length(); ++i) { + int32_t dict_index = memo_table.GetOrInsert(values.GetView(i)); + transpose_map.push_back(dict_index); + } + out_transpose_maps_->push_back(std::move(transpose_map)); + } else { + for (int64_t i = 0; i < values.length(); ++i) { + memo_table.GetOrInsert(values.GetView(i)); + } + } + } + // Build unified dictionary array + std::shared_ptr data; + RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table, + 0 /* start_offset */, &data)); + *out_values_ = MakeArray(data); + return Status::OK(); + } +}; + +Status DictionaryType::Unify(MemoryPool* pool, const std::vector& types, + std::shared_ptr* out_type, + std::vector>* out_transpose_maps) { + if (types.size() == 0) { + return Status::Invalid("need at least one input type"); + } + std::vector dict_types; + dict_types.reserve(types.size()); + for (const auto& type : types) { + if (type->id() != Type::DICTIONARY) { + return Status::TypeError("input types must be dictionary types"); + } + dict_types.push_back(checked_cast(type)); + } + + // XXX Should we check the ordered flag? 
+ auto value_type = dict_types[0]->dictionary()->type(); + for (const auto& type : dict_types) { + auto values = type->dictionary(); + if (!values->type()->Equals(value_type)) { + return Status::TypeError("input types have different value types"); + } + if (values->null_count() != 0) { + return Status::TypeError("input types have null values"); + } + } + + std::shared_ptr values; + { + UnifyDictionaryValues visitor{pool, value_type, dict_types, &values, + out_transpose_maps}; + RETURN_NOT_OK(VisitTypeInline(*value_type, &visitor)); + } + + // Build unified dictionary type with the right index type + std::shared_ptr index_type; + if (values->length() <= std::numeric_limits::max()) { + index_type = int8(); + } else if (values->length() <= std::numeric_limits::max()) { + index_type = int16(); + } else if (values->length() <= std::numeric_limits::max()) { + index_type = int32(); + } else { + index_type = int64(); + } + *out_type = arrow::dictionary(index_type, values); + return Status::OK(); +} + // ---------------------------------------------------------------------- // DictionaryBuilder @@ -118,12 +227,31 @@ Status DictionaryBuilder::AppendNull() { return values_builder_.Append template Status DictionaryBuilder::AppendArray(const Array& array) { - const auto& numeric_array = checked_cast&>(array); + using ArrayType = typename TypeTraits::ArrayType; + + const auto& concrete_array = checked_cast(array); for (int64_t i = 0; i < array.length(); i++) { if (array.IsNull(i)) { RETURN_NOT_OK(AppendNull()); } else { - RETURN_NOT_OK(Append(numeric_array.Value(i))); + RETURN_NOT_OK(Append(concrete_array.GetView(i))); + } + } + return Status::OK(); +} + +template <> +Status DictionaryBuilder::AppendArray(const Array& array) { + if (!type_->Equals(*array.type())) { + return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); + } + + const auto& typed_array = checked_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + RETURN_NOT_OK(AppendNull()); + } else { + RETURN_NOT_OK(Append(typed_array.GetValue(i))); } } return Status::OK(); @@ -168,46 +296,6 @@ Status DictionaryBuilder::FinishInternal(std::shared_ptr* o return Status::OK(); } -// -// StringType and BinaryType specializations -// - -#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \ - \ - template <> \ - Status DictionaryBuilder::AppendArray(const Array& array) { \ - using ArrayType = typename TypeTraits::ArrayType; \ - const ArrayType& binary_array = checked_cast(array); \ - for (int64_t i = 0; i < array.length(); i++) { \ - if (array.IsNull(i)) { \ - RETURN_NOT_OK(AppendNull()); \ - } else { \ - RETURN_NOT_OK(Append(binary_array.GetView(i))); \ - } \ - } \ - return Status::OK(); \ - } - -BINARY_DICTIONARY_SPECIALIZATIONS(StringType); -BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType); - -template <> -Status DictionaryBuilder::AppendArray(const Array& array) { - if (!type_->Equals(*array.type())) { - return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); - } - - const auto& typed_array = checked_cast(array); - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsNull(i)) { - RETURN_NOT_OK(AppendNull()); - } else { - RETURN_NOT_OK(Append(typed_array.GetValue(i))); - } - } - return Status::OK(); -} - template class DictionaryBuilder; template class DictionaryBuilder; template class DictionaryBuilder; diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index e0a10690c2c77..20b7aff884b7f 100644 --- a/cpp/src/arrow/type-test.cc +++ 
b/cpp/src/arrow/type-test.cc @@ -24,6 +24,8 @@ #include +#include "arrow/memory_pool.h" +#include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -480,6 +482,132 @@ TEST(TestStructType, GetChildIndex) { ASSERT_EQ(-1, struct_type.GetChildIndex("not-found")); } +TEST(TestDictionaryType, Equals) { + auto t1 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t2 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t3 = dictionary(int16(), ArrayFromJSON(int32(), "[3, 4, 5, 6]")); + auto t4 = dictionary(int8(), ArrayFromJSON(int16(), "[3, 4, 5, 6]")); + auto t5 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 7, 6]")); + + ASSERT_TRUE(t1->Equals(t2)); + // Different index type + ASSERT_FALSE(t1->Equals(t3)); + // Different value type + ASSERT_FALSE(t1->Equals(t4)); + // Different values + ASSERT_FALSE(t1->Equals(t5)); +} + +TEST(TestDictionaryType, UnifyNumeric) { + auto t1 = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7]")); + auto t2 = dictionary(int8(), ArrayFromJSON(int64(), "[1, 7, 4, 8]")); + auto t3 = dictionary(int8(), ArrayFromJSON(int64(), "[1, -200]")); + + auto expected = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7, 1, 8, -200]")); + + std::shared_ptr dict_type; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()}, + &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()}, + &dict_type, &transpose_maps)); + ASSERT_TRUE(dict_type->Equals(expected)); + ASSERT_EQ(transpose_maps.size(), 3); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1, 2})); + ASSERT_EQ(transpose_maps[1], std::vector({3, 2, 1, 4})); + ASSERT_EQ(transpose_maps[2], std::vector({3, 5})); +} + +TEST(TestDictionaryType, UnifyString) { + auto t1 = dictionary(int16(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\"]")); + auto t2 = dictionary(int32(), ArrayFromJSON(utf8(), "[\"quux\", \"foo\"]")); + + auto expected = + dictionary(int8(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"quux\"]")); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type, + &transpose_maps)); + ASSERT_TRUE(dict_type->Equals(expected)); + + ASSERT_EQ(transpose_maps.size(), 2); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1})); + ASSERT_EQ(transpose_maps[1], std::vector({2, 0})); +} + +TEST(TestDictionaryType, UnifyFixedSizeBinary) { + auto type = fixed_size_binary(3); + + std::string data = "foobarbazqux"; + auto buf = std::make_shared(data); + // ["foo", "bar"] + auto dict1 = std::make_shared(type, 2, SliceBuffer(buf, 0, 6)); + auto t1 = dictionary(int16(), dict1); + // ["bar", "baz", "qux"] + auto dict2 = std::make_shared(type, 3, SliceBuffer(buf, 3, 9)); + auto t2 = dictionary(int16(), dict2); + + // ["foo", "bar", "baz", "qux"] + auto expected_dict = std::make_shared(type, 4, buf); + auto expected = dictionary(int8(), expected_dict); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); + + std::vector> transpose_maps; + ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type, + &transpose_maps)); + 
ASSERT_TRUE(dict_type->Equals(expected)); + ASSERT_EQ(transpose_maps.size(), 2); + ASSERT_EQ(transpose_maps[0], std::vector({0, 1})); + ASSERT_EQ(transpose_maps[1], std::vector({1, 2, 3})); +} + +TEST(TestDictionaryType, UnifyLarge) { + // Unifying "large" dictionary types should choose the right index type + std::shared_ptr dict1, dict2, expected_dict; + + Int32Builder builder; + ASSERT_OK(builder.Reserve(120)); + for (int32_t i = 0; i < 120; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&dict1)); + ASSERT_EQ(dict1->length(), 120); + auto t1 = dictionary(int8(), dict1); + + ASSERT_OK(builder.Reserve(30)); + for (int32_t i = 110; i < 140; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&dict2)); + ASSERT_EQ(dict2->length(), 30); + auto t2 = dictionary(int8(), dict2); + + ASSERT_OK(builder.Reserve(140)); + for (int32_t i = 0; i < 140; ++i) { + builder.UnsafeAppend(i); + } + ASSERT_OK(builder.Finish(&expected_dict)); + ASSERT_EQ(expected_dict->length(), 140); + // int8 would be too narrow to hold all possible index values + auto expected = dictionary(int16(), expected_dict); + + std::shared_ptr dict_type; + ASSERT_OK( + DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type)); + ASSERT_TRUE(dict_type->Equals(expected)); +} + TEST(TypesTest, TestDecimal128Small) { Decimal128Type t1(8, 4); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 5f1ca8d7b0f09..753cb65ff26da 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -260,7 +260,12 @@ DictionaryType::DictionaryType(const std::shared_ptr& index_type, : FixedWidthType(Type::DICTIONARY), index_type_(index_type), dictionary_(dictionary), - ordered_(ordered) {} + ordered_(ordered) { +#ifndef NDEBUG + const auto& int_type = checked_cast(*index_type); + DCHECK_EQ(int_type.is_signed(), true) << "dictionary index type should be signed"; +#endif +} int DictionaryType::bit_width() const { return checked_cast(*index_type_).bit_width(); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 9694202b9705c..8f6cfd6ced4ff 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -39,6 +39,7 @@ namespace arrow { class Array; class Field; +class MemoryPool; struct Type { /// \brief Main data type enumeration @@ -768,6 +769,23 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { bool ordered() const { return ordered_; } + /// \brief Unify several dictionary types + /// + /// Compute a resulting dictionary that will allow the union of values + /// of all input dictionary types. The input types must all have the + /// same value type. + /// \param[in] pool Memory pool to allocate dictionary values from + /// \param[in] types A sequence of input dictionary types + /// \param[out] out_type The unified dictionary type + /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors, + /// one per input type. Each integer vector represents the transposition + /// of input type indices into unified type indices. + // XXX Should we return something special (an empty transpose map?) when + // the transposition is the identity function? 
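
To make the transpose-map semantics concrete, the values from the UnifyNumeric test earlier in this patch work out as follows:

    // Input dictionaries:              Unified dictionary:
    //   t1: [3, 4, 7]                    [3, 4, 7, 1, 8, -200]
    //   t2: [1, 7, 4, 8]
    //   t3: [1, -200]
    //
    // transpose_maps[i][k] is the unified index of value k of input i:
    //   transpose_maps[0] == {0, 1, 2}     // 3 -> 0, 4 -> 1, 7 -> 2
    //   transpose_maps[1] == {3, 2, 1, 4}  // 1 -> 3, 7 -> 2, 4 -> 1, 8 -> 4
    //   transpose_maps[2] == {3, 5}        // 1 -> 3, -200 -> 5
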
+ static Status Unify(MemoryPool* pool, const std::vector& types, + std::shared_ptr* out_type, + std::vector>* out_transpose_maps = NULLPTR); + private: // Must be an integer type (not currently checked) std::shared_ptr index_type_; diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index ee368fb4e314c..76724b2a30035 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -651,25 +651,6 @@ template struct HashTraits> { using c_type = typename T::c_type; using MemoTableType = SmallScalarMemoTable; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. - // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } }; template @@ -677,25 +658,6 @@ struct HashTraits< T, typename std::enable_if::value && !is_8bit_int::value>::type> { using c_type = typename T::c_type; using MemoTableType = ScalarMemoTable; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. 
- // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } }; template diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc index 018eeda7248a3..5eba531d874e0 100644 --- a/cpp/src/arrow/util/int-util-test.cc +++ b/cpp/src/arrow/util/int-util-test.cc @@ -373,5 +373,14 @@ TEST(IntWidth, NullsMany) { } } +TEST(TransposeInts, Int8ToInt64) { + std::vector src = {1, 3, 5, 0, 3, 2}; + std::vector transpose_map = {1111, 2222, 3333, 4444, 5555, 6666, 7777}; + std::vector dest(src.size()); + + TransposeInts(src.data(), dest.data(), 6, transpose_map.data()); + ASSERT_EQ(dest, std::vector({2222, 4444, 6666, 1111, 4444, 3333})); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/int-util.cc b/cpp/src/arrow/util/int-util.cc index ced1cd1c20da2..d81044b3cafdc 100644 --- a/cpp/src/arrow/util/int-util.cc +++ b/cpp/src/arrow/util/int-util.cc @@ -402,5 +402,45 @@ void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) { memcpy(dest, source, length * sizeof(int64_t)); } +template +void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length, + const int32_t* transpose_map) { + while (length >= 4) { + dest[0] = static_cast(transpose_map[src[0]]); + dest[1] = static_cast(transpose_map[src[1]]); + dest[2] = static_cast(transpose_map[src[2]]); + dest[3] = static_cast(transpose_map[src[3]]); + length -= 4; + src += 4; + dest += 4; + } + while (length > 0) { + *dest++ = static_cast(transpose_map[*src++]); + --length; + } +} + +#define INSTANTIATE(SRC, DEST) \ + template ARROW_EXPORT void TransposeInts( \ + const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map); + +#define INSTANTIATE_ALL_DEST(DEST) \ + INSTANTIATE(int8_t, DEST) \ + INSTANTIATE(int16_t, DEST) \ + INSTANTIATE(int32_t, DEST) \ + INSTANTIATE(int64_t, DEST) + +#define INSTANTIATE_ALL() \ + INSTANTIATE_ALL_DEST(int8_t) \ + INSTANTIATE_ALL_DEST(int16_t) \ + INSTANTIATE_ALL_DEST(int32_t) \ + INSTANTIATE_ALL_DEST(int64_t) + +INSTANTIATE_ALL() + +#undef INSTANTIATE +#undef INSTANTIATE_ALL +#undef INSTANTIATE_ALL_DEST + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/int-util.h b/cpp/src/arrow/util/int-util.h index 68355d34549ac..66d389e5f40cf 100644 --- a/cpp/src/arrow/util/int-util.h +++ b/cpp/src/arrow/util/int-util.h @@ -63,6 +63,10 @@ void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length); ARROW_EXPORT void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length); +template +ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length, + const int32_t* transpose_map); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index b6fc1f1ff2bfb..a5deaa7a1d22c 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -121,7 +121,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) { // The scalar value's type depends on the array data type: // - the type's `c_type`, if any // - for boolean arrays, a `bool` -// - for binary, string and fixed-size binary arrars, a `util::string_view` +// - 
for binary, string and fixed-size binary arrays, a `util::string_view` template struct ArrayDataVisitor {}; diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 310656d86fd47..af2d1139c43fe 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -303,8 +303,8 @@ def test_dictionary_type(): assert ty0.dictionary.to_pylist() == ['a', 'b', 'c'] assert ty0.ordered is False - ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True) - assert ty1.index_type == pa.float32() + ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True) + assert ty1.index_type == pa.int8() assert isinstance(ty0.dictionary, pa.Array) assert ty1.dictionary.to_pylist() == [1.0, 2.0] assert ty1.ordered is True From e39e36441b94f211a57685887d14a8ff1d1b5f98 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 19 Dec 2018 15:51:08 -0600 Subject: [PATCH 39/80] ARROW-3545: [C++/Python] Use "field" terminology with StructType, specify behavior with duplicate field names Author: Wes McKinney Closes #3220 from wesm/ARROW-3545 and squashes the following commits: dc212e61c Fix more deprecated API uses 16e198473 Remove field_by_name/field APIs from Python bindings, cdef only 3c4abed05 Fix use of deprecated APIs 2eecdbf57 Rename GetChildIndex, GetChildByName for better semantic consistency. Define behavior of these functions when there are duplicate field names. Reflect changes in Python --- cpp/src/arrow/array.cc | 2 +- cpp/src/arrow/type-test.cc | 30 ++++++++++++++++--------- cpp/src/arrow/type.cc | 33 +++++++++++++++++++++++++--- cpp/src/arrow/type.h | 9 +++++++- python/pyarrow/includes/libarrow.pxd | 4 ++-- python/pyarrow/lib.pxd | 3 ++- python/pyarrow/scalar.pxi | 2 +- python/pyarrow/tests/test_types.py | 17 ++++++++++---- python/pyarrow/types.pxi | 12 +++++++--- 9 files changed, 86 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 7e45e90d9c8f7..d07c27fe15906 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -395,7 +395,7 @@ std::shared_ptr StructArray::field(int i) const { } std::shared_ptr StructArray::GetFieldByName(const std::string& name) const { - int i = struct_type()->GetChildIndex(name); + int i = struct_type()->GetFieldIndex(name); return i == -1 ? nullptr : field(i); } diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 20b7aff884b7f..5b758d7a129fd 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -448,7 +448,7 @@ TEST(TestStructType, Basics) { // TODO(wesm): out of bounds for field(...) 
} -TEST(TestStructType, GetChildByName) { +TEST(TestStructType, GetFieldByName) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); @@ -457,17 +457,17 @@ TEST(TestStructType, GetChildByName) { StructType struct_type({f0, f1, f2, f3}); std::shared_ptr result; - result = struct_type.GetChildByName("f1"); + result = struct_type.GetFieldByName("f1"); ASSERT_EQ(f1, result); - result = struct_type.GetChildByName("f3"); + result = struct_type.GetFieldByName("f3"); ASSERT_EQ(f3, result); - result = struct_type.GetChildByName("not-found"); + result = struct_type.GetFieldByName("not-found"); ASSERT_EQ(result, nullptr); } -TEST(TestStructType, GetChildIndex) { +TEST(TestStructType, GetFieldIndex) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); @@ -475,11 +475,21 @@ TEST(TestStructType, GetChildIndex) { StructType struct_type({f0, f1, f2, f3}); - ASSERT_EQ(0, struct_type.GetChildIndex(f0->name())); - ASSERT_EQ(1, struct_type.GetChildIndex(f1->name())); - ASSERT_EQ(2, struct_type.GetChildIndex(f2->name())); - ASSERT_EQ(3, struct_type.GetChildIndex(f3->name())); - ASSERT_EQ(-1, struct_type.GetChildIndex("not-found")); + ASSERT_EQ(0, struct_type.GetFieldIndex(f0->name())); + ASSERT_EQ(1, struct_type.GetFieldIndex(f1->name())); + ASSERT_EQ(2, struct_type.GetFieldIndex(f2->name())); + ASSERT_EQ(3, struct_type.GetFieldIndex(f3->name())); + ASSERT_EQ(-1, struct_type.GetFieldIndex("not-found")); +} + +TEST(TestStructType, GetFieldIndexDuplicates) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int64()); + auto f2 = field("f1", utf8()); + StructType struct_type({f0, f1, f2}); + + ASSERT_EQ(0, struct_type.GetFieldIndex("f0")); + ASSERT_EQ(-1, struct_type.GetFieldIndex("f1")); } TEST(TestDictionaryType, Equals) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 753cb65ff26da..ee7fda7c8c8f4 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -232,18 +232,37 @@ std::string StructType::ToString() const { return s.str(); } -std::shared_ptr StructType::GetChildByName(const std::string& name) const { - int i = GetChildIndex(name); +std::shared_ptr StructType::GetFieldByName(const std::string& name) const { + int i = GetFieldIndex(name); return i == -1 ? nullptr : children_[i]; } -int StructType::GetChildIndex(const std::string& name) const { +int StructType::GetFieldIndex(const std::string& name) const { if (children_.size() > 0 && name_to_index_.size() == 0) { for (size_t i = 0; i < children_.size(); ++i) { name_to_index_[children_[i]->name()] = static_cast(i); } } + if (name_to_index_.size() < children_.size()) { + // There are duplicate field names. 
Refuse to guess + int counts = 0; + int last_observed_index = -1; + for (size_t i = 0; i < children_.size(); ++i) { + if (children_[i]->name() == name) { + ++counts; + last_observed_index = static_cast(i); + } + } + + if (counts == 1) { + return last_observed_index; + } else { + // Duplicate or not found + return -1; + } + } + auto it = name_to_index_.find(name); if (it == name_to_index_.end()) { return -1; @@ -252,6 +271,14 @@ int StructType::GetChildIndex(const std::string& name) const { } } +std::shared_ptr StructType::GetChildByName(const std::string& name) const { + return GetFieldByName(name); +} + +int StructType::GetChildIndex(const std::string& name) const { + return GetFieldIndex(name); +} + // ---------------------------------------------------------------------- // DictionaryType diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 8f6cfd6ced4ff..95b5189de0343 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -516,9 +516,16 @@ class ARROW_EXPORT StructType : public NestedType { std::string name() const override { return "struct"; } /// Returns null if name not found + std::shared_ptr GetFieldByName(const std::string& name) const; + + /// Returns -1 if name not found or if there are multiple fields having the + /// same name + int GetFieldIndex(const std::string& name) const; + + ARROW_DEPRECATED("Use GetFieldByName") std::shared_ptr GetChildByName(const std::string& name) const; - /// Returns -1 if name not found + ARROW_DEPRECATED("Use GetChildIndex") int GetChildIndex(const std::string& name) const; private: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 61517e4f09d21..f4629af0617fb 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -276,8 +276,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) - shared_ptr[CField] GetChildByName(const c_string& name) - int GetChildIndex(const c_string& name) + shared_ptr[CField] GetFieldByName(const c_string& name) + int GetFieldIndex(const c_string& name) cdef cppclass CUnionType" arrow::UnionType"(CDataType): CUnionType(const vector[shared_ptr[CField]]& fields, diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 3e628263ba36f..d829d6a0c50ad 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -65,7 +65,8 @@ cdef class StructType(DataType): cdef: const CStructType* struct_type - cdef Field child_by_name(self, name) + cdef Field field(self, int i) + cdef Field field_by_name(self, name) cdef class DictionaryType(DataType): diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index a2a133beb43f6..fd3f58072d452 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -470,7 +470,7 @@ cdef class StructValue(ArrayValue): int index type = self.type.type - index = type.GetChildIndex(tobytes(key)) + index = type.GetFieldIndex(tobytes(key)) if index < 0: raise KeyError(key) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index af2d1139c43fe..729c76e1471f5 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -231,9 +231,12 @@ def test_list_type(): def test_struct_type(): - fields = [pa.field('a', pa.int64()), - pa.field('a', pa.int32()), - pa.field('b', pa.int32())] + fields = [ + # Duplicate field name on purpose + pa.field('a', pa.int64()), + pa.field('a', pa.int32()), + 
pa.field('b', pa.int32()) + ] ty = pa.struct(fields) assert len(ty) == ty.num_children == 3 @@ -243,11 +246,17 @@ def test_struct_type(): with pytest.raises(IndexError): assert ty[3] - assert ty['a'] == ty[1] assert ty['b'] == ty[2] + + # Duplicate + with pytest.raises(KeyError): + ty['a'] + + # Not found with pytest.raises(KeyError): ty['c'] + # Neither integer nor string with pytest.raises(TypeError): ty[None] diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index d367a8a85673f..29b2a1ea3c9a0 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -213,13 +213,19 @@ cdef class StructType(DataType): DataType.init(self, type) self.struct_type = type.get() - cdef Field child_by_name(self, name): + cdef Field field(self, int i): + """ + Alias for child(i) + """ + return self.child(i) + + cdef Field field_by_name(self, name): """ Access a child field by its name rather than the column index. """ cdef shared_ptr[CField] field - field = self.struct_type.GetChildByName(tobytes(name)) + field = self.struct_type.GetFieldByName(tobytes(name)) if field == nullptr: raise KeyError(name) @@ -234,7 +240,7 @@ cdef class StructType(DataType): def __getitem__(self, i): if isinstance(i, six.string_types): - return self.child_by_name(i) + return self.field_by_name(i) elif isinstance(i, six.integer_types): return self.child(i) else: From cca9d2866508030f0db6999ff3ce6d39be393bb9 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 19 Dec 2018 23:15:00 +0100 Subject: [PATCH 40/80] ARROW-3620: [Python] Document pa.cpu_count() in Sphinx API docs Author: Antoine Pitrou Closes #3224 from pitrou/ARROW-3620-document-cpu-count and squashes the following commits: 15fda9ba ARROW-3620: Document pa.cpu_count() in Sphinx API docs --- docs/source/python/api.rst | 47 +++++++++++++++++++++++--------------- docs/source/python/csv.rst | 4 ++++ python/pyarrow/lib.pyx | 7 +++--- 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index 064a3e9740543..40ccb68c36f38 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -282,21 +282,8 @@ Serialization and IPC SerializedPyObject SerializationContext -.. _api.feather: - -Feather Format -~~~~~~~~~~~~~~ - -.. currentmodule:: pyarrow.feather - .. _api.memory_pool: -.. autosummary:: - :toctree: generated/ - - read_feather - write_feather - Memory Pools ------------ @@ -329,8 +316,8 @@ Type Classes .. _api.plasma: -In-Memory Object Store ----------------------- +Plasma In-Memory Object Store +----------------------------- .. autosummary:: :toctree: generated/ @@ -354,12 +341,27 @@ CSV Files ConvertOptions read_csv -.. currentmodule:: pyarrow.parquet +.. _api.feather: + +Feather Files +------------- + +.. currentmodule:: pyarrow.feather + +.. autosummary:: + :toctree: generated/ + + read_feather + write_feather + +.. currentmodule:: pyarrow .. _api.parquet: -Apache Parquet --------------- +Parquet Files +------------- + +.. currentmodule:: pyarrow.parquet .. autosummary:: :toctree: generated/ @@ -377,6 +379,15 @@ Apache Parquet .. currentmodule:: pyarrow +Multi-Threading +--------------- + +.. autosummary:: + :toctree: generated/ + + cpu_count + set_cpu_count + Using with C extensions ----------------------- diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst index f1bcea9e24795..17023b1610d48 100644 --- a/docs/source/python/csv.rst +++ b/docs/source/python/csv.rst @@ -86,3 +86,7 @@ overhead of reading CSV files. 
Performance options can be controlled through the :class:`ReadOptions` class. Multi-threaded reading is the default for highest performance, distributing the workload efficiently over all available cores. + +.. note:: + The number of threads to use concurrently is automatically inferred by Arrow + and can be inspected using the :func:`~pyarrow.cpu_count()` function. diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 9c661dbc3554a..3fe879a319668 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -42,9 +42,10 @@ def cpu_count(): Return the number of threads to use in parallel operations. The number of threads is determined at startup by inspecting the - OMP_NUM_THREADS and OMP_THREAD_LIMIT environment variables. If neither - is present, it will default to the number of hardware threads on the - system. It can be modified at runtime by calling set_cpu_count(). + ``OMP_NUM_THREADS`` and ``OMP_THREAD_LIMIT`` environment variables. + If neither is present, it will default to the number of hardware threads + on the system. It can be modified at runtime by calling + :func:`set_cpu_count()`. """ return GetCpuThreadPoolCapacity() From 1a8c8f0b2aae01fe8357980f1f4e5c879941c0eb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 19 Dec 2018 18:01:19 -0700 Subject: [PATCH 41/80] ARROW-4038: [Rust] Implement boolean AND, OR, NOT array ops - Implements boolean AND, OR, NOT operations in `array_ops` - Removes all uses of `unwrap()` in array_ops and replaces with `?` - Improve error messages Author: Andy Grove Closes #3189 from andygrove/ARROW-4038 and squashes the following commits: 69518d7 add tests a38d9a9 add docs for all array_ops and add explicit handling for case where both sides are null 661e2af improve error message 36b9171 Implement boolean AND, OR, NOT operations, remove unwraps and improve error message --- rust/src/array_ops.rs | 175 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 169 insertions(+), 6 deletions(-) diff --git a/rust/src/array_ops.rs b/rust/src/array_ops.rs index e73a858e951b1..59145754f0248 100644 --- a/rust/src/array_ops.rs +++ b/rust/src/array_ops.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Defines primitive computations on arrays + use std::ops::{Add, Div, Mul, Sub}; use num::Zero; @@ -25,6 +27,7 @@ use crate::datatypes; use crate::datatypes::ArrowNumericType; use crate::error::{ArrowError, Result}; +/// Perform `left + right` operation on two arrays. If either left or right value is null then the result is also null. pub fn add(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> where T: datatypes::ArrowNumericType, @@ -37,6 +40,7 @@ where math_op(left, right, |a, b| Ok(a + b)) } +/// Perform `left - right` operation on two arrays. If either left or right value is null then the result is also null. pub fn subtract(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> where T: datatypes::ArrowNumericType, @@ -49,6 +53,7 @@ where math_op(left, right, |a, b| Ok(a - b)) } +/// Perform `left * right` operation on two arrays. If either left or right value is null then the result is also null. pub fn multiply(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> where T: datatypes::ArrowNumericType, @@ -61,6 +66,8 @@ where math_op(left, right, |a, b| Ok(a * b)) } +/// Perform `left / right` operation on two arrays. If either left or right value is null then the result is also null. 
+/// If any right hand value is zero then the result of this operation will be `Err(ArrowError::DivideByZero)`. pub fn divide(left: &PrimitiveArray, right: &PrimitiveArray) -> Result> where T: datatypes::ArrowNumericType, @@ -79,6 +86,8 @@ where }) } +/// Helper function to perform math lambda function on values from two arrays. If either left or +/// right value is null then the output value is also null, so `1 + null` is `null`. fn math_op( left: &PrimitiveArray, right: &PrimitiveArray, @@ -90,16 +99,16 @@ where { if left.len() != right.len() { return Err(ArrowError::ComputeError( - "Cannot perform math operation on two batches of different length".to_string(), + "Cannot perform math operation on arrays of different length".to_string(), )); } let mut b = PrimitiveArrayBuilder::::new(left.len()); for i in 0..left.len() { let index = i; if left.is_null(i) || right.is_null(i) { - b.push_null().unwrap(); + b.push_null()?; } else { - b.push(op(left.value(index), right.value(index))?).unwrap(); + b.push(op(left.value(index), right.value(index))?)?; } } Ok(b.finish()) @@ -121,6 +130,7 @@ where min_max_helper(array, |a, b| a > b) } +/// Helper function to perform min/max lambda function on values from a numeric array. fn min_max_helper(array: &PrimitiveArray, cmp: F) -> Option where T: ArrowNumericType, @@ -145,6 +155,7 @@ where n } +/// Perform `left == right` operation on two arrays. pub fn eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, @@ -152,6 +163,7 @@ where bool_op(left, right, |a, b| a == b) } +/// Perform `left != right` operation on two arrays. pub fn neq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, @@ -159,50 +171,59 @@ where bool_op(left, right, |a, b| a != b) } +/// Perform `left < right` operation on two arrays. Null values are less than non-null values. pub fn lt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, { bool_op(left, right, |a, b| match (a, b) { + (None, None) => false, (None, _) => true, (_, None) => false, (Some(aa), Some(bb)) => aa < bb, }) } +/// Perform `left <= right` operation on two arrays. Null values are less than non-null values. pub fn lt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, { bool_op(left, right, |a, b| match (a, b) { + (None, None) => true, (None, _) => true, (_, None) => false, (Some(aa), Some(bb)) => aa <= bb, }) } +/// Perform `left > right` operation on two arrays. Non-null values are greater than null values. pub fn gt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, { bool_op(left, right, |a, b| match (a, b) { + (None, None) => false, (None, _) => false, (_, None) => true, (Some(aa), Some(bb)) => aa > bb, }) } +/// Perform `left >= right` operation on two arrays. Non-null values are greater than null values. pub fn gt_eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, { bool_op(left, right, |a, b| match (a, b) { + (None, None) => true, (None, _) => false, (_, None) => true, (Some(aa), Some(bb)) => aa >= bb, }) } +/// Helper function to perform boolean lambda function on values from two arrays. 
fn bool_op(left: &PrimitiveArray, right: &PrimitiveArray, op: F) -> Result where T: ArrowNumericType, @@ -210,7 +231,7 @@ where { if left.len() != right.len() { return Err(ArrowError::ComputeError( - "Cannot perform math operation on two batches of different length".to_string(), + "Cannot perform math operation on arrays of different length".to_string(), )); } let mut b = BooleanArray::builder(left.len()); @@ -226,7 +247,56 @@ where } else { Some(right.value(index)) }; - b.push(op(l, r)).unwrap(); + b.push(op(l, r))?; + } + Ok(b.finish()) +} + +/// Perform `AND` operation on two arrays. If either left or right value is null then the result is also null. +pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result { + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform boolean operation on arrays of different length".to_string(), + )); + } + let mut b = BooleanArray::builder(left.len()); + for i in 0..left.len() { + if left.is_null(i) || right.is_null(i) { + b.push_null()?; + } else { + b.push(left.value(i) && right.value(i))?; + } + } + Ok(b.finish()) +} + +/// Perform `OR` operation on two arrays. If either left or right value is null then the result is also null. +pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform boolean operation on arrays of different length".to_string(), + )); + } + let mut b = BooleanArray::builder(left.len()); + for i in 0..left.len() { + if left.is_null(i) || right.is_null(i) { + b.push_null()?; + } else { + b.push(left.value(i) || right.value(i))?; + } + } + Ok(b.finish()) +} + +/// Perform unary `NOT` operation on an arrays. If value is null then the result is also null. +pub fn not(left: &BooleanArray) -> Result { + let mut b = BooleanArray::builder(left.len()); + for i in 0..left.len() { + if left.is_null(i) { + b.push_null()?; + } else { + b.push(!left.value(i))?; + } } Ok(b.finish()) } @@ -256,7 +326,7 @@ mod tests { .err() .expect("should have failed due to different lengths"); assert_eq!( - "ComputeError(\"Cannot perform math operation on two batches of different length\")", + "ComputeError(\"Cannot perform math operation on arrays of different length\")", format!("{:?}", e) ); } @@ -365,6 +435,16 @@ mod tests { assert_eq!(true, c.value(4)); } + #[test] + fn test_primitive_array_lt_nulls() { + let a = Int32Array::from(vec![None, None, Some(1)]); + let b = Int32Array::from(vec![None, Some(1), None]); + let c = lt(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + } + #[test] fn test_primitive_array_lt_eq() { let a = Int32Array::from(vec![8, 8, 8, 8, 8]); @@ -377,6 +457,16 @@ mod tests { assert_eq!(true, c.value(4)); } + #[test] + fn test_primitive_array_lt_eq_nulls() { + let a = Int32Array::from(vec![None, None, Some(1)]); + let b = Int32Array::from(vec![None, Some(1), None]); + let c = lt_eq(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + } + #[test] fn test_primitive_array_gt() { let a = Int32Array::from(vec![8, 8, 8, 8, 8]); @@ -389,6 +479,16 @@ mod tests { assert_eq!(false, c.value(4)); } + #[test] + fn test_primitive_array_gt_nulls() { + let a = Int32Array::from(vec![None, None, Some(1)]); + let b = Int32Array::from(vec![None, Some(1), None]); + let c = gt(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(true, c.value(2)); + } + 
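As a usage sketch (not part of the patch), the new `and`, `or` and `not` kernels added above compose as shown below; the module paths `arrow::array` and `arrow::array_ops` are assumed from this crate layout.

```rust
extern crate arrow;

use arrow::array::BooleanArray;
use arrow::array_ops::{and, not, or};

fn main() {
    // Option-valued constructor: None becomes a null slot.
    let a = BooleanArray::from(vec![Some(true), Some(false), None]);
    let b = BooleanArray::from(vec![Some(true), Some(true), Some(false)]);

    // Element-wise AND/OR; a null on either side yields a null result slot.
    let both = and(&a, &b).unwrap();
    assert_eq!(true, both.value(0));   // true && true
    assert_eq!(false, both.value(1));  // false && true
    assert!(both.is_null(2));          // null && false -> null

    let either = or(&a, &b).unwrap();
    assert_eq!(true, either.value(1)); // false || true

    // Unary NOT also propagates nulls.
    let negated = not(&a).unwrap();
    assert_eq!(false, negated.value(0));
    assert!(negated.is_null(2));
}
```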
#[test] fn test_primitive_array_gt_eq() { let a = Int32Array::from(vec![8, 8, 8, 8, 8]); @@ -401,6 +501,16 @@ mod tests { assert_eq!(false, c.value(4)); } + #[test] + fn test_primitive_array_gt_eq_nulls() { + let a = Int32Array::from(vec![None, None, Some(1)]); + let b = Int32Array::from(vec![None, Some(1), None]); + let c = gt_eq(&a, &b).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(true, c.value(2)); + } + #[test] fn test_buffer_array_min_max() { let a = Int32Array::from(vec![5, 6, 7, 8, 9]); @@ -415,4 +525,57 @@ mod tests { assert_eq!(9, max(&a).unwrap()); } + #[test] + fn test_bool_array_and() { + let a = BooleanArray::from(vec![false, false, true, true]); + let b = BooleanArray::from(vec![false, true, false, true]); + let c = and(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(false, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(true, c.value(3)); + } + + #[test] + fn test_bool_array_or() { + let a = BooleanArray::from(vec![false, false, true, true]); + let b = BooleanArray::from(vec![false, true, false, true]); + let c = or(&a, &b).unwrap(); + assert_eq!(false, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(true, c.value(2)); + assert_eq!(true, c.value(3)); + } + + #[test] + fn test_bool_array_or_nulls() { + let a = BooleanArray::from(vec![None, Some(false), None, Some(false)]); + let b = BooleanArray::from(vec![None, None, Some(false), Some(false)]); + let c = or(&a, &b).unwrap(); + assert_eq!(true, c.is_null(0)); + assert_eq!(true, c.is_null(1)); + assert_eq!(true, c.is_null(2)); + assert_eq!(false, c.is_null(3)); + } + + #[test] + fn test_bool_array_not() { + let a = BooleanArray::from(vec![false, false, true, true]); + let c = not(&a).unwrap(); + assert_eq!(true, c.value(0)); + assert_eq!(true, c.value(1)); + assert_eq!(false, c.value(2)); + assert_eq!(false, c.value(3)); + } + + #[test] + fn test_bool_array_and_nulls() { + let a = BooleanArray::from(vec![None, Some(false), None, Some(false)]); + let b = BooleanArray::from(vec![None, None, Some(false), Some(false)]); + let c = and(&a, &b).unwrap(); + assert_eq!(true, c.is_null(0)); + assert_eq!(true, c.is_null(1)); + assert_eq!(true, c.is_null(2)); + assert_eq!(false, c.is_null(3)); + } } From 729cc3d3f31ebeeab9a86ec0ed59cf4000802135 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 20 Dec 2018 18:27:01 +0900 Subject: [PATCH 42/80] ARROW-4085: [GLib] Use "field" for struct data type Because C++ API is changed to use "field" by ARROW-3545. 
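For reference (not part of this patch), the renamed C++ API from ARROW-3545 that this GLib change tracks behaves as sketched below; `arrow/api.h` is assumed as the umbrella include.

```cpp
#include <cassert>
#include <arrow/api.h>  // assumed umbrella header exposing StructType, field(), int32(), ...

int main() {
  using arrow::field;
  // "f1" is deliberately duplicated, mirroring TestStructType.GetFieldIndexDuplicates.
  arrow::StructType struct_type({field("f0", arrow::int32()),
                                 field("f1", arrow::int64()),
                                 field("f1", arrow::utf8())});

  // Name lookup returns the field, or nullptr when the name is absent.
  assert(struct_type.GetFieldByName("f0") != nullptr);
  assert(struct_type.GetFieldByName("not-found") == nullptr);

  // Index lookup returns -1 both for unknown and for ambiguous (duplicated) names.
  assert(struct_type.GetFieldIndex("f0") == 0);
  assert(struct_type.GetFieldIndex("f1") == -1);  // duplicate name: refuses to guess
  assert(struct_type.GetFieldIndex("not-found") == -1);

  // The old GetChildByName/GetChildIndex still compile but are now deprecated aliases.
  return 0;
}
```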
Author: Kouhei Sutou Closes #3229 from kou/glib-use-field and squashes the following commits: c078e31f Use "field" for struct data type --- c_glib/arrow-glib/composite-data-type.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index a4d3d843617a0..599506f269c8c 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -230,7 +230,7 @@ garrow_struct_data_type_get_field_by_name(GArrowStructDataType *data_type, auto arrow_struct_data_type = std::static_pointer_cast(arrow_data_type); - auto arrow_field = arrow_struct_data_type->GetChildByName(name); + auto arrow_field = arrow_struct_data_type->GetFieldByName(name); if (arrow_field) { return garrow_field_new_raw(&arrow_field); } else { @@ -256,7 +256,7 @@ garrow_struct_data_type_get_field_index(GArrowStructDataType *data_type, auto arrow_struct_data_type = std::static_pointer_cast(arrow_data_type); - return arrow_struct_data_type->GetChildIndex(name); + return arrow_struct_data_type->GetFieldIndex(name); } From c39db631f74e617b5317a64997364ea61c82c5f1 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 20 Dec 2018 08:36:54 -0600 Subject: [PATCH 43/80] ARROW-4082: [C++] Allow RelWithDebInfo, improve FindClangTools SetupCxxFlags.cmake does not list "RELWITHDEBINFO" in the final flag setup, so cmake will error out if that build config is selected. It's handy for quick debugging without switching your python build etc over to "DEBUG". FindClangTools.cmake could check the version of 'clang-format' (no version suffix) to see if it satisfies a version requirement. Also the doccomment lists the incorrect variable name for the hint path Author: Benjamin Kietzman Closes #3227 from bkietz/ARROW-4082-tweak-cmake and squashes the following commits: 15526cf01 allow RelWithDebInfo, improve FindClangTools --- cpp/README.md | 6 ++++++ cpp/cmake_modules/FindClangTools.cmake | 29 ++++++++++++++++++++++---- cpp/cmake_modules/SetupCxxFlags.cmake | 1 + 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index 5940db1f44301..b602bef1c7710 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -428,6 +428,12 @@ You may find the required packages at http://releases.llvm.org/download.html or use the Debian/Ubuntu APT repositories on https://apt.llvm.org/. On macOS with [Homebrew][1] you can get it via `brew install llvm@6`. +Depending on how you installed clang-format, the build system may not be able +to find it. You can provide an explicit path to your LLVM installation (or the +root path for the clang tools) with the environment variable +`$CLANG_TOOLS_PATH` or by passing `-DClangTools_PATH=$PATH_TO_CLANG_TOOLS` when +invoking CMake. 
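As a concrete example (not part of the patch), assuming the Homebrew `llvm@6` install mentioned above; substitute the `bin/` directory of your own LLVM or clang-tools installation:

```shell
# Hint via environment variable (path shown is an assumption for Homebrew's llvm@6 keg)
export CLANG_TOOLS_PATH=/usr/local/opt/llvm@6/bin
cmake ..

# ...or pass the hint directly when invoking CMake
cmake -DClangTools_PATH=/usr/local/opt/llvm@6/bin ..
```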
+ ## Checking for ABI and API stability To build ABI compliance reports, you need to install the two tools diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index 2ddf7880ceb43..62ee8c3b6dd4a 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -20,7 +20,7 @@ # Variables used by this module, they can change the default behaviour and need # to be set before calling find_package: # -# ClangToolsBin_HOME - +# ClangTools_PATH - # When set, this path is inspected instead of standard library binary locations # to find clang-tidy and clang-format # @@ -75,10 +75,11 @@ if (CLANG_FORMAT_VERSION) ) # If not found yet, search alternative locations - if (("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") AND APPLE) + if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND") + STRING(REGEX REPLACE "^([0-9]+)\\.[0-9]+" "\\1" CLANG_FORMAT_MAJOR_VERSION "${CLANG_FORMAT_VERSION}") + STRING(REGEX REPLACE "^[0-9]+\\.([0-9]+)" "\\1" CLANG_FORMAT_MINOR_VERSION "${CLANG_FORMAT_VERSION}") + if (APPLE) # Homebrew ships older LLVM versions in /usr/local/opt/llvm@version/ - STRING(REGEX REPLACE "^([0-9]+)\\.[0-9]+" "\\1" CLANG_FORMAT_MAJOR_VERSION "${CLANG_FORMAT_VERSION}") - STRING(REGEX REPLACE "^[0-9]+\\.([0-9]+)" "\\1" CLANG_FORMAT_MINOR_VERSION "${CLANG_FORMAT_VERSION}") if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0") find_program(CLANG_FORMAT_BIN NAMES clang-format @@ -102,7 +103,27 @@ if (CLANG_FORMAT_VERSION) NO_DEFAULT_PATH ) endif() + else() + # try searching for "clang-format" and check the version + find_program(CLANG_FORMAT_BIN + NAMES clang-format + PATHS + ${ClangTools_PATH} + $ENV{CLANG_TOOLS_PATH} + /usr/local/bin /usr/bin + NO_DEFAULT_PATH + ) + if (NOT ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND")) + execute_process(COMMAND ${CLANG_FORMAT_BIN} "-version" + OUTPUT_VARIABLE CLANG_FORMAT_FOUND_VERSION_MESSAGE + OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT ("${CLANG_FORMAT_FOUND_VERSION_MESSAGE}" MATCHES "^clang-format version ${CLANG_FORMAT_MAJOR_VERSION}\\.${CLANG_FORMAT_MINOR_VERSION}.*")) + set(CLANG_FORMAT_BIN "CLANG_FORMAT_BIN-NOTFOUND") + endif() + endif() + endif() endif() + else() find_program(CLANG_FORMAT_BIN NAMES clang-format-4.0 diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 61fd14ca2cf46..11608350c5f7a 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -365,6 +365,7 @@ message("Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_ if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_DEBUG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_DEBUG}") +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "RELWITHDEBINFO") elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_FASTDEBUG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_FASTDEBUG}") From ce9c6e3914274dcaf7806159ea5373e0cb632727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 20 Dec 2018 08:51:31 -0600 Subject: [PATCH 44/80] ARROW-4084: [C++] Make Status static method support variadic arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Static constructors like `Status::Invalid` now supports variadic arguments à la `Status::Invalid("my", variable, "error message: ", i)`. 
- A new macro was added `ARROW_RETURN_IF(cond, status)` which replaces the previous `ARROW_RETURN_IF_FALSE` but also adds branch prediction hints. Note that only gandiva was refactored with this macro as otherwise the code review would have exploded. - Fixed a bug in memory map implementations not checking the return code of `mmap` and `mremap`. Author: François Saint-Jacques Closes #3228 from fsaintjacques/ARROW-4084-variadic-status-message and squashes the following commits: a877ab994 Travis 890df68f9 Remove gandiva expect string message testing 71ecbae7d Use perfect forwarding. 774bf9387 Add missing string header bf5cdfe06 Removed code printing in status 1d1db49c4 Reformat d9fcad919 ARROW-4084: Make Status static method support variadic arguments --- cpp/src/arrow/adapters/orc/adapter.cc | 22 +-- cpp/src/arrow/array.cc | 60 +++---- cpp/src/arrow/array/builder_binary.cc | 21 +-- cpp/src/arrow/array/builder_nested.cc | 12 +- cpp/src/arrow/builder.cc | 5 +- cpp/src/arrow/compute/kernels/cast.cc | 51 +++--- cpp/src/arrow/compute/kernels/hash.cc | 8 +- cpp/src/arrow/csv/converter.cc | 23 +-- cpp/src/arrow/csv/parser.cc | 4 +- cpp/src/arrow/csv/reader.cc | 6 +- .../arrow/dbi/hiveserver2/hiveserver2-test.cc | 6 +- cpp/src/arrow/dbi/hiveserver2/service.cc | 4 +- .../arrow/dbi/hiveserver2/thrift-internal.cc | 6 +- cpp/src/arrow/flight/internal.cc | 11 +- cpp/src/arrow/gpu/cuda_arrow_ipc.cc | 5 +- cpp/src/arrow/gpu/cuda_common.h | 16 +- cpp/src/arrow/io/file-test.cc | 4 +- cpp/src/arrow/io/file.cc | 4 +- cpp/src/arrow/io/hdfs-internal.cc | 8 +- cpp/src/arrow/io/hdfs.cc | 34 ++-- cpp/src/arrow/ipc/dictionary.cc | 8 +- cpp/src/arrow/ipc/feather.cc | 4 +- cpp/src/arrow/ipc/json-integration-test.cc | 10 +- cpp/src/arrow/ipc/json-internal.cc | 42 ++--- cpp/src/arrow/ipc/json-internal.h | 63 +++---- cpp/src/arrow/ipc/json-simple.cc | 42 ++--- cpp/src/arrow/ipc/message.cc | 31 ++-- cpp/src/arrow/ipc/metadata-internal.cc | 8 +- cpp/src/arrow/ipc/reader.cc | 29 +-- cpp/src/arrow/memory_pool.cc | 20 +-- cpp/src/arrow/python/arrow_to_pandas.cc | 114 +++++------- cpp/src/arrow/python/common.h | 6 +- cpp/src/arrow/python/decimal.cc | 8 +- cpp/src/arrow/python/helpers.cc | 24 +-- cpp/src/arrow/python/inference.cc | 17 +- cpp/src/arrow/python/numpy-internal.h | 5 +- cpp/src/arrow/python/numpy_convert.cc | 12 +- cpp/src/arrow/python/numpy_to_arrow.cc | 19 +- cpp/src/arrow/python/python_to_arrow.cc | 15 +- cpp/src/arrow/python/serialize.cc | 7 +- cpp/src/arrow/python/util/datetime.h | 4 +- cpp/src/arrow/record_batch.cc | 26 ++- cpp/src/arrow/status.cc | 1 + cpp/src/arrow/status.h | 155 +++++++++------- cpp/src/arrow/table.cc | 52 ++---- cpp/src/arrow/util/compression_brotli.cc | 4 +- cpp/src/arrow/util/compression_lz4.cc | 16 +- cpp/src/arrow/util/compression_snappy.cc | 6 +- cpp/src/arrow/util/compression_zlib.cc | 54 ++---- cpp/src/arrow/util/compression_zstd.cc | 4 +- cpp/src/arrow/util/decimal.cc | 20 +-- cpp/src/arrow/util/decimal.h | 5 +- cpp/src/arrow/util/io-util.cc | 54 +++--- cpp/src/arrow/util/string_builder.h | 51 ++++++ cpp/src/gandiva/date_utils.cc | 19 +- cpp/src/gandiva/engine.cc | 29 ++- cpp/src/gandiva/expr_validator.cc | 168 ++++++++---------- cpp/src/gandiva/filter.cc | 59 +++--- cpp/src/gandiva/like_holder.cc | 37 ++-- cpp/src/gandiva/llvm_generator.cc | 38 ++-- cpp/src/gandiva/projector.cc | 120 +++++-------- cpp/src/gandiva/regex_util.cc | 14 +- cpp/src/gandiva/selection_vector.cc | 82 ++++----- .../tests/projector_build_validation_test.cc | 13 +- cpp/src/parquet/arrow/reader.cc | 11 +- 
cpp/src/parquet/arrow/schema.cc | 28 ++- cpp/src/parquet/arrow/writer.cc | 11 +- cpp/src/plasma/io.cc | 10 +- 68 files changed, 763 insertions(+), 1122 deletions(-) create mode 100644 cpp/src/arrow/util/string_builder.h diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index de803d5ba6f03..01fc09afb0c92 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -206,11 +206,7 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { *out = union_(fields, type_codes); break; } - default: { - std::stringstream ss; - ss << "Unknown Orc type kind: " << kind; - return Status::Invalid(ss.str()); - } + default: { return Status::Invalid("Unknown Orc type kind: ", kind); } } return Status::OK(); } @@ -346,11 +342,9 @@ class ORCFileReader::Impl { } Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) { - if (stripe < 0 || stripe >= NumberOfStripes()) { - std::stringstream ss; - ss << "Out of bounds stripe: " << stripe; - return Status::Invalid(ss.str()); - } + ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(), + Status::Invalid("Out of bounds stripe: ", stripe)); + opts->range(stripes_[stripe].offset, stripes_[stripe].length); return Status::OK(); } @@ -359,9 +353,7 @@ class ORCFileReader::Impl { const std::vector& include_indices) { std::list include_indices_list; for (auto it = include_indices.begin(); it != include_indices.end(); ++it) { - if (*it < 0) { - return Status::Invalid("Negative field index"); - } + ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index")); include_indices_list.push_back(*it); } opts->includeTypes(include_indices_list); @@ -455,9 +447,7 @@ class ORCFileReader::Impl { case liborc::DECIMAL: return AppendDecimalBatch(type, batch, offset, length, builder); default: - std::stringstream ss; - ss << "Not implemented type kind: " << kind; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Not implemented type kind: ", kind); } } diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index d07c27fe15906..66a685b45d315 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -638,9 +638,8 @@ Status DictionaryArray::FromArrays(const std::shared_ptr& type, is_valid = ValidateDictionaryIndices(indices, upper_bound); break; default: - std::stringstream ss; - ss << "Categorical index type not supported: " << indices->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Categorical index type not supported: ", + indices->type()->ToString()); } if (!is_valid.ok()) { @@ -740,12 +739,11 @@ struct ValidateVisitor { Status Visit(const NullArray&) { return Status::OK(); } Status Visit(const PrimitiveArray& array) { - if (array.data()->buffers.size() != 2) { - return Status::Invalid("number of buffers was != 2"); - } - if (array.values() == nullptr) { - return Status::Invalid("values was null"); - } + ARROW_RETURN_IF(array.data()->buffers.size() != 2, + Status::Invalid("number of buffers was != 2")); + + ARROW_RETURN_IF(array.values() == nullptr, Status::Invalid("values was null")); + return Status::OK(); } @@ -776,10 +774,8 @@ struct ValidateVisitor { return Status::Invalid("value_offsets_ was null"); } if (value_offsets->size() / static_cast(sizeof(int32_t)) < array.length()) { - std::stringstream ss; - ss << "offset buffer size (bytes): " << value_offsets->size() - << " isn't large enough for length: " << array.length(); - return Status::Invalid(ss.str()); + return 
Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), + " isn't large enough for length: ", array.length()); } if (!array.values()) { @@ -788,17 +784,13 @@ struct ValidateVisitor { const int32_t last_offset = array.value_offset(array.length()); if (array.values()->length() != last_offset) { - std::stringstream ss; - ss << "Final offset invariant not equal to values length: " << last_offset - << "!=" << array.values()->length(); - return Status::Invalid(ss.str()); + return Status::Invalid("Final offset invariant not equal to values length: ", + last_offset, "!=", array.values()->length()); } const Status child_valid = ValidateArray(*array.values()); if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Child array invalid: ", child_valid.ToString()); } int32_t prev_offset = array.value_offset(0); @@ -808,18 +800,14 @@ struct ValidateVisitor { for (int64_t i = 1; i <= array.length(); ++i) { int32_t current_offset = array.value_offset(i); if (array.IsNull(i - 1) && current_offset != prev_offset) { - std::stringstream ss; - ss << "Offset invariant failure at: " << i - << " inconsistent value_offsets for null slot" << current_offset - << "!=" << prev_offset; - return Status::Invalid(ss.str()); + return Status::Invalid("Offset invariant failure at: ", i, + " inconsistent value_offsets for null slot", + current_offset, "!=", prev_offset); } if (current_offset < prev_offset) { - std::stringstream ss; - ss << "Offset invariant failure: " << i - << " inconsistent offset for non-null slot: " << current_offset << "<" - << prev_offset; - return Status::Invalid(ss.str()); + return Status::Invalid("Offset invariant failure: ", i, + " inconsistent offset for non-null slot: ", current_offset, + "<", prev_offset); } prev_offset = current_offset; } @@ -842,18 +830,14 @@ struct ValidateVisitor { for (int i = 0; i < array.num_fields(); ++i) { auto it = array.field(i); if (it->length() != array_length) { - std::stringstream ss; - ss << "Length is not equal from field " << it->type()->ToString() - << " at position {" << idx << "}"; - return Status::Invalid(ss.str()); + return Status::Invalid("Length is not equal from field ", + it->type()->ToString(), " at position [", idx, "]"); } const Status child_valid = ValidateArray(*it); if (!child_valid.ok()) { - std::stringstream ss; - ss << "Child array invalid: " << child_valid.ToString() << " at position {" - << idx << "}"; - return Status::Invalid(ss.str()); + return Status::Invalid("Child array invalid: ", child_valid.ToString(), + " at position [", idx, "}"); } ++idx; } diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index ad6ba11a484d1..8739859310b10 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -59,21 +59,18 @@ Status BinaryBuilder::Resize(int64_t capacity) { } Status BinaryBuilder::ReserveData(int64_t elements) { - if (value_data_length() + elements > value_data_capacity()) { - if (value_data_length() + elements > kBinaryMemoryLimit) { - return Status::CapacityError( - "Cannot reserve capacity larger than 2^31 - 1 for binary"); - } - RETURN_NOT_OK(value_data_builder_.Reserve(elements)); - } - return Status::OK(); + const int64_t size = value_data_length() + elements; + ARROW_RETURN_IF( + size > kBinaryMemoryLimit, + Status::CapacityError("Cannot reserve capacity larger than 2^31 - 1 for binary")); + + return (size > 
value_data_capacity()) ? value_data_builder_.Reserve(elements) + : Status::OK(); } Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { - std::stringstream ss; - ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " - << num_bytes; - return Status::CapacityError(ss.str()); + return Status::CapacityError("BinaryArray cannot contain more than ", + kBinaryMemoryLimit, " bytes, have ", num_bytes); } Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index e73324323af3d..87c302a82cfe6 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -58,13 +58,11 @@ Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length, } Status ListBuilder::AppendNextOffset() { - int64_t num_values = value_builder_->length(); - if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) { - std::stringstream ss; - ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," - << " have " << num_values; - return Status::CapacityError(ss.str()); - } + const int64_t num_values = value_builder_->length(); + ARROW_RETURN_IF( + num_values > kListMaximumElements, + Status::CapacityError("ListArray cannot contain more then 2^31 - 1 child elements,", + " have ", num_values)); return offsets_builder_.Append(static_cast(num_values)); } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index ff2b453bb4494..2072edc936a3c 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -93,9 +93,8 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, } default: { - std::stringstream ss; - ss << "MakeBuilder: cannot construct builder for type " << type->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("MakeBuilder: cannot construct builder for type ", + type->ToString()); } } } diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index b148486bd212f..2ce0702f20c32 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -508,11 +508,9 @@ void ShiftTime(FunctionContext* ctx, const CastOptions& options, const bool is_m out_data[i] = static_cast(in_data[i] / factor); } } else { -#define RAISE_INVALID_CAST(VAL) \ - std::stringstream ss; \ - ss << "Casting from " << input.type->ToString() << " to " << output->type->ToString() \ - << " would lose data: " << VAL; \ - ctx->SetStatus(Status::Invalid(ss.str())); +#define RAISE_INVALID_CAST(VAL) \ + ctx->SetStatus(Status::Invalid("Casting from ", input.type->ToString(), " to ", \ + output->type->ToString(), " would lose data: ", VAL)); if (input.null_count != 0) { internal::BitmapReader bit_reader(input.buffers[0]->data(), input.offset, @@ -795,9 +793,8 @@ struct CastFunctor< UnpackFixedSizeBinaryDictionary(ctx, indices, dictionary, output); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -874,9 +871,8 @@ struct CastFunctor(ctx, indices, dictionary, output))); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -932,9 +928,8 @@ struct 
CastFunctor(indices, dictionary, out); break; default: - std::stringstream ss; - ss << "Invalid index type: " << indices.type()->ToString(); - ctx->SetStatus(Status::Invalid(ss.str())); + ctx->SetStatus( + Status::Invalid("Invalid index type: ", indices.type()->ToString())); return; } } @@ -960,9 +955,8 @@ struct CastFunctor> { auto str = input_array.GetView(i); if (!converter(str.data(), str.length(), out_data)) { - std::stringstream ss; - ss << "Failed to cast String '" << str << "' into " << output->type->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", str, "' into ", + output->type->ToString())); return; } } @@ -991,10 +985,9 @@ struct CastFunctortype->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", + input_array.GetString(i), "' into ", + output->type->ToString())); return; } @@ -1029,9 +1022,8 @@ struct CastFunctor { const auto str = input_array.GetView(i); if (!converter(str.data(), str.length(), out_data)) { - std::stringstream ss; - ss << "Failed to cast String '" << str << "' into " << output->type->ToString(); - ctx->SetStatus(Status(StatusCode::Invalid, ss.str())); + ctx->SetStatus(Status::Invalid("Failed to cast String '", str, "' into ", + output->type->ToString())); return; } } @@ -1123,9 +1115,8 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& i if (!(is_primitive(type_id) || type_id == Type::FIXED_SIZE_BINARY || type_id == Type::DECIMAL)) { - std::stringstream ss; - ss << "Cannot pre-allocate memory for type: " << out->type->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot pre-allocate memory for type: ", + out->type->ToString()); } if (type_id != Type::NA) { @@ -1400,10 +1391,8 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& break; } if (*kernel == nullptr) { - std::stringstream ss; - ss << "No cast implemented from " << in_type.ToString() << " to " - << out_type->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("No cast implemented from ", in_type.ToString(), " to ", + out_type->ToString()); } return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc index c057ea5736139..0513fe1f6ad4f 100644 --- a/cpp/src/arrow/compute/kernels/hash.cc +++ b/cpp/src/arrow/compute/kernels/hash.cc @@ -56,11 +56,9 @@ namespace compute { namespace { -#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE) \ - if (!KERNEL) { \ - std::stringstream ss; \ - ss << FUNCNAME << " not implemented for " << type->ToString(); \ - return Status::NotImplemented(ss.str()); \ +#define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE) \ + if (!KERNEL) { \ + return Status::NotImplemented(FUNCNAME, " not implemented for ", type->ToString()); \ } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 8a249a68c07ec..1018f8553860e 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -40,10 +40,9 @@ namespace { Status GenericConversionError(const std::shared_ptr& type, const uint8_t* data, uint32_t size) { - std::stringstream ss; - ss << "CSV conversion error to " << type->ToString() << ": invalid value '" - << std::string(reinterpret_cast(data), size) << "'"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error 
to ", type->ToString(), + ": invalid value '", + std::string(reinterpret_cast(data), size), "'"); } inline bool IsWhitespace(uint8_t c) { @@ -214,9 +213,8 @@ class VarSizeBinaryConverter : public ConcreteConverter { auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) { - std::stringstream ss; - ss << "CSV conversion error to " << type_->ToString() << ": invalid UTF8 data"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error to ", type_->ToString(), + ": invalid UTF8 data"); } builder.UnsafeAppend(data, size); return Status::OK(); @@ -256,10 +254,8 @@ Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_ auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { if (ARROW_PREDICT_FALSE(size != byte_width)) { - std::stringstream ss; - ss << "CSV conversion error to " << type_->ToString() << ": got a " << size - << "-byte long string"; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV conversion error to ", type_->ToString(), ": got a ", + size, "-byte long string"); } return builder.Append(data); }; @@ -410,9 +406,8 @@ Status Converter::Make(const std::shared_ptr& type, break; default: { - std::stringstream ss; - ss << "CSV conversion to " << type->ToString() << " is not supported"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("CSV conversion to ", type->ToString(), + " is not supported"); } #undef CONVERTER_CASE diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index fe7f841f58328..b1d175adfb582 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -30,9 +30,7 @@ namespace arrow { namespace csv { static Status ParseError(const char* message) { - std::stringstream ss; - ss << "CSV parse error: " << message; - return Status::Invalid(ss.str()); + return Status::Invalid("CSV parse error: ", message); } static Status MismatchingColumns(int32_t expected, int32_t actual) { diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index b2a6b7b430ad0..efd61167b71a5 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -355,10 +355,8 @@ class ThreadedTableReader : public BaseTableReader { chunk_size, &parsed_size)); if (parsed_size != chunk_size) { DCHECK_EQ(parsed_size, chunk_size); - std::stringstream ss; - ss << "Chunker and parser disagree on block size: " << chunk_size << " vs " - << parsed_size; - return Status::Invalid(ss.str()); + return Status::Invalid("Chunker and parser disagree on block size: ", + chunk_size, " vs ", parsed_size); } RETURN_NOT_OK(ProcessData(parser, chunk_index)); // Keep chunk buffer alive within closure and release it at the end diff --git a/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc b/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc index 7022ff017f48e..a7749161c4676 100644 --- a/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc +++ b/cpp/src/arrow/dbi/hiveserver2/hiveserver2-test.cc @@ -97,10 +97,8 @@ Status Wait(const std::unique_ptr& op, if (op_state == state) { return Status::OK(); } else { - std::stringstream ss; - ss << "Failed to reach state '" << OperationStateToString(state) << "' after " - << retries << " retries."; - return Status::IOError(ss.str()); + return Status::IOError("Failed to reach state '", OperationStateToString(state), + "' after ", retries, " retries"); } } diff --git a/cpp/src/arrow/dbi/hiveserver2/service.cc 
b/cpp/src/arrow/dbi/hiveserver2/service.cc index e2d3f2a21bf37..502a8a284b86f 100644 --- a/cpp/src/arrow/dbi/hiveserver2/service.cc +++ b/cpp/src/arrow/dbi/hiveserver2/service.cc @@ -92,9 +92,7 @@ Service::Service(const string& host, int port, int conn_timeout, Status Service::Open() { if (impl_->protocol_version < hs2::TProtocolVersion::HIVE_CLI_SERVICE_PROTOCOL_V6) { - std::stringstream ss; - ss << "Unsupported protocol: " << impl_->protocol_version; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported protocol: ", impl_->protocol_version); } impl_->socket.reset(new TSocket(host_, port_)); diff --git a/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc b/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc index d154e143ba290..171eae36816e0 100644 --- a/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc +++ b/cpp/src/arrow/dbi/hiveserver2/thrift-internal.cc @@ -204,11 +204,7 @@ Status TStatusToStatus(const hs2::TStatus& tstatus) { return Status::IOError(tstatus.errorMessage); case hs2::TStatusCode::INVALID_HANDLE_STATUS: return Status::Invalid("Invalid handle"); - default: { - std::stringstream ss; - ss << "Unknown TStatusCode " << tstatus.statusCode; - return Status::UnknownError(ss.str()); - } + default: { return Status::UnknownError("Unknown TStatusCode ", tstatus.statusCode); } } } diff --git a/cpp/src/arrow/flight/internal.cc b/cpp/src/arrow/flight/internal.cc index 796e6095cdb7f..b4c6b2addcc11 100644 --- a/cpp/src/arrow/flight/internal.cc +++ b/cpp/src/arrow/flight/internal.cc @@ -37,16 +37,13 @@ Status FromGrpcStatus(const grpc::Status& grpc_status) { if (grpc_status.ok()) { return Status::OK(); } - std::stringstream ss; if (grpc_status.error_code() == grpc::StatusCode::UNIMPLEMENTED) { - ss << "gRPC returned unimplemented error, with message: " - << grpc_status.error_message(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("gRPC returned unimplemented error, with message: ", + grpc_status.error_message()); } else { - ss << "gRPC failed with error code " << grpc_status.error_code() - << " and message: " << grpc_status.error_message(); - return Status::IOError(ss.str()); + return Status::IOError("gRPC failed with error code ", grpc_status.error_code(), + " and message: ", grpc_status.error_message()); } } diff --git a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc index 03256a1f52c70..b4d8744cb0bd0 100644 --- a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc +++ b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc @@ -82,9 +82,8 @@ Status ReadMessage(CudaBufferReader* reader, MemoryPool* pool, RETURN_NOT_OK(AllocateBuffer(pool, message_length, &metadata)); RETURN_NOT_OK(reader->Read(message_length, &bytes_read, metadata->mutable_data())); if (bytes_read != message_length) { - std::stringstream ss; - ss << "Expected " << message_length << " metadata bytes, but only got " << bytes_read; - return Status::IOError(ss.str()); + return Status::IOError("Expected ", message_length, " metadata bytes, but only got ", + bytes_read); } return ipc::Message::ReadFrom(metadata, reader, out); diff --git a/cpp/src/arrow/gpu/cuda_common.h b/cpp/src/arrow/gpu/cuda_common.h index a53dd220adda0..2b630c8114325 100644 --- a/cpp/src/arrow/gpu/cuda_common.h +++ b/cpp/src/arrow/gpu/cuda_common.h @@ -34,15 +34,13 @@ namespace cuda { (void)ret; \ } while (0) -#define CU_RETURN_NOT_OK(STMT) \ - do { \ - CUresult ret = (STMT); \ - if (ret != CUDA_SUCCESS) { \ - std::stringstream ss; \ - ss << "Cuda Driver API call in " << __FILE__ << " at line " << __LINE__ 
\ - << " failed with code " << ret << ": " << #STMT; \ - return Status::IOError(ss.str()); \ - } \ +#define CU_RETURN_NOT_OK(STMT) \ + do { \ + CUresult ret = (STMT); \ + if (ret != CUDA_SUCCESS) { \ + return Status::IOError("Cuda Driver API call in ", __FILE__, " at line ", \ + __LINE__, " failed with code ", ret, ": ", #STMT); \ + } \ } while (0) } // namespace cuda diff --git a/cpp/src/arrow/io/file-test.cc b/cpp/src/arrow/io/file-test.cc index 4d710d3470f5c..6d780c0940eba 100644 --- a/cpp/src/arrow/io/file-test.cc +++ b/cpp/src/arrow/io/file-test.cc @@ -460,9 +460,7 @@ class MyMemoryPool : public MemoryPool { *ptr = reinterpret_cast(std::realloc(*ptr, new_size)); if (*ptr == NULL) { - std::stringstream ss; - ss << "realloc of size " << new_size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("realloc of size ", new_size, " failed"); } return Status::OK(); diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 869d8e3720766..0398d5a1f9e80 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -479,9 +479,7 @@ class MemoryMappedFile::MemoryMap : public MutableBuffer { void* result = mmap(nullptr, static_cast(initial_size), prot_flags_, map_mode_, file_->fd(), 0); if (result == MAP_FAILED) { - std::stringstream ss; - ss << "Memory mapping file failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return Status::IOError("Memory mapping file failed: ", std::strerror(errno)); } size_ = capacity_ = initial_size; data_ = mutable_data_ = static_cast(result); diff --git a/cpp/src/arrow/io/hdfs-internal.cc b/cpp/src/arrow/io/hdfs-internal.cc index c8be5164cfa78..c273ab45f634f 100644 --- a/cpp/src/arrow/io/hdfs-internal.cc +++ b/cpp/src/arrow/io/hdfs-internal.cc @@ -218,9 +218,7 @@ static arrow::Status try_dlopen(std::vector potential_paths, const cha } if (out_handle == NULL) { - std::stringstream ss; - ss << "Unable to load " << name; - return arrow::Status::IOError(ss.str()); + return arrow::Status::IOError("Unable to load ", name); } return arrow::Status::OK(); @@ -243,9 +241,7 @@ static arrow::Status try_dlopen(std::vector potential_paths, const cha } if (out_handle == NULL) { - std::stringstream ss; - ss << "Unable to load " << name; - return arrow::Status::IOError(ss.str()); + return arrow::Status::IOError("Unable to load ", name); } return arrow::Status::OK(); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 030b84853da60..3e9b804ca233c 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -57,13 +57,11 @@ std::string TranslateErrno(int error_code) { } // namespace -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - std::stringstream ss; \ - ss << "HDFS " << WHAT << " failed, errno: " << TranslateErrno(errno); \ - return Status::IOError(ss.str()); \ - } \ +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + return Status::IOError("HDFS ", WHAT, " failed, errno: ", TranslateErrno(errno)); \ + } \ } while (0) static constexpr int kDefaultHdfsBufferSize = 1 << 16; @@ -466,10 +464,8 @@ class HadoopFileSystem::HadoopFileSystemImpl { if ((errno == 0) || (errno == ENOENT && Exists(path))) { num_entries = 0; } else { - std::stringstream ss; - ss << "HDFS list directory of " << path - << " failed, errno: " << TranslateErrno(errno); - return Status::IOError(ss.str()); + return Status::IOError("HDFS list directory failed, errno: ", + TranslateErrno(errno)); } } @@ -492,14 +488,9 @@ class 
HadoopFileSystem::HadoopFileSystemImpl { hdfsFile handle = driver_->OpenFile(fs_, path.c_str(), O_RDONLY, buffer_size, 0, 0); if (handle == nullptr) { - std::stringstream ss; - if (!Exists(path)) { - ss << "HDFS file does not exist: " << path; - } else { - // TODO(wesm): determine other causes of failure - ss << "HDFS path exists, but opening file failed: " << path; - } - return Status::IOError(ss.str()); + const char* msg = !Exists(path) ? "HDFS file does not exist: " + : "HDFS path exists, but opening file failed: "; + return Status::IOError(msg, path); } // std::make_shared does not work with private ctors @@ -521,10 +512,7 @@ class HadoopFileSystem::HadoopFileSystemImpl { static_cast(default_block_size)); if (handle == nullptr) { - // TODO(wesm): determine cause of failure - std::stringstream ss; - ss << "Unable to open file " << path; - return Status::IOError(ss.str()); + return Status::IOError("Unable to open file ", path); } // std::make_shared does not work with private ctors diff --git a/cpp/src/arrow/ipc/dictionary.cc b/cpp/src/arrow/ipc/dictionary.cc index 488bb75b9d75f..aa0d9085f5a8f 100644 --- a/cpp/src/arrow/ipc/dictionary.cc +++ b/cpp/src/arrow/ipc/dictionary.cc @@ -34,9 +34,7 @@ Status DictionaryMemo::GetDictionary(int64_t id, std::shared_ptr* dictionary) const { auto it = id_to_dictionary_.find(id); if (it == id_to_dictionary_.end()) { - std::stringstream ss; - ss << "Dictionary with id " << id << " not found"; - return Status::KeyError(ss.str()); + return Status::KeyError("Dictionary with id ", id, " not found"); } *dictionary = it->second; return Status::OK(); @@ -70,9 +68,7 @@ bool DictionaryMemo::HasDictionaryId(int64_t id) const { Status DictionaryMemo::AddDictionary(int64_t id, const std::shared_ptr& dictionary) { if (HasDictionaryId(id)) { - std::stringstream ss; - ss << "Dictionary with id " << id << " already exists"; - return Status::KeyError(ss.str()); + return Status::KeyError("Dictionary with id ", id, " already exists"); } intptr_t address = reinterpret_cast(dictionary.get()); id_to_dictionary_[id] = dictionary; diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index ebdb335fa57f7..b0ab62c678c72 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -642,9 +642,7 @@ class TableWriter::TableWriterImpl : public ArrayVisitor { Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) { if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()))) { - std::stringstream ss; - ss << "Array is not primitive type: " << values.type()->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Array is not primitive type: ", values.type()->ToString()); } meta->type = ToFlatbufferType(values.type_id()); diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc index 914cdb66599f4..fe69a53a944c7 100644 --- a/cpp/src/arrow/ipc/json-integration-test.cc +++ b/cpp/src/arrow/ipc/json-integration-test.cc @@ -170,10 +170,8 @@ static Status ValidateArrowVsJson(const std::string& arrow_path, const int arrow_nbatches = arrow_reader->num_record_batches(); if (json_nbatches != arrow_nbatches) { - std::stringstream ss; - ss << "Different number of record batches: " << json_nbatches << " (JSON) vs " - << arrow_nbatches << " (Arrow)"; - return Status::Invalid(ss.str()); + return Status::Invalid("Different number of record batches: ", json_nbatches, + " (JSON) vs ", arrow_nbatches, " (Arrow)"); } std::shared_ptr arrow_batch; @@ -231,9 +229,7 @@ Status 
RunCommand(const std::string& json_path, const std::string& arrow_path, return ValidateArrowVsJson(arrow_path, json_path); } else { - std::stringstream ss; - ss << "Unknown command: " << command; - return Status::Invalid(ss.str()); + return Status::Invalid("Unknown command: ", command); } } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index d5a5dd9f397db..05e547506c596 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -633,9 +633,7 @@ static Status GetInteger(const rj::Value::ConstObject& json_type, *type = is_signed ? int64() : uint64(); break; default: - std::stringstream ss; - ss << "Invalid bit width: " << bit_width; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid bit width: ", bit_width); } return Status::OK(); } @@ -654,9 +652,7 @@ static Status GetFloatingPoint(const RjObject& json_type, } else if (precision == "HALF") { *type = float16(); } else { - std::stringstream ss; - ss << "Invalid precision: " << precision; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid precision: ", precision); } return Status::OK(); } @@ -693,9 +689,7 @@ static Status GetDate(const RjObject& json_type, std::shared_ptr* type } else if (unit_str == "MILLISECOND") { *type = date64(); } else { - std::stringstream ss; - ss << "Invalid date unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid date unit: ", unit_str); } return Status::OK(); } @@ -718,9 +712,7 @@ static Status GetTime(const RjObject& json_type, std::shared_ptr* type } else if (unit_str == "NANOSECOND") { *type = time64(TimeUnit::NANO); } else { - std::stringstream ss; - ss << "Invalid time unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid time unit: ", unit_str); } const auto& fw_type = checked_cast(**type); @@ -749,9 +741,7 @@ static Status GetTimestamp(const RjObject& json_type, std::shared_ptr* } else if (unit_str == "NANOSECOND") { unit = TimeUnit::NANO; } else { - std::stringstream ss; - ss << "Invalid time unit: " << unit_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid time unit: ", unit_str); } const auto& it_tz = json_type.FindMember("timezone"); @@ -778,9 +768,7 @@ static Status GetUnion(const RjObject& json_type, } else if (mode_str == "DENSE") { mode = UnionMode::DENSE; } else { - std::stringstream ss; - ss << "Invalid union mode: " << mode_str; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid union mode: ", mode_str); } const auto& it_type_codes = json_type.FindMember("typeIds"); @@ -838,9 +826,7 @@ static Status GetType(const RjObject& json_type, } else if (type_name == "union") { return GetUnion(json_type, children, type); } else { - std::stringstream ss; - ss << "Unrecognized type name: " << type_name; - return Status::Invalid(ss.str()); + return Status::Invalid("Unrecognized type name: ", type_name); } return Status::OK(); } @@ -1235,10 +1221,8 @@ class ArrayReader { const auto& json_children_arr = json_children->value.GetArray(); if (type.num_children() != static_cast(json_children_arr.Size())) { - std::stringstream ss; - ss << "Expected " << type.num_children() << " children, but got " - << json_children_arr.Size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected ", type.num_children(), " children, but got ", + json_children_arr.Size()); } for (int i = 0; i < static_cast(json_children_arr.Size()); ++i) { @@ -1342,9 +1326,7 @@ static Status 
ReadDictionary(const RjObject& obj, const DictionaryTypeMap& id_to auto it = id_to_field.find(id); if (it == id_to_field.end()) { - std::stringstream ss; - ss << "No dictionary with id " << id; - return Status::Invalid(ss.str()); + return Status::Invalid("No dictionary with id ", id); } std::vector> fields = {it->second}; @@ -1489,9 +1471,7 @@ Status ReadArray(MemoryPool* pool, const rj::Value& json_array, const Schema& sc } if (result == nullptr) { - std::stringstream ss; - ss << "Field named " << name << " not found in schema"; - return Status::KeyError(ss.str()); + return Status::KeyError("Field named ", name, " not found in schema"); } return ReadArray(pool, json_array, result->type(), array); diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 5516e2dd72a2e..c8c724968f67c 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -49,56 +49,39 @@ using RjWriter = rj::Writer; using RjArray = rj::Value::ConstArray; using RjObject = rj::Value::ConstObject; -#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ - if (NAME == (PARENT).MemberEnd()) { \ - std::stringstream ss; \ - ss << "field " << TOK << " not found"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ + if (NAME == (PARENT).MemberEnd()) { \ + return Status::Invalid("field ", TOK, " not found"); \ } -#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsString()) { \ - std::stringstream ss; \ - ss << "field was not a string" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + return Status::Invalid("field was not a string line ", __LINE__); \ } -#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsBool()) { \ - std::stringstream ss; \ - ss << "field was not a boolean" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + return Status::Invalid("field was not a boolean line ", __LINE__); \ } -#define RETURN_NOT_INT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsInt()) { \ - std::stringstream ss; \ - ss << "field was not an int" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + return Status::Invalid("field was not an int line ", __LINE__); \ } -#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsArray()) { \ - std::stringstream ss; \ - ss << "field was not an array" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + return Status::Invalid("field was not an array line ", __LINE__); \ } -#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsObject()) { \ - std::stringstream ss; \ - ss << "field was not an object" \ - << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + return 
Status::Invalid("field was not an object line ", __LINE__); \ } namespace arrow { diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index a8d120036e4f5..d812f841d9353 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -41,9 +41,7 @@ using ::arrow::internal::checked_cast; static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; static Status JSONTypeError(const char* expected_type, rj::Type json_type) { - std::stringstream ss; - ss << "Expected " << expected_type << " or null, got type " << json_type; - return Status::Invalid(ss.str()); + return Status::Invalid("Expected ", expected_type, " or null, got type ", json_type); } class Converter { @@ -184,9 +182,8 @@ class IntegerConverter final : public ConcreteConverter> if (v == v64) { return builder_->Append(v); } else { - std::stringstream ss; - ss << "Value " << v64 << " out of bounds for " << this->type_->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", v64, " out of bounds for ", + this->type_->ToString()); } } else { return JSONTypeError("signed int", json_obj.GetType()); @@ -203,9 +200,8 @@ class IntegerConverter final : public ConcreteConverter> if (v == v64) { return builder_->Append(v); } else { - std::stringstream ss; - ss << "Value " << v64 << " out of bounds for " << this->type_->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", v64, " out of bounds for ", + this->type_->ToString()); } return builder_->Append(v); } else { @@ -272,10 +268,8 @@ class DecimalConverter final : public ConcreteConverter { auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); RETURN_NOT_OK(Decimal128::FromString(view, &d, &precision, &scale)); if (scale != decimal_type_->scale()) { - std::stringstream ss; - ss << "Invalid scale for decimal: expected " << decimal_type_->scale() << ", got " - << scale; - return Status::Invalid(ss.str()); + return Status::Invalid("Invalid scale for decimal: expected ", + decimal_type_->scale(), ", got ", scale); } return builder_->Append(d); } @@ -390,10 +384,8 @@ class StructConverter final : public ConcreteConverter { auto size = json_obj.Size(); auto expected_size = static_cast(type_->num_children()); if (size != expected_size) { - std::stringstream ss; - ss << "Expected array of size " << expected_size << ", got array of size " - << size; - return Status::Invalid(ss.str()); + return Status::Invalid("Expected array of size ", expected_size, + ", got array of size ", size); } for (uint32_t i = 0; i < size; ++i) { RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); @@ -414,9 +406,8 @@ class StructConverter final : public ConcreteConverter { } } if (remaining > 0) { - std::stringstream ss; - ss << "Unexpected members in JSON object for type " << type_->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Unexpected members in JSON object for type ", + type_->ToString()); } return builder_->Append(); } @@ -460,9 +451,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) default: { - std::stringstream ss; - ss << "JSON conversion to " << type->ToString() << " not implemented"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("JSON conversion to ", type->ToString(), + " not implemented"); } } @@ -481,10 +471,8 @@ Status ArrayFromJSON(const std::shared_ptr& type, 
rj::Document json_doc; json_doc.Parse(json_string.data(), json_string.length()); if (json_doc.HasParseError()) { - std::stringstream ss; - ss << "JSON parse error at offset " << json_doc.GetErrorOffset() << ": " - << GetParseError_En(json_doc.GetParseError()); - return Status::Invalid(ss.str()); + return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", + GetParseError_En(json_doc.GetParseError())); } // The JSON document should be an array, append it diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 724e6255cbddb..8adf4a8b66038 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -153,10 +153,8 @@ Status Message::ReadFrom(const std::shared_ptr& metadata, io::InputStrea std::shared_ptr body; RETURN_NOT_OK(stream->Read(body_length, &body)); if (body->size() < body_length) { - std::stringstream ss; - ss << "Expected to be able to read " << body_length << " bytes for message body, got " - << body->size(); - return Status::IOError(ss.str()); + return Status::IOError("Expected to be able to read ", body_length, + " bytes for message body, got ", body->size()); } return Message::Open(metadata, body, out); @@ -171,10 +169,8 @@ Status Message::ReadFrom(const int64_t offset, const std::shared_ptr& me std::shared_ptr body; RETURN_NOT_OK(file->ReadAt(offset, body_length, &body)); if (body->size() < body_length) { - std::stringstream ss; - ss << "Expected to be able to read " << body_length << " bytes for message body, got " - << body->size(); - return Status::IOError(ss.str()); + return Status::IOError("Expected to be able to read ", body_length, + " bytes for message body, got ", body->size()); } return Message::Open(metadata, body, out); @@ -238,19 +234,16 @@ Status ReadMessage(int64_t offset, int32_t metadata_length, io::RandomAccessFile RETURN_NOT_OK(file->ReadAt(offset, metadata_length, &buffer)); if (buffer->size() < metadata_length) { - std::stringstream ss; - ss << "Expected to read " << metadata_length << " metadata bytes but got " - << buffer->size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected to read ", metadata_length, + " metadata bytes but got ", buffer->size()); } int32_t flatbuffer_size = *reinterpret_cast(buffer->data()); if (flatbuffer_size + static_cast(sizeof(int32_t)) > metadata_length) { - std::stringstream ss; - ss << "flatbuffer size " << metadata_length << " invalid. File offset: " << offset - << ", metadata length: " << metadata_length; - return Status::Invalid(ss.str()); + return Status::Invalid("flatbuffer size ", metadata_length, + " invalid. 
File offset: ", offset, + ", metadata length: ", metadata_length); } auto metadata = SliceBuffer(buffer, 4, buffer->size() - 4); @@ -303,10 +296,8 @@ Status ReadMessage(io::InputStream* file, std::unique_ptr* message) { std::shared_ptr metadata; RETURN_NOT_OK(file->Read(message_length, &metadata)); if (metadata->size() != message_length) { - std::stringstream ss; - ss << "Expected to read " << message_length << " metadata bytes, but " - << "only read " << metadata->size(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected to read ", message_length, " metadata bytes, but ", + "only read ", metadata->size()); } return Message::ReadFrom(metadata, file, message); diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index ef189c8ae617a..1d4c80c2946b1 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -443,9 +443,7 @@ static Status TypeToFlatbuffer(FBB& fbb, const DataType& type, return UnionToFlatBuffer(fbb, *value_type, children, dictionary_memo, offset); default: *out_type = flatbuf::Type_NONE; // Make clang-tidy happy - std::stringstream ss; - ss << "Unable to convert type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unable to convert type: ", type.ToString()); } return Status::OK(); } @@ -483,9 +481,7 @@ static Status TensorTypeToFlatbuffer(FBB& fbb, const DataType& type, break; default: *out_type = flatbuf::Type_NONE; // Make clang-tidy happy - std::stringstream ss; - ss << "Unable to convert type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unable to convert type: ", type.ToString()); } return Status::OK(); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 65f5d963e88db..b2c26767be4e9 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -225,9 +225,7 @@ class ArrayLoader { const int num_children = type.num_children(); if (num_children != 1) { - std::stringstream ss; - ss << "Wrong number of children: " << num_children; - return Status::Invalid(ss.str()); + return Status::Invalid("Wrong number of children: ", num_children); } return LoadChildren(type.children()); @@ -343,9 +341,7 @@ Status ReadDictionary(const Buffer& metadata, const DictionaryTypeMap& dictionar int64_t id = *dictionary_id = dictionary_batch->id(); auto it = dictionary_types.find(id); if (it == dictionary_types.end()) { - std::stringstream ss; - ss << "Do not have type metadata for dictionary with id: " << id; - return Status::KeyError(ss.str()); + return Status::KeyError("Do not have type metadata for dictionary with id: ", id); } std::vector> fields = {it->second}; @@ -372,10 +368,8 @@ static Status ReadMessageAndValidate(MessageReader* reader, Message::Type expect RETURN_NOT_OK(reader->ReadNextMessage(message)); if (!(*message) && !allow_null) { - std::stringstream ss; - ss << "Expected " << FormatMessageType(expected_type) - << " message in stream, was null or length 0"; - return Status::Invalid(ss.str()); + return Status::Invalid("Expected ", FormatMessageType(expected_type), + " message in stream, was null or length 0"); } if ((*message) == nullptr) { @@ -383,10 +377,9 @@ static Status ReadMessageAndValidate(MessageReader* reader, Message::Type expect } if ((*message)->type() != expected_type) { - std::stringstream ss; - ss << "Message not expected type: " << FormatMessageType(expected_type) - << ", was: " << 
(*message)->type(); - return Status::IOError(ss.str()); + return Status::IOError( + "Message not expected type: ", FormatMessageType(expected_type), + ", was: ", (*message)->type()); } return Status::OK(); } @@ -512,9 +505,7 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { int magic_size = static_cast(strlen(kArrowMagicBytes)); if (footer_offset_ <= magic_size * 2 + 4) { - std::stringstream ss; - ss << "File is too small: " << footer_offset_; - return Status::Invalid(ss.str()); + return Status::Invalid("File is too small: ", footer_offset_); } std::shared_ptr buffer; @@ -523,9 +514,7 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { const int64_t expected_footer_size = magic_size + sizeof(int32_t); if (buffer->size() < expected_footer_size) { - std::stringstream ss; - ss << "Unable to read " << expected_footer_size << "from end of file"; - return Status::Invalid(ss.str()); + return Status::Invalid("Unable to read ", expected_footer_size, "from end of file"); } if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) { diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index d62db32b062ac..fb5beacf0f863 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -55,31 +55,23 @@ Status AllocateAligned(int64_t size, uint8_t** out) { *out = reinterpret_cast(_aligned_malloc(static_cast(size), kAlignment)); if (!*out) { - std::stringstream ss; - ss << "malloc of size " << size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("malloc of size ", size, " failed"); } #elif defined(ARROW_JEMALLOC) *out = reinterpret_cast(mallocx( std::max(static_cast(size), kAlignment), MALLOCX_ALIGN(kAlignment))); if (*out == NULL) { - std::stringstream ss; - ss << "malloc of size " << size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("malloc of size ", size, " failed"); } #else const int result = posix_memalign(reinterpret_cast(out), kAlignment, static_cast(size)); if (result == ENOMEM) { - std::stringstream ss; - ss << "malloc of size " << size << " failed"; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("malloc of size ", size, " failed"); } if (result == EINVAL) { - std::stringstream ss; - ss << "invalid alignment parameter: " << kAlignment; - return Status::Invalid(ss.str()); + return Status::Invalid("invalid alignment parameter: ", kAlignment); } #endif return Status::OK(); @@ -118,10 +110,8 @@ class DefaultMemoryPool : public MemoryPool { *ptr = reinterpret_cast( rallocx(*ptr, static_cast(new_size), MALLOCX_ALIGN(kAlignment))); if (*ptr == NULL) { - std::stringstream ss; - ss << "realloc of size " << new_size << " failed"; *ptr = previous_ptr; - return Status::OutOfMemory(ss.str()); + return Status::OutOfMemory("realloc of size ", new_size, " failed"); } #else // Note: We cannot use realloc() here as it doesn't guarantee alignment. 
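The hunks above and below all follow the same pattern: a call site that previously assembled its error message in a local std::stringstream now passes the message pieces straight to the variadic Status factory, which stringifies and concatenates them via util::StringBuilder (added to status.h later in this patch). A minimal sketch of the call-site change, with a hypothetical CheckSize helper standing in for the real call sites:

    #include <cstdint>
    #include <sstream>
    #include "arrow/status.h"

    using arrow::Status;

    // Before: assemble the message by hand on the error path.
    Status CheckSizeBefore(int64_t actual, int64_t expected) {
      if (actual != expected) {
        std::stringstream ss;
        ss << "Expected " << expected << " bytes, got " << actual;
        return Status::Invalid(ss.str());
      }
      return Status::OK();
    }

    // After: forward the pieces to the variadic factory; each argument is
    // stringified and concatenated internally by util::StringBuilder.
    Status CheckSizeAfter(int64_t actual, int64_t expected) {
      if (actual != expected) {
        return Status::Invalid("Expected ", expected, " bytes, got ", actual);
      }
      return Status::OK();
    }

The variadic form keeps each call site to a single statement and centralizes message formatting in util::StringBuilder instead of repeating the stringstream boilerplate at every error path.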
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 3e04f2727ed51..29d64355bdaed 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -414,9 +414,7 @@ inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, *out_values = WrapBytes::Wrap(view.data(), view.length()); if (*out_values == nullptr) { PyErr_Clear(); - std::stringstream ss; - ss << "Wrapping " << view << " failed"; - return Status::UnknownError(ss.str()); + return Status::UnknownError("Wrapping ", view, " failed"); } } ++out_values; @@ -773,18 +771,16 @@ class ObjectBlock : public PandasBlock { CONVERTLISTSLIKE_CASE(ListType, LIST) CONVERTLISTSLIKE_CASE(NullType, NA) default: { - std::stringstream ss; - ss << "Not implemented type for conversion from List to Pandas ObjectBlock: " - << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "Not implemented type for conversion from List to Pandas ObjectBlock: ", + list_type->value_type()->ToString()); } } } else if (type == Type::STRUCT) { RETURN_NOT_OK(ConvertStruct(options_, data, out_buffer)); } else { - std::stringstream ss; - ss << "Unsupported type for object array output: " << col->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported type for object array output: ", + col->type()->ToString()); } placement_data_[rel_placement] = abs_placement; @@ -810,10 +806,9 @@ class IntBlock : public PandasBlock { const ChunkedArray& data = *col->data().get(); if (type != ARROW_TYPE) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas int" << sizeof(C_TYPE) << " block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), " to a Pandas int", + sizeof(C_TYPE), " block"); } ConvertIntegerNoNullsSameType(options_, data, out_buffer); @@ -841,10 +836,9 @@ class Float16Block : public PandasBlock { Type::type type = col->type()->id(); if (type != Type::HALF_FLOAT) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float16 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float16 block"); } npy_half* out_buffer = @@ -866,10 +860,9 @@ class Float32Block : public PandasBlock { Type::type type = col->type()->id(); if (type != Type::FLOAT) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float32 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float32 block"); } float* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; @@ -922,10 +915,9 @@ class Float64Block : public PandasBlock { ConvertNumericNullable(data, NAN, out_buffer); break; default: - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas float64 block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas float64 block"); } #undef INTEGER_CASE @@ -945,10 +937,9 @@ class BoolBlock : public PandasBlock { Type::type type = 
col->type()->id(); if (type != Type::BOOL) { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas boolean block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas boolean block"); } uint8_t* out_buffer = @@ -1006,10 +997,9 @@ class DatetimeBlock : public PandasBlock { return Status::NotImplemented("Unsupported time unit"); } } else { - std::stringstream ss; - ss << "Cannot write Arrow data of type " << col->type()->ToString(); - ss << " to a Pandas datetime block."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Cannot write Arrow data of type ", + col->type()->ToString(), + " to a Pandas datetime block."); } placement_data_[rel_placement] = abs_placement; @@ -1075,9 +1065,8 @@ class CategoricalBlock : public PandasBlock { const T* values = arr.raw_values(); for (int64_t i = 0; i < arr.length(); ++i) { if (arr.IsValid(i) && (values[i] < 0 || values[i] >= dict_length)) { - std::stringstream ss; - ss << "Out of bounds dictionary index: " << static_cast(values[i]); - return Status::Invalid(ss.str()); + return Status::Invalid("Out of bounds dictionary index: ", + static_cast(values[i])); } } return Status::OK(); @@ -1088,16 +1077,15 @@ class CategoricalBlock : public PandasBlock { RETURN_NOT_OK(AllocateNDArrayFromIndices(npy_type, indices_first)); } else { if (options_.zero_copy_only) { - std::stringstream ss; if (needs_copy_) { - ss << "Need to allocate categorical memory, " - << "but only zero-copy conversions allowed."; - } else { - ss << "Needed to copy " << data.num_chunks() << " chunks with " - << indices_first->null_count() - << " indices nulls, but zero_copy_only was True"; + return Status::Invalid("Need to allocate categorical memory, but ", + "only zero-copy conversions " + "allowed"); } - return Status::Invalid(ss.str()); + + return Status::Invalid("Needed to copy ", data.num_chunks(), " chunks with ", + indices_first->null_count(), + " indices nulls, but zero_copy_only was True"); } RETURN_NOT_OK(AllocateNDArray(npy_type, 1)); @@ -1155,10 +1143,8 @@ class CategoricalBlock : public PandasBlock { RETURN_NOT_OK(WriteIndices(converted_col)); break; default: { - std::stringstream ss; - ss << "Categorical index type not supported: " - << dict_type.index_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Categorical index type not supported: ", + dict_type.index_type()->ToString()); } } @@ -1349,10 +1335,8 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options case Type::LIST: { auto list_type = std::static_pointer_cast(col.type()); if (!ListTypeSupported(*list_type->value_type())) { - std::stringstream ss; - ss << "Not implemented type for list in DataFrameBlock: " - << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Not implemented type for list in DataFrameBlock: ", + list_type->value_type()->ToString()); } *output_type = PandasBlock::OBJECT; } break; @@ -1360,10 +1344,9 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options *output_type = PandasBlock::CATEGORICAL; break; default: - std::stringstream ss; - ss << "No known equivalent Pandas block for Arrow data of type "; - ss << col.type()->ToString() << " is known."; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "No known equivalent Pandas 
block for Arrow data of type ", + col.type()->ToString(), " is known."); } return Status::OK(); } @@ -1657,10 +1640,8 @@ class ArrowDeserializer { if (data_.num_chunks() == 1 && data_.null_count() == 0) { return ConvertValuesZeroCopy(options_, npy_type, data_.chunk(0)); } else if (options_.zero_copy_only) { - std::stringstream ss; - ss << "Needed to copy " << data_.num_chunks() << " chunks with " - << data_.null_count() << " nulls, but zero_copy_only was True"; - return Status::Invalid(ss.str()); + return Status::Invalid("Needed to copy ", data_.num_chunks(), " chunks with ", + data_.null_count(), " nulls, but zero_copy_only was True"); } RETURN_NOT_OK(AllocateOutput(npy_type)); @@ -1751,10 +1732,8 @@ class ArrowDeserializer { if (data_.num_chunks() == 1 && data_.null_count() == 0) { return ConvertValuesZeroCopy(options_, traits::npy_type, data_.chunk(0)); } else if (options_.zero_copy_only) { - std::stringstream ss; - ss << "Needed to copy " << data_.num_chunks() << " chunks with " - << data_.null_count() << " nulls, but zero_copy_only was True"; - return Status::Invalid(ss.str()); + return Status::Invalid("Needed to copy ", data_.num_chunks(), " chunks with ", + data_.null_count(), " nulls, but zero_copy_only was True"); } if (data_.null_count() > 0) { @@ -1854,9 +1833,8 @@ class ArrowDeserializer { CONVERTVALUES_LISTSLIKE_CASE(Decimal128Type, DECIMAL) CONVERTVALUES_LISTSLIKE_CASE(ListType, LIST) default: { - std::stringstream ss; - ss << "Not implemented type for lists: " << list_type->value_type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Not implemented type for lists: ", + list_type->value_type()->ToString()); } } #undef CONVERTVALUES_LISTSLIKE_CASE diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 6587bd328f3fb..6e41beddd1b72 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -215,10 +215,8 @@ struct PyBytesView { this->ref.reset(); return Status::OK(); } else { - std::stringstream ss; - ss << "Expected " << expected_msg << ", got a '" << Py_TYPE(obj)->tp_name - << "' object"; - return Status::TypeError(ss.str()); + return Status::TypeError("Expected ", expected_msg, ", got a '", + Py_TYPE(obj)->tp_name, "' object"); } } diff --git a/cpp/src/arrow/python/decimal.cc b/cpp/src/arrow/python/decimal.cc index 051f31faacacf..8db7c01b9ab8b 100644 --- a/cpp/src/arrow/python/decimal.cc +++ b/cpp/src/arrow/python/decimal.cc @@ -125,11 +125,9 @@ Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arr const int32_t scale = arrow_type.scale(); if (ARROW_PREDICT_FALSE(inferred_precision > precision)) { - std::stringstream buf; - buf << "Decimal type with precision " << inferred_precision - << " does not fit into precision inferred from first array element: " - << precision; - return Status::Invalid(buf.str()); + return Status::Invalid( + "Decimal type with precision ", inferred_precision, + " does not fit into precision inferred from first array element: ", precision); } if (scale != inferred_scale) { diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 2f43db6505c67..28ed1a6c364dc 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -164,11 +164,10 @@ namespace { Status IntegerOverflowStatus(PyObject* obj, const std::string& overflow_message) { if (overflow_message.empty()) { - std::stringstream ss; std::string obj_as_stdstring; RETURN_NOT_OK(PyObject_StdStringStr(obj, &obj_as_stdstring)); - ss << 
"Value " << obj_as_stdstring << " too large to fit in C integer type"; - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", obj_as_stdstring, + " too large to fit in C integer type"); } else { return Status::Invalid(overflow_message); } @@ -299,13 +298,10 @@ bool PandasObjectIsNull(PyObject* obj) { } Status InvalidValue(PyObject* obj, const std::string& why) { - std::stringstream ss; - std::string obj_as_str; RETURN_NOT_OK(internal::PyObject_StdStringStr(obj, &obj_as_str)); - ss << "Could not convert " << obj_as_str << " with type " << Py_TYPE(obj)->tp_name - << ": " << why; - return Status::Invalid(ss.str()); + return Status::Invalid("Could not convert ", obj_as_str, " with type ", + Py_TYPE(obj)->tp_name, ": ", why); } Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) { @@ -355,10 +351,8 @@ Status IntegerScalarToDoubleSafe(PyObject* obj, double* out) { constexpr int64_t kDoubleMin = -(1LL << 53); if (value < kDoubleMin || value > kDoubleMax) { - std::stringstream ss; - ss << "Integer value " << value << " is outside of the range exactly" - << " representable by a IEEE 754 double precision value"; - return Status::Invalid(ss.str()); + return Status::Invalid("Integer value ", value, " is outside of the range exactly", + " representable by a IEEE 754 double precision value"); } *out = static_cast(value); return Status::OK(); @@ -372,10 +366,8 @@ Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) { constexpr int64_t kFloatMin = -(1LL << 24); if (value < kFloatMin || value > kFloatMax) { - std::stringstream ss; - ss << "Integer value " << value << " is outside of the range exactly" - << " representable by a IEEE 754 single precision value"; - return Status::Invalid(ss.str()); + return Status::Invalid("Integer value ", value, " is outside of the range exactly", + " representable by a IEEE 754 single precision value"); } *out = static_cast(value); return Status::OK(); diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index 0f1d85ead2a16..c9db5f4f28531 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -58,10 +58,9 @@ class NumPyDtypeUnifier { NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(NULLPTR) {} Status InvalidMix(int new_dtype) { - std::stringstream ss; - ss << "Cannot mix NumPy dtypes " << GetNumPyTypeName(current_type_num_) << " and " - << GetNumPyTypeName(new_dtype); - return Status::Invalid(ss.str()); + return Status::Invalid("Cannot mix NumPy dtypes ", + GetNumPyTypeName(current_type_num_), " and ", + GetNumPyTypeName(new_dtype)); } int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; } @@ -250,9 +249,7 @@ class NumPyDtypeUnifier { action = Observe_DATETIME(descr); break; default: - std::stringstream ss; - ss << "Unsupported numpy type " << GetNumPyTypeName(dtype) << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype)); } if (action == INVALID) { @@ -480,10 +477,8 @@ class TypeInferrer { } else if (PyBytes_Check(key_obj)) { key = internal::PyBytes_AsStdString(key_obj); } else { - std::stringstream ss; - ss << "Expected dict key of type str or bytes, got '" << Py_TYPE(key_obj)->tp_name - << "'"; - return Status::TypeError(ss.str()); + return Status::TypeError("Expected dict key of type str or bytes, got '", + Py_TYPE(key_obj)->tp_name, "'"); } // Get or create visitor for this key auto it = struct_inferrers_.find(key); diff --git a/cpp/src/arrow/python/numpy-internal.h 
b/cpp/src/arrow/python/numpy-internal.h index 463795a2109f0..6954e35c3e199 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -143,9 +143,8 @@ inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) { TYPE_VISIT_INLINE(DATETIME); TYPE_VISIT_INLINE(OBJECT); } - std::stringstream ss; - ss << "NumPy type not implemented: " << GetNumPyTypeName(PyArray_TYPE(arr)); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("NumPy type not implemented: ", + GetNumPyTypeName(PyArray_TYPE(arr))); } #undef TYPE_VISIT_INLINE diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index d95e337a4870d..c73e0bc15c9c5 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -92,9 +92,7 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, float64); default: { - std::stringstream ss; - ss << "Unsupported numpy type " << descr->type_num << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } } return Status::OK(); @@ -119,9 +117,7 @@ Status GetNumPyType(const DataType& type, int* type_num) { NUMPY_TYPE_CASE(FLOAT, FLOAT32); NUMPY_TYPE_CASE(DOUBLE, FLOAT64); default: { - std::stringstream ss; - ss << "Unsupported tensor type: " << type.ToString() << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported tensor type: ", type.ToString()); } } #undef NUMPY_TYPE_CASE @@ -181,9 +177,7 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out) { } } break; default: { - std::stringstream ss; - ss << "Unsupported numpy type " << descr->type_num << std::endl; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } } diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index da288d3c6868e..461a085722243 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -283,9 +283,8 @@ class NumPyConverter { } Status TypeNotImplemented(std::string type_name) { - std::stringstream ss; - ss << "NumPyConverter doesn't implement <" << type_name << "> conversion. "; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("NumPyConverter doesn't implement <", type_name, + "> conversion. 
"); } MemoryPool* pool_; @@ -574,9 +573,8 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { auto byte_width = type.byte_width(); if (itemsize_ != byte_width) { - std::stringstream ss; - ss << "Got bytestring of length " << itemsize_ << " (expected " << byte_width << ")"; - return Status::Invalid(ss.str()); + return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ", + byte_width, ")"); } FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_); @@ -651,9 +649,8 @@ Status NumPyConverter::Visit(const StringType& type) { if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) { return builder.Append(data, itemsize_); } else { - std::stringstream ss; - ss << "Encountered non-UTF8 binary value: " << HexEncode(data, itemsize_); - return Status::Invalid(ss.str()); + return Status::Invalid("Encountered non-UTF8 binary value: ", + HexEncode(data, itemsize_)); } } else { return AppendUTF32(reinterpret_cast(data), itemsize_, byteorder, @@ -697,9 +694,7 @@ Status NumPyConverter::Visit(const StructType& type) { for (auto field : type.children()) { PyObject* tup = PyDict_GetItemString(dtype_->fields, field->name().c_str()); if (tup == NULL) { - std::stringstream ss; - ss << "Missing field '" << field->name() << "' in struct array"; - return Status::TypeError(ss.str()); + return Status::TypeError("Missing field '", field->name(), "' in struct array"); } PyArray_Descr* sub_dtype = reinterpret_cast(PyTuple_GET_ITEM(tup, 0)); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index a77cebc7e7d50..f5e6a5776071d 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -402,10 +402,7 @@ class TimestampConverter : public TypedConverter type; RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); if (type->id() != Type::TIMESTAMP) { - std::ostringstream ss; - ss << "Expected np.datetime64 but got: "; - ss << type->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Expected np.datetime64 but got: ", type->ToString()); } const TimestampType& ttype = checked_cast(*type); if (unit_ != ttype.unit()) { @@ -705,10 +702,7 @@ Status ListConverter::AppendNdarrayItem(PyObject* obj) { return value_converter_->AppendSingleVirtual(obj); } default: { - std::stringstream ss; - ss << "Unknown list item type: "; - ss << value_type_->ToString(); - return Status::TypeError(ss.str()); + return Status::TypeError("Unknown list item type: ", value_type_->ToString()); } } } @@ -911,9 +905,8 @@ Status GetConverter(const std::shared_ptr& type, bool from_pandas, new StructConverter(from_pandas, strict_conversions)); break; default: - std::stringstream ss; - ss << "Sequence converter for type " << type->ToString() << " not implemented"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Sequence converter for type ", type->ToString(), + " not implemented"); } return Status::OK(); } diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 7911557ee73e0..ca94369be5157 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -407,10 +407,9 @@ Status CallCustomCallback(PyObject* context, PyObject* method_name, PyObject* el PyObject** result) { *result = NULL; if (context == Py_None) { - std::stringstream ss; - ss << "error while calling callback on " << internal::PyObject_StdStringRepr(elem) - << ": handler not registered"; - return Status::SerializationError(ss.str()); 
+ return Status::SerializationError("error while calling callback on ", + internal::PyObject_StdStringRepr(elem), + ": handler not registered"); } else { *result = PyObject_CallMethodObjArgs(context, method_name, elem, NULL); return PassPyError(); diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h index 7350deadcc67f..dc462972c57b7 100644 --- a/cpp/src/arrow/python/util/datetime.h +++ b/cpp/src/arrow/python/util/datetime.h @@ -199,9 +199,7 @@ static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, switch (unit) { case TimeUnit::NANO: if (val % 1000 != 0) { - std::stringstream ss; - ss << "Value " << val << " has non-zero nanoseconds"; - return Status::Invalid(ss.str()); + return Status::Invalid("Value ", val, " has non-zero nanoseconds"); } val /= 1000; // fall through diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 33287c19ffdde..baaf5cb17500f 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -95,16 +95,13 @@ class SimpleRecordBatch : public RecordBatch { DCHECK(column != nullptr); if (!field->type()->Equals(column->type())) { - std::stringstream ss; - ss << "Column data type " << field->type()->name() - << " does not match field data type " << column->type()->name(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column data type ", field->type()->name(), + " does not match field data type ", column->type()->name()); } if (column->length() != num_rows_) { - std::stringstream ss; - ss << "Added column's length must match record batch's length. Expected length " - << num_rows_ << " but got length " << column->length(); - return Status::Invalid(ss.str()); + return Status::Invalid( + "Added column's length must match record batch's length. 
Expected length ", + num_rows_, " but got length ", column->length()); } std::shared_ptr new_schema; @@ -229,17 +226,14 @@ Status RecordBatch::Validate() const { auto arr_shared = this->column_data(i); const ArrayData& arr = *arr_shared; if (arr.length != num_rows_) { - std::stringstream ss; - ss << "Number of rows in column " << i << " did not match batch: " << arr.length - << " vs " << num_rows_; - return Status::Invalid(ss.str()); + return Status::Invalid("Number of rows in column ", i, + " did not match batch: ", arr.length, " vs ", num_rows_); } const auto& schema_type = *schema_->field(i)->type(); if (!arr.type->Equals(schema_type)) { - std::stringstream ss; - ss << "Column " << i << " type not match schema: " << arr.type->ToString() << " vs " - << schema_type.ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, + " type not match schema: ", arr.type->ToString(), " vs ", + schema_type.ToString()); } } return Status::OK(); diff --git a/cpp/src/arrow/status.cc b/cpp/src/arrow/status.cc index 8be8b36d13bd8..db7f087149017 100644 --- a/cpp/src/arrow/status.cc +++ b/cpp/src/arrow/status.cc @@ -13,6 +13,7 @@ #include "arrow/status.h" #include +#include namespace arrow { diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index e3632a6d5f62e..12975afcc8100 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -25,34 +25,41 @@ #endif #include "arrow/util/macros.h" +#include "arrow/util/string_builder.h" #include "arrow/util/visibility.h" #ifdef ARROW_EXTRA_ERROR_CONTEXT -/// \brief Propagate any non-successful Status to the caller -#define ARROW_RETURN_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - std::stringstream ss; \ - ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \ - return ::arrow::Status(_s.code(), ss.str()); \ - } \ +/// \brief Return with given status if condition is met. +#define ARROW_RETURN_IF(condition, status) \ + do { \ + if (ARROW_PREDICT_FALSE(condition)) { \ + ::arrow::Status _s = (status); \ + std::stringstream ss; \ + ss << __FILE__ << ":" << __LINE__ << " : " << _s.message(); \ + return ::arrow::Status(_s.code(), ss.str()); \ + } \ } while (0) #else -/// \brief Propagate any non-successful Status to the caller -#define ARROW_RETURN_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - return _s; \ - } \ - } while (false) +#define ARROW_RETURN_IF(condition, status) \ + do { \ + if (ARROW_PREDICT_FALSE(condition)) { \ + return (status); \ + } \ + } while (0) #endif // ARROW_EXTRA_ERROR_CONTEXT +/// \brief Propagate any non-successful Status to the caller +#define ARROW_RETURN_NOT_OK(status) \ + do { \ + ::arrow::Status __s = (status); \ + ARROW_RETURN_IF(!__s.ok(), __s); \ + \ + } while (false) + #define RETURN_NOT_OK_ELSE(s, else_) \ do { \ ::arrow::Status _s = (s); \ @@ -62,17 +69,6 @@ } \ } while (false) -#define ARROW_RETURN_FAILURE_IF_FALSE(condition, status) \ - do { \ - if (!(condition)) { \ - Status _status = (status); \ - std::stringstream ss; \ - ss << __FILE__ << ":" << __LINE__ << " code: " << _status.CodeAsString() << " \n " \ - << _status.message(); \ - return ::arrow::Status(_status.code(), ss.str()); \ - } \ - } while (0) - // This is an internal-use macro and should not be used in public headers. 
#ifndef RETURN_NOT_OK #define RETURN_NOT_OK(s) ARROW_RETURN_NOT_OK(s) @@ -149,84 +145,119 @@ class ARROW_EXPORT Status { static Status OK() { return Status(); } /// Return a success status with a specific message - static Status OK(const std::string& msg) { return Status(StatusCode::OK, msg); } + template + static Status OK(Args&&... args) { + return Status(StatusCode::OK, util::StringBuilder(std::forward(args)...)); + } /// Return an error status for out-of-memory conditions - static Status OutOfMemory(const std::string& msg) { - return Status(StatusCode::OutOfMemory, msg); + template + static Status OutOfMemory(Args&&... args) { + return Status(StatusCode::OutOfMemory, + util::StringBuilder(std::forward(args)...)); } /// Return an error status for failed key lookups (e.g. column name in a table) - static Status KeyError(const std::string& msg) { - return Status(StatusCode::KeyError, msg); + template + static Status KeyError(Args&&... args) { + return Status(StatusCode::KeyError, util::StringBuilder(std::forward(args)...)); } /// Return an error status for type errors (such as mismatching data types) - static Status TypeError(const std::string& msg) { - return Status(StatusCode::TypeError, msg); + template + static Status TypeError(Args&&... args) { + return Status(StatusCode::TypeError, + util::StringBuilder(std::forward(args)...)); } /// Return an error status for unknown errors - static Status UnknownError(const std::string& msg) { - return Status(StatusCode::UnknownError, msg); + template + static Status UnknownError(Args&&... args) { + return Status(StatusCode::UnknownError, + util::StringBuilder(std::forward(args)...)); } /// Return an error status when an operation or a combination of operation and /// data types is unimplemented - static Status NotImplemented(const std::string& msg) { - return Status(StatusCode::NotImplemented, msg); + template + static Status NotImplemented(Args&&... args) { + return Status(StatusCode::NotImplemented, + util::StringBuilder(std::forward(args)...)); } /// Return an error status for invalid data (for example a string that fails parsing) - static Status Invalid(const std::string& msg) { - return Status(StatusCode::Invalid, msg); + template + static Status Invalid(Args&&... args) { + return Status(StatusCode::Invalid, util::StringBuilder(std::forward(args)...)); } /// Return an error status when a container's capacity would exceed its limits - static Status CapacityError(const std::string& msg) { - return Status(StatusCode::CapacityError, msg); + template + static Status CapacityError(Args&&... args) { + return Status(StatusCode::CapacityError, + util::StringBuilder(std::forward(args)...)); } /// Return an error status when some IO-related operation failed - static Status IOError(const std::string& msg) { - return Status(StatusCode::IOError, msg); + template + static Status IOError(Args&&... args) { + return Status(StatusCode::IOError, util::StringBuilder(std::forward(args)...)); } /// Return an error status when some (de)serialization operation failed - static Status SerializationError(const std::string& msg) { - return Status(StatusCode::SerializationError, msg); + template + static Status SerializationError(Args&&... args) { + return Status(StatusCode::SerializationError, + util::StringBuilder(std::forward(args)...)); } - static Status RError(const std::string& msg) { return Status(StatusCode::RError, msg); } + template + static Status RError(Args&&... 
args) { + return Status(StatusCode::RError, util::StringBuilder(std::forward(args)...)); + } - static Status PlasmaObjectExists(const std::string& msg) { - return Status(StatusCode::PlasmaObjectExists, msg); + template + static Status PlasmaObjectExists(Args&&... args) { + return Status(StatusCode::PlasmaObjectExists, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaObjectNonexistent(const std::string& msg) { - return Status(StatusCode::PlasmaObjectNonexistent, msg); + template + static Status PlasmaObjectNonexistent(Args&&... args) { + return Status(StatusCode::PlasmaObjectNonexistent, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaObjectAlreadySealed(const std::string& msg) { - return Status(StatusCode::PlasmaObjectAlreadySealed, msg); + template + static Status PlasmaObjectAlreadySealed(Args&&... args) { + return Status(StatusCode::PlasmaObjectAlreadySealed, + util::StringBuilder(std::forward(args)...)); } - static Status PlasmaStoreFull(const std::string& msg) { - return Status(StatusCode::PlasmaStoreFull, msg); + template + static Status PlasmaStoreFull(Args&&... args) { + return Status(StatusCode::PlasmaStoreFull, + util::StringBuilder(std::forward(args)...)); } static Status StillExecuting() { return Status(StatusCode::StillExecuting, ""); } - static Status CodeGenError(const std::string& msg) { - return Status(StatusCode::CodeGenError, msg); + template + static Status CodeGenError(Args&&... args) { + return Status(StatusCode::CodeGenError, + util::StringBuilder(std::forward(args)...)); } - static Status ExpressionValidationError(const std::string& msg) { - return Status(StatusCode::ExpressionValidationError, msg); + template + static Status ExpressionValidationError(Args&&... args) { + return Status(StatusCode::ExpressionValidationError, + util::StringBuilder(std::forward(args)...)); } - static Status ExecutionError(const std::string& msg) { - return Status(StatusCode::ExecutionError, msg); + template + static Status ExecutionError(Args&&... args) { + return Status(StatusCode::ExecutionError, + util::StringBuilder(std::forward(args)...)); } /// Return true iff the status indicates success. diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 1f3d927ddd62b..d232ac35e30c7 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -234,10 +234,8 @@ Status Column::ValidateData() { for (int i = 0; i < data_->num_chunks(); ++i) { std::shared_ptr type = data_->chunk(i)->type(); if (!this->type()->Equals(type)) { - std::stringstream ss; - ss << "In chunk " << i << " expected type " << this->type()->ToString() - << " but saw " << type->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("In chunk ", i, " expected type ", this->type()->ToString(), + " but saw ", type->ToString()); } } return Status::OK(); @@ -301,10 +299,9 @@ class SimpleTable : public Table { DCHECK(col != nullptr); if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Added column's length must match table's length. Expected length " - << num_rows_ << " but got length " << col->length(); - return Status::Invalid(ss.str()); + return Status::Invalid( + "Added column's length must match table's length. Expected length ", num_rows_, + " but got length ", col->length()); } std::shared_ptr new_schema; @@ -319,10 +316,9 @@ class SimpleTable : public Table { DCHECK(col != nullptr); if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Added column's length must match table's length. 
Expected length " - << num_rows_ << " but got length " << col->length(); - return Status::Invalid(ss.str()); + return Status::Invalid( + "Added column's length must match table's length. Expected length ", num_rows_, + " but got length ", col->length()); } std::shared_ptr new_schema; @@ -363,15 +359,11 @@ class SimpleTable : public Table { for (int i = 0; i < num_columns(); ++i) { const Column* col = columns_[i].get(); if (col == nullptr) { - std::stringstream ss; - ss << "Column " << i << " was null"; - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, " was null"); } if (!col->field()->Equals(*schema_->field(i))) { - std::stringstream ss; - ss << "Column field " << i << " named " << col->name() - << " is inconsistent with schema"; - return Status::Invalid(ss.str()); + return Status::Invalid("Column field ", i, " named ", col->name(), + " is inconsistent with schema"); } } @@ -379,10 +371,8 @@ class SimpleTable : public Table { for (int i = 0; i < num_columns(); ++i) { const Column* col = columns_[i].get(); if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Column " << i << " named " << col->name() << " expected length " - << num_rows_ << " but got length " << col->length(); - return Status::Invalid(ss.str()); + return Status::Invalid("Column ", i, " named ", col->name(), " expected length ", + num_rows_, " but got length ", col->length()); } } return Status::OK(); @@ -414,11 +404,9 @@ Status Table::FromRecordBatches(const std::shared_ptr& schema, for (int i = 0; i < nbatches; ++i) { if (!batches[i]->schema()->Equals(*schema, false)) { - std::stringstream ss; - ss << "Schema at index " << static_cast(i) << " was different: \n" - << schema->ToString() << "\nvs\n" - << batches[i]->schema()->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Schema at index ", static_cast(i), + " was different: \n", schema->ToString(), "\nvs\n", + batches[i]->schema()->ToString()); } } @@ -458,11 +446,9 @@ Status ConcatenateTables(const std::vector>& tables, for (int i = 1; i < ntables; ++i) { if (!tables[i]->schema()->Equals(*schema, false)) { - std::stringstream ss; - ss << "Schema at index " << static_cast(i) << " was different: \n" - << schema->ToString() << "\nvs\n" - << tables[i]->schema()->ToString(); - return Status::Invalid(ss.str()); + return Status::Invalid("Schema at index ", static_cast(i), + " was different: \n", schema->ToString(), "\nvs\n", + tables[i]->schema()->ToString()); } } diff --git a/cpp/src/arrow/util/compression_brotli.cc b/cpp/src/arrow/util/compression_brotli.cc index 89d099d6a6067..3d75253e11d9f 100644 --- a/cpp/src/arrow/util/compression_brotli.cc +++ b/cpp/src/arrow/util/compression_brotli.cc @@ -81,9 +81,7 @@ class BrotliDecompressor : public Decompressor { Status BrotliError(const char* msg) { return Status::IOError(msg); } Status BrotliError(BrotliDecoderErrorCode code, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << BrotliDecoderErrorString(code); - return Status::IOError(ss.str()); + return Status::IOError(prefix_msg, BrotliDecoderErrorString(code)); } BrotliDecoderState* state_ = nullptr; diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 97fd46ab6c587..d157ba6176054 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -31,6 +31,10 @@ namespace arrow { namespace util { +static Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { + return Status::IOError(prefix_msg, 
LZ4F_getErrorName(ret)); +} + // ---------------------------------------------------------------------- // Lz4 decompressor implementation @@ -79,12 +83,6 @@ class LZ4Decompressor : public Decompressor { bool IsFinished() override { return finished_; } protected: - Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << LZ4F_getErrorName(ret); - return Status::IOError(ss.str()); - } - LZ4F_dctx* ctx_ = nullptr; bool finished_; }; @@ -125,12 +123,6 @@ class LZ4Compressor : public Compressor { bool* should_retry) override; protected: - Status LZ4Error(LZ4F_errorCode_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << LZ4F_getErrorName(ret); - return Status::IOError(ss.str()); - } - LZ4F_cctx* ctx_ = nullptr; LZ4F_preferences_t prefs_; bool first_time_; diff --git a/cpp/src/arrow/util/compression_snappy.cc b/cpp/src/arrow/util/compression_snappy.cc index 1b483e5855209..058593fe13d4e 100644 --- a/cpp/src/arrow/util/compression_snappy.cc +++ b/cpp/src/arrow/util/compression_snappy.cc @@ -57,10 +57,8 @@ Status SnappyCodec::Decompress(int64_t input_len, const uint8_t* input, return Status::IOError("Corrupt snappy compressed data."); } if (output_buffer_len < static_cast(decompressed_size)) { - std::stringstream ss; - ss << "Output buffer size (" << output_buffer_len << ") must be " << decompressed_size - << " or larger."; - return Status::Invalid(ss.str()); + return Status::Invalid("Output buffer size (", output_buffer_len, ") must be ", + decompressed_size, " or larger."); } if (output_len) { *output_len = static_cast(decompressed_size); diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index 686dffa640940..dfda317e3bf36 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -76,6 +76,10 @@ static int DecompressionWindowBitsForFormat(GZipCodec::Format format) { } } +static Status ZlibErrorPrefix(const char* prefix_msg, const char* msg) { + return Status::IOError(prefix_msg, (msg) ? 
msg : "(unknown error)"); +} + // ---------------------------------------------------------------------- // gzip decompressor implementation @@ -142,14 +146,7 @@ class GZipDecompressor : public Decompressor { protected: Status ZlibError(const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg; - if (stream_.msg && *stream_.msg) { - ss << stream_.msg; - } else { - ss << "(unknown error)"; - } - return Status::IOError(ss.str()); + return ZlibErrorPrefix(prefix_msg, stream_.msg); } z_stream stream_; @@ -197,14 +194,7 @@ class GZipCompressor : public Compressor { protected: Status ZlibError(const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg; - if (stream_.msg && *stream_.msg) { - ss << stream_.msg; - } else { - ss << "(unknown error)"; - } - return Status::IOError(ss.str()); + return ZlibErrorPrefix(prefix_msg, stream_.msg); } z_stream stream_; @@ -344,9 +334,7 @@ class GZipCodec::GZipCodecImpl { int window_bits = CompressionWindowBitsForFormat(format_); if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, kGZipDefaultCompressionLevel, Z_DEFAULT_STRATEGY)) != Z_OK) { - std::stringstream ss; - ss << "zlib deflateInit failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib deflateInit failed: ", stream_.msg); } compressor_initialized_ = true; return Status::OK(); @@ -367,9 +355,7 @@ class GZipCodec::GZipCodecImpl { // Initialize to run either deflate or zlib/gzip format int window_bits = DecompressionWindowBitsForFormat(format_); if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) { - std::stringstream ss; - ss << "zlib inflateInit failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib inflateInit failed: ", stream_.msg); } decompressor_initialized_ = true; return Status::OK(); @@ -401,9 +387,7 @@ class GZipCodec::GZipCodecImpl { // Reset the stream for this block if (inflateReset(&stream_) != Z_OK) { - std::stringstream ss; - ss << "zlib inflateReset failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib inflateReset failed: ", stream_.msg); } int ret = 0; @@ -425,18 +409,13 @@ class GZipCodec::GZipCodecImpl { if (ret == Z_STREAM_END || ret != Z_OK) break; // Failure, buffer was too small - std::stringstream ss; - ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length - << " OutputLength=" << output_buffer_length; - return Status::IOError(ss.str()); + return Status::IOError("Too small a buffer passed to GZipCodec. 
InputLength=", + input_length, " OutputLength=", output_buffer_length); } // Failure for some other reason if (ret != Z_STREAM_END) { - std::stringstream ss; - ss << "GZipCodec failed: "; - if (stream_.msg != NULL) ss << stream_.msg; - return Status::IOError(ss.str()); + return ZlibErrorPrefix("GZipCodec failed: ", stream_.msg); } if (output_length) { @@ -475,15 +454,12 @@ class GZipCodec::GZipCodecImpl { // small return Status::IOError("zlib deflate failed, output buffer too small"); } - std::stringstream ss; - ss << "zlib deflate failed: " << stream_.msg; - return Status::IOError(ss.str()); + + return ZlibErrorPrefix("zlib deflate failed: ", stream_.msg); } if (deflateReset(&stream_) != Z_OK) { - std::stringstream ss; - ss << "zlib deflateReset failed: " << std::string(stream_.msg); - return Status::IOError(ss.str()); + return ZlibErrorPrefix("zlib deflateReset failed: ", stream_.msg); } // Actual output length diff --git a/cpp/src/arrow/util/compression_zstd.cc b/cpp/src/arrow/util/compression_zstd.cc index 083cae99b9730..de9df8fc9492e 100644 --- a/cpp/src/arrow/util/compression_zstd.cc +++ b/cpp/src/arrow/util/compression_zstd.cc @@ -36,9 +36,7 @@ namespace util { constexpr int kZSTDDefaultCompressionLevel = 1; static Status ZSTDError(size_t ret, const char* prefix_msg) { - std::stringstream ss; - ss << prefix_msg << ZSTD_getErrorName(ret); - return Status::IOError(ss.str()); + return Status::IOError(prefix_msg, ZSTD_getErrorName(ret)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index c47ac82e8ce3c..f6e110561b275 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -345,9 +345,7 @@ Status Decimal128::FromString(const util::string_view& s, Decimal128* out, DecimalComponents dec; if (!ParseDecimalComponents(s.data(), s.size(), &dec)) { - std::stringstream ss; - ss << "The string '" << s << "' is not a valid decimal number"; - return Status::Invalid(ss.str()); + return Status::Invalid("The string '", s, "' is not a valid decimal number"); } std::string exponent_value = dec.exponent_sign + dec.exponent_digits; @@ -878,11 +876,9 @@ Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, // Fail if we overflow or truncate if (ARROW_PREDICT_FALSE(rescale_would_cause_data_loss)) { - std::stringstream buf; - buf << "Rescaling decimal value " << ToString(original_scale) - << " from original scale of " << original_scale << " to new scale of " - << new_scale << " would cause data loss"; - return Status::Invalid(buf.str()); + return Status::Invalid("Rescaling decimal value ", ToString(original_scale), + " from original scale of ", original_scale, + " to new scale of ", new_scale, " would cause data loss"); } return Status::OK(); @@ -909,11 +905,9 @@ Status Decimal128::FromBigEndian(const uint8_t* bytes, int32_t length, Decimal12 int64_t high, low; if (length < kMinDecimalBytes || length > kMaxDecimalBytes) { - std::ostringstream stream; - stream << "Length of byte array passed to Decimal128::FromBigEndian "; - stream << "was " << length << ", but must be between "; - stream << kMinDecimalBytes << " and " << kMaxDecimalBytes; - return Status::Invalid(stream.str()); + return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ", + "was ", length, ", but must be between ", kMinDecimalBytes, + " and ", kMaxDecimalBytes); } // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the diff --git 
a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index fe76d25eb41d0..f59a4a42abed6 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -149,9 +149,8 @@ class ARROW_EXPORT Decimal128 { constexpr auto max_value = std::numeric_limits::max(); const auto& self = *this; if (self < min_value || self > max_value) { - std::stringstream buf; - buf << "Invalid cast from Decimal128 to " << sizeof(T) << " byte integer"; - return Status::Invalid(buf.str()); + return Status::Invalid("Invalid cast from Decimal128 to ", sizeof(T), + " byte integer"); } *out = static_cast(low_bits_); return Status::OK(); diff --git a/cpp/src/arrow/util/io-util.cc b/cpp/src/arrow/util/io-util.cc index 74ad80691da94..5d67fe87fa0e5 100644 --- a/cpp/src/arrow/util/io-util.cc +++ b/cpp/src/arrow/util/io-util.cc @@ -113,10 +113,8 @@ static inline Status CheckFileOpResult(int ret, int errno_actual, const PlatformFilename& file_name, const char* opname) { if (ret == -1) { - std::stringstream ss; - ss << "Failed to " << opname << " file: " << file_name.string(); - ss << " , error: " << std::strerror(errno_actual); - return Status::IOError(ss.str()); + return Status::IOError("Failed to ", opname, " file: ", file_name.string(), + " , error: ", std::strerror(errno_actual)); } return Status::OK(); } @@ -232,12 +230,18 @@ Status CreatePipe(int fd[2]) { #endif if (ret == -1) { - return Status::IOError(std::string("Error creating pipe: ") + - std::string(strerror(errno))); + return Status::IOError("Error creating pipe: ", std::strerror(errno)); } return Status::OK(); } +static Status StatusFromErrno(const char* prefix) { +#ifdef _WIN32 + errno = __map_mman_error(GetLastError(), EPERM); +#endif + return Status::IOError(prefix, std::strerror(errno)); +} + // // Compatible way to remap a memory map // @@ -251,18 +255,12 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, HANDLE fm, h; if (!UnmapViewOfFile(addr)) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "UnmapViewOfFile failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("UnmapViewOfFile failed: "); } h = reinterpret_cast(_get_osfhandle(fildes)); if (h == INVALID_HANDLE_VALUE) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "cannot get file handle: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("Cannot get file handle: "); } LONG new_size_low = static_cast(new_size & 0xFFFFFFFFL); @@ -272,18 +270,12 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, SetEndOfFile(h); fm = CreateFileMapping(h, NULL, PAGE_READWRITE, 0, 0, ""); if (fm == NULL) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "mremap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("CreateFileMapping failed: "); } *new_addr = MapViewOfFile(fm, FILE_MAP_WRITE, 0, 0, new_size); CloseHandle(fm); if (new_addr == NULL) { - errno = __map_mman_error(GetLastError(), EPERM); - std::stringstream ss; - ss << "mremap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("MapViewOfFile failed: "); } return Status::OK(); #else @@ -291,26 +283,26 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, // we have to close the mmap first, truncate the file to the new size // and recreate the mmap if (munmap(addr, old_size) == -1) { - 
std::stringstream ss; - ss << "munmap failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("munmap failed: "); } if (ftruncate(fildes, new_size) == -1) { - std::stringstream ss; - ss << "cannot truncate file: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("ftruncate failed: "); } // we set READ / WRITE flags on the new map, since we could only have // unlarged a RW map in the first place *new_addr = mmap(NULL, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, fildes, 0); + if (*new_addr == MAP_FAILED) { + return StatusFromErrno("mmap failed: "); + } return Status::OK(); #else if (ftruncate(fildes, new_size) == -1) { - std::stringstream ss; - ss << "file truncate failed: " << std::strerror(errno); - return Status::IOError(ss.str()); + return StatusFromErrno("ftruncate failed: "); } *new_addr = mremap(addr, old_size, new_size, MREMAP_MAYMOVE); + if (*new_addr == MAP_FAILED) { + return StatusFromErrno("mremap failed: "); + } return Status::OK(); #endif #endif diff --git a/cpp/src/arrow/util/string_builder.h b/cpp/src/arrow/util/string_builder.h new file mode 100644 index 0000000000000..7b3e10742a9a9 --- /dev/null +++ b/cpp/src/arrow/util/string_builder.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. template + +#ifndef ARROW_UTIL_STRING_BUILDER_H +#define ARROW_UTIL_STRING_BUILDER_H + +#include +#include +#include + +namespace arrow { +namespace util { + +template +void StringBuilderRecursive(std::stringstream& stream, Head&& head) { + stream << head; +} + +template +void StringBuilderRecursive(std::stringstream& stream, Head&& head, Tail&&... tail) { + StringBuilderRecursive(stream, std::forward(head)); + StringBuilderRecursive(stream, std::forward(tail)...); +} + +template +std::string StringBuilder(Args&&... 
args) { + std::stringstream stream; + + StringBuilderRecursive(stream, std::forward(args)...); + + return stream.str(); +} + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_STRING_BUILDER_H diff --git a/cpp/src/gandiva/date_utils.cc b/cpp/src/gandiva/date_utils.cc index 2686b193500ff..8a7e1f03fbd20 100644 --- a/cpp/src/gandiva/date_utils.cc +++ b/cpp/src/gandiva/date_utils.cc @@ -75,11 +75,8 @@ Status DateUtils::ToInternalFormat(const std::string& format, buffer.str(""); continue; } else { - if (buffer.str().length() > 0) { - std::stringstream err_msg; - err_msg << "Invalid date format string '" << format << "' at position " << i; - return Status::Invalid(err_msg.str()); - } + ARROW_RETURN_IF(buffer.str().length() > 0, + Status::Invalid("Invalid date format string '", format, "'")); is_in_quoted_text = true; continue; @@ -156,10 +153,7 @@ Status DateUtils::ToInternalFormat(const std::string& format, } } } else { - // no potential matches found - std::stringstream err_msg; - err_msg << "Invalid date format string '" << format << "' at position " << i; - return Status::Invalid(err_msg.str()); + return Status::Invalid("Invalid date format string '", format, "'"); } } @@ -170,11 +164,10 @@ Status DateUtils::ToInternalFormat(const std::string& format, if (exactMatches.size() == 1 && exactMatches[0].length() == buffer.str().length()) { builder << sql_date_format_to_boost_map_[exactMatches[0]]; } else { - // we didn't successfully parse the entire string + // Format partially parsed int64_t pos = format.length() - buffer.str().length(); - std::stringstream err_msg; - err_msg << "Invalid date format string '" << format << "' at position " << pos; - return Status::Invalid(err_msg.str()); + return Status::Invalid("Invalid date format string '", format, "' at position ", + pos); } } std::string final_pattern = builder.str(); diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 59884c5b4ad44..da7a6d886c0e0 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -103,12 +103,11 @@ Status Engine::LoadPreCompiledIRFiles(const std::string& byte_code_file_path) { /// Read from file into memory buffer. llvm::ErrorOr> buffer_or_error = llvm::MemoryBuffer::getFile(byte_code_file_path); - if (!buffer_or_error) { - std::stringstream ss; - ss << "Could not load module from IR " << byte_code_file_path << ": " - << buffer_or_error.getError().message(); - return Status::CodeGenError(ss.str()); - } + ARROW_RETURN_IF( + !buffer_or_error, + Status::CodeGenError("Could not load module from IR ", byte_code_file_path, ": ", + buffer_or_error.getError().message())); + std::unique_ptr buffer = move(buffer_or_error.get()); /// Parse the IR module. @@ -123,15 +122,11 @@ Status Engine::LoadPreCompiledIRFiles(const std::string& byte_code_file_path) { } std::unique_ptr ir_module = move(module_or_error.get()); - /// Verify the IR module - if (llvm::verifyModule(*ir_module, &llvm::errs())) { - return Status::CodeGenError("verify of IR Module failed"); - } + ARROW_RETURN_IF(llvm::verifyModule(*ir_module, &llvm::errs()), + Status::CodeGenError("verify of IR Module failed")); + ARROW_RETURN_IF(llvm::Linker::linkModules(*module_, move(ir_module)), + Status::CodeGenError("failed to link IR Modules")); - // Link this to the primary module. 
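For reference, the string_builder.h helper added earlier in this patch reduces to streaming each argument into one std::stringstream. A self-contained sketch of that shape (the demo namespace, main(), and the reconstructed template parameter lists are illustrative, not the exact header contents):

```
#include <iostream>
#include <sstream>
#include <string>
#include <utility>

namespace demo {

// Base case: stream the last argument.
template <typename Head>
void StringBuilderRecursive(std::stringstream& stream, Head&& head) {
  stream << head;
}

// Recursive case: stream the head, then the rest.
template <typename Head, typename... Tail>
void StringBuilderRecursive(std::stringstream& stream, Head&& head, Tail&&... tail) {
  stream << head;
  StringBuilderRecursive(stream, std::forward<Tail>(tail)...);
}

template <typename... Args>
std::string StringBuilder(Args&&... args) {
  std::stringstream stream;
  StringBuilderRecursive(stream, std::forward<Args>(args)...);
  return stream.str();
}

}  // namespace demo

int main() {
  // Mirrors how the variadic Status factories assemble messages.
  std::cout << demo::StringBuilder("In chunk ", 3, " expected type ", "int32") << "\n";
  return 0;
}
```

The variadic Status factories simply forward their arguments to this helper, which is what enables call sites such as Status::Invalid("Column ", i, " was null") throughout the rest of the patch.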
- if (llvm::Linker::linkModules(*module_, move(ir_module))) { - return Status::CodeGenError("failed to link IR Modules"); - } return Status::OK(); } @@ -197,13 +192,13 @@ Status Engine::FinalizeModule(bool optimise_ir, bool dump_ir) { } } - if (llvm::verifyModule(*module_, &llvm::errs())) { - return Status::CodeGenError("verify of module failed after optimisation passes"); - } + ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()), + Status::CodeGenError("Module verification failed after optimizer")); // do the compilation execution_engine_->finalizeObject(); module_finalized_ = true; + return Status::OK(); } diff --git a/cpp/src/gandiva/expr_validator.cc b/cpp/src/gandiva/expr_validator.cc index 3f5d63745f942..43de9d7a053f8 100644 --- a/cpp/src/gandiva/expr_validator.cc +++ b/cpp/src/gandiva/expr_validator.cc @@ -24,133 +24,114 @@ namespace gandiva { Status ExprValidator::Validate(const ExpressionPtr& expr) { - if (expr == nullptr) { - return Status::ExpressionValidationError("Expression cannot be null."); - } + ARROW_RETURN_IF(expr == nullptr, + Status::ExpressionValidationError("Expression cannot be null")); + Node& root = *expr->root(); - Status status = root.Accept(*this); - if (!status.ok()) { - return status; - } - // validate return type matches - // no need to check if type is supported - // since root type has been validated. - if (!root.return_type()->Equals(*expr->result()->type())) { - std::stringstream ss; - ss << "Return type of root node " << root.return_type()->name() - << " does not match that of expression " << *expr->result()->type(); - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_NOT_OK(root.Accept(*this)); + + // Ensure root's return type match the expression return type. Type + // support validation is not required because root type is already supported. + ARROW_RETURN_IF(!root.return_type()->Equals(*expr->result()->type()), + Status::ExpressionValidationError("Return type of root node ", + root.return_type()->name(), + " does not match that of expression ", + expr->result()->type()->name())); + return Status::OK(); } Status ExprValidator::Visit(const FieldNode& node) { auto llvm_type = types_->IRType(node.return_type()->id()); - if (llvm_type == nullptr) { - std::stringstream ss; - ss << "Field " << node.field()->name() << " has unsupported data type " - << node.return_type()->name(); - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(llvm_type == nullptr, + Status::ExpressionValidationError("Field ", node.field()->name(), + " has unsupported data type ", + node.return_type()->name())); + // Ensure that field is found in schema auto field_in_schema_entry = field_map_.find(node.field()->name()); + ARROW_RETURN_IF(field_in_schema_entry == field_map_.end(), + Status::ExpressionValidationError("Field ", node.field()->name(), + " not in schema.")); - // validate that field is in schema. - if (field_in_schema_entry == field_map_.end()) { - std::stringstream ss; - ss << "Field " << node.field()->name() << " not in schema."; - return Status::ExpressionValidationError(ss.str()); - } - + // Ensure that that the found field match. FieldPtr field_in_schema = field_in_schema_entry->second; - // validate that field matches the definition in schema. 
- if (!field_in_schema->Equals(node.field())) { - std::stringstream ss; - ss << "Field definition in schema " << field_in_schema->ToString() - << " different from field in expression " << node.field()->ToString(); - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(!field_in_schema->Equals(node.field()), + Status::ExpressionValidationError( + "Field definition in schema ", field_in_schema->ToString(), + " different from field in expression ", node.field()->ToString())); + return Status::OK(); } Status ExprValidator::Visit(const FunctionNode& node) { auto desc = node.descriptor(); FunctionSignature signature(desc->name(), desc->params(), desc->return_type()); + const NativeFunction* native_function = registry_.LookupSignature(signature); - if (native_function == nullptr) { - std::stringstream ss; - ss << "Function " << signature.ToString() << " not supported yet. "; - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(native_function == nullptr, + Status::ExpressionValidationError("Function ", signature.ToString(), + " not supported yet. ")); for (auto& child : node.children()) { - Status status = child->Accept(*this); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(child->Accept(*this)); } + return Status::OK(); } Status ExprValidator::Visit(const IfNode& node) { - Status status = node.condition()->Accept(*this); - ARROW_RETURN_NOT_OK(status); - status = node.then_node()->Accept(*this); - ARROW_RETURN_NOT_OK(status); - status = node.else_node()->Accept(*this); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(node.condition()->Accept(*this)); + ARROW_RETURN_NOT_OK(node.then_node()->Accept(*this)); + ARROW_RETURN_NOT_OK(node.else_node()->Accept(*this)); auto if_node_ret_type = node.return_type(); auto then_node_ret_type = node.then_node()->return_type(); auto else_node_ret_type = node.else_node()->return_type(); - if (!if_node_ret_type->Equals(*then_node_ret_type)) { - std::stringstream ss; - ss << "Return type of if " << *if_node_ret_type << " and then " << *then_node_ret_type - << " not matching."; - return Status::ExpressionValidationError(ss.str()); - } + // Then-branch return type must match. + ARROW_RETURN_IF(!if_node_ret_type->Equals(*then_node_ret_type), + Status::ExpressionValidationError( + "Return type of if ", if_node_ret_type->ToString(), " and then ", + then_node_ret_type->ToString(), " not matching.")); - if (!if_node_ret_type->Equals(*else_node_ret_type)) { - std::stringstream ss; - ss << "Return type of if " << *if_node_ret_type << " and else " << *else_node_ret_type - << " not matching."; - return Status::ExpressionValidationError(ss.str()); - } + // Else-branch return type must match. 
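The ARROW_RETURN_IF conversions in this validator all follow one shape: evaluate a condition and return the supplied Status early when it holds. A minimal stand-in, with a toy Status type and macro purely for illustration (the real definitions live in Arrow's headers):

```
#include <iostream>
#include <string>

// Toy Status type, standing in for arrow::Status.
struct Status {
  bool ok;
  std::string message;
  static Status OK() { return {true, ""}; }
  static Status Invalid(const std::string& msg) { return {false, msg}; }
};

// Stand-in for ARROW_RETURN_IF: return `status` from the enclosing function
// when `condition` holds.
#define RETURN_IF(condition, status) \
  do {                               \
    if (condition) {                 \
      return (status);               \
    }                                \
  } while (0)

Status ValidateChildCount(int num_children) {
  RETURN_IF(num_children < 2,
            Status::Invalid("Boolean expression needs at least two children"));
  return Status::OK();
}

int main() {
  std::cout << ValidateChildCount(1).message << std::endl;
  return 0;
}
```

ARROW_RETURN_NOT_OK, also used heavily in these files, applies the same early-return idea to an expression that itself yields a Status.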
+ ARROW_RETURN_IF(!if_node_ret_type->Equals(*else_node_ret_type), + Status::ExpressionValidationError( + "Return type of if ", if_node_ret_type->ToString(), " and else ", + else_node_ret_type->ToString(), " not matching.")); return Status::OK(); } Status ExprValidator::Visit(const LiteralNode& node) { auto llvm_type = types_->IRType(node.return_type()->id()); - if (llvm_type == nullptr) { - std::stringstream ss; - ss << "Value " << node.holder() << " has unsupported data type " - << node.return_type()->name(); - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(llvm_type == nullptr, + Status::ExpressionValidationError("Value ", node.holder(), + " has unsupported data type ", + node.return_type()->name())); + return Status::OK(); } Status ExprValidator::Visit(const BooleanNode& node) { - Status status; - - if (node.children().size() < 2) { - std::stringstream ss; - ss << "Boolean expression has " << node.children().size() - << " children, expected atleast two"; - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF( + node.children().size() < 2, + Status::ExpressionValidationError("Boolean expression has ", node.children().size(), + " children, expected atleast two")); for (auto& child : node.children()) { - if (!child->return_type()->Equals(arrow::boolean())) { - std::stringstream ss; - ss << "Boolean expression has a child with return type " - << child->return_type()->name() << ", expected return type boolean"; - return Status::ExpressionValidationError(ss.str()); - } - - status = child->Accept(*this); - ARROW_RETURN_NOT_OK(status); + const auto bool_type = arrow::boolean(); + const auto ret_type = child->return_type(); + + ARROW_RETURN_IF(!ret_type->Equals(bool_type), + Status::ExpressionValidationError( + "Boolean expression has a child with return type ", + ret_type->ToString(), ", expected return type boolean")); + + ARROW_RETURN_NOT_OK(child->Accept(*this)); } + return Status::OK(); } @@ -178,18 +159,13 @@ Status ExprValidator::Visit(const InExpressionNode& node) { Status ExprValidator::ValidateInExpression(size_t number_of_values, DataTypePtr in_expr_return_type, DataTypePtr type_of_values) { - if (static_cast(number_of_values) == 0) { - std::stringstream ss; - ss << "IN Expression needs a non-empty constant list to match."; - return Status::ExpressionValidationError(ss.str()); - } - - if (!in_expr_return_type->Equals(type_of_values)) { - std::stringstream ss; - ss << "Evaluation expression for IN clause returns " << in_expr_return_type - << " values are of type" << type_of_values; - return Status::ExpressionValidationError(ss.str()); - } + ARROW_RETURN_IF(number_of_values == 0, + Status::ExpressionValidationError( + "IN Expression needs a non-empty constant list to match.")); + ARROW_RETURN_IF(!in_expr_return_type->Equals(type_of_values), + Status::ExpressionValidationError( + "Evaluation expression for IN clause returns ", in_expr_return_type, + " values are of type", type_of_values)); return Status::OK(); } diff --git a/cpp/src/gandiva/filter.cc b/cpp/src/gandiva/filter.cc index 7a24d9554ef3f..6075e2574559b 100644 --- a/cpp/src/gandiva/filter.cc +++ b/cpp/src/gandiva/filter.cc @@ -40,32 +40,28 @@ Filter::Filter(std::unique_ptr llvm_generator, SchemaPtr schema, Status Filter::Make(SchemaPtr schema, ConditionPtr condition, std::shared_ptr configuration, std::shared_ptr* filter) { - ARROW_RETURN_FAILURE_IF_FALSE(schema != nullptr, - Status::Invalid("schema cannot be null")); - ARROW_RETURN_FAILURE_IF_FALSE(condition != nullptr, - 
Status::Invalid("condition cannot be null")); - ARROW_RETURN_FAILURE_IF_FALSE(configuration != nullptr, - Status::Invalid("configuration cannot be null")); + ARROW_RETURN_IF(schema == nullptr, Status::Invalid("Schema cannot be null")); + ARROW_RETURN_IF(condition == nullptr, Status::Invalid("Condition cannot be null")); + ARROW_RETURN_IF(configuration == nullptr, + Status::Invalid("Configuration cannot be null")); + static Cache> cache; FilterCacheKey cache_key(schema, configuration, *(condition.get())); - std::shared_ptr cachedFilter = cache.GetModule(cache_key); + auto cachedFilter = cache.GetModule(cache_key); if (cachedFilter != nullptr) { *filter = cachedFilter; return Status::OK(); } + // Build LLVM generator, and generate code for the specified expression std::unique_ptr llvm_gen; - Status status = LLVMGenerator::Make(configuration, &llvm_gen); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, &llvm_gen)); // Run the validation on the expression. // Return if the expression is invalid since we will not be able to process further. ExprValidator expr_validator(llvm_gen->types(), schema); - status = expr_validator.Validate(condition); - ARROW_RETURN_NOT_OK(status); - - status = llvm_gen->Build({condition}); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(expr_validator.Validate(condition)); + ARROW_RETURN_NOT_OK(llvm_gen->Build({condition})); // Instantiate the filter with the completely built llvm generator *filter = std::make_shared(std::move(llvm_gen), schema, configuration); @@ -76,42 +72,33 @@ Status Filter::Make(SchemaPtr schema, ConditionPtr condition, Status Filter::Evaluate(const arrow::RecordBatch& batch, std::shared_ptr out_selection) { - if (!batch.schema()->Equals(*schema_)) { - return Status::Invalid("Schema in RecordBatch must match the schema in Make()"); - } - if (batch.num_rows() == 0) { - return Status::Invalid("RecordBatch must be non-empty."); - } - if (out_selection == nullptr) { - return Status::Invalid("out_selection must be non-null."); - } - if (out_selection->GetMaxSlots() < batch.num_rows()) { - std::stringstream ss; - ss << "out_selection has " << out_selection->GetMaxSlots() - << " slots, which is less than the batch size " << batch.num_rows(); - return Status::Invalid(ss.str()); - } + const auto num_rows = batch.num_rows(); + ARROW_RETURN_IF(!batch.schema()->Equals(*schema_), + Status::Invalid("RecordBatch schema must expected filter schema")); + ARROW_RETURN_IF(num_rows == 0, Status::Invalid("RecordBatch must be non-empty.")); + ARROW_RETURN_IF(out_selection == nullptr, + Status::Invalid("out_selection must be non-null.")); + ARROW_RETURN_IF(out_selection->GetMaxSlots() < num_rows, + Status::Invalid("Output selection vector capacity too small")); // Allocate three local_bitmaps (one for output, one for validity, one to compute the // intersection). - LocalBitMapsHolder bitmaps(batch.num_rows(), 3 /*local_bitmaps*/); + LocalBitMapsHolder bitmaps(num_rows, 3 /*local_bitmaps*/); int64_t bitmap_size = bitmaps.GetLocalBitMapSize(); auto validity = std::make_shared(bitmaps.GetLocalBitMap(0), bitmap_size); auto value = std::make_shared(bitmaps.GetLocalBitMap(1), bitmap_size); - auto array_data = - arrow::ArrayData::Make(arrow::boolean(), batch.num_rows(), {validity, value}); + auto array_data = arrow::ArrayData::Make(arrow::boolean(), num_rows, {validity, value}); // Execute the expression(s). 
- auto status = llvm_generator_->Execute(batch, {array_data}); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(llvm_generator_->Execute(batch, {array_data})); // Compute the intersection of the value and validity. auto result = bitmaps.GetLocalBitMap(2); BitMapAccumulator::IntersectBitMaps( - result, {bitmaps.GetLocalBitMap(0), bitmaps.GetLocalBitMap((1))}, batch.num_rows()); + result, {bitmaps.GetLocalBitMap(0), bitmaps.GetLocalBitMap((1))}, num_rows); - return out_selection->PopulateFromBitMap(result, bitmap_size, batch.num_rows() - 1); + return out_selection->PopulateFromBitMap(result, bitmap_size, num_rows - 1); } } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index d659b22c46e34..051b75b7dc137 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -50,39 +50,40 @@ const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) { } } - // didn't hit any of the optimisation paths. return original. + // Could not optimize, return original node. return node; } +static bool IsArrowStringLiteral(arrow::Type::type type) { + return type == arrow::Type::STRING || type == arrow::Type::BINARY; +} + Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* holder) { - if (node.children().size() != 2) { - return Status::Invalid("'like' function requires two parameters"); - } + ARROW_RETURN_IF(node.children().size() != 2, + Status::Invalid("'like' function requires two parameters")); auto literal = dynamic_cast(node.children().at(1).get()); - if (literal == nullptr) { - return Status::Invalid("'like' function requires a literal as the second parameter"); - } + ARROW_RETURN_IF( + literal == nullptr, + Status::Invalid("'like' function requires a literal as the second parameter")); auto literal_type = literal->return_type()->id(); - if (literal_type != arrow::Type::STRING && literal_type != arrow::Type::BINARY) { - return Status::Invalid( - "'like' function requires a string literal as the second parameter"); - } - auto pattern = boost::get(literal->holder()); - return Make(pattern, holder); + ARROW_RETURN_IF( + !IsArrowStringLiteral(literal_type), + Status::Invalid( + "'like' function requires a string literal as the second parameter")); + + return Make(boost::get(literal->holder()), holder); } Status LikeHolder::Make(const std::string& sql_pattern, std::shared_ptr* holder) { std::string pcre_pattern; - auto status = RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); - if (!lholder->regex_.ok()) { - return Status::Invalid("building re2 regex failed for pattern " + pcre_pattern); - } + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); *holder = lholder; return Status::OK(); diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 82d0386cfb9f3..50f147b2fc7dd 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -44,10 +44,10 @@ LLVMGenerator::LLVMGenerator() Status LLVMGenerator::Make(std::shared_ptr config, std::unique_ptr* llvm_generator) { std::unique_ptr llvmgen_obj(new LLVMGenerator()); - Status status = Engine::Make(config, &(llvmgen_obj->engine_)); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(Engine::Make(config, &(llvmgen_obj->engine_))); *llvm_generator = 
std::move(llvmgen_obj); + return Status::OK(); } @@ -57,33 +57,29 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out // decompose the expression to separate out value and validities. ExprDecomposer decomposer(function_registry_, annotator_); ValueValidityPairPtr value_validity; - auto status = decomposer.Decompose(*expr->root(), &value_validity); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(decomposer.Decompose(*expr->root(), &value_validity)); // Generate the IR function for the decomposed expression. llvm::Function* ir_function = nullptr; - status = CodeGenExprValue(value_validity->value_expr(), output, idx, &ir_function); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK( + CodeGenExprValue(value_validity->value_expr(), output, idx, &ir_function)); std::unique_ptr compiled_expr( new CompiledExpr(value_validity, output, ir_function)); compiled_exprs_.push_back(std::move(compiled_expr)); + return Status::OK(); } /// Build and optimise module for projection expression. Status LLVMGenerator::Build(const ExpressionVector& exprs) { - Status status; - for (auto& expr : exprs) { auto output = annotator_.AddOutputFieldDescriptor(expr->result()); - status = Add(expr, output); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(Add(expr, output)); } - // optimise, compile and finalize the module - status = engine_->FinalizeModule(optimise_ir_, dump_ir_); - ARROW_RETURN_NOT_OK(status); + // Optimize, compile and finalize the module + ARROW_RETURN_NOT_OK(engine_->FinalizeModule(optimise_ir_, dump_ir_)); // setup the jit functions for each expression. for (auto& compiled_expr : compiled_exprs_) { @@ -91,6 +87,7 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs) { EvalFunc fn = reinterpret_cast(engine_->CompiledFunction(ir_func)); compiled_expr->set_jit_function(fn); } + return Status::OK(); } @@ -107,13 +104,15 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, EvalFunc jit_function = compiled_expr->jit_function(); jit_function(eval_batch->GetBufferArray(), eval_batch->GetLocalBitMapArray(), (int64_t)eval_batch->GetExecutionContext(), record_batch.num_rows()); - // check for execution errors - if (eval_batch->GetExecutionContext()->has_error()) { - return Status::ExecutionError(eval_batch->GetExecutionContext()->get_error()); - } + + ARROW_RETURN_IF( + eval_batch->GetExecutionContext()->has_error(), + Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); + // generate validity vectors. 
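As context for the validity-vector step below: for the common null-if-null functions, an output row is valid only when every input row it depends on is valid, so the output validity bitmap is the bitwise AND of the input validity bitmaps. A purely illustrative, byte-wise sketch (not gandiva's implementation):

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Output validity = AND of input validities (null-if-null semantics).
std::vector<uint8_t> AndValidity(const std::vector<uint8_t>& a,
                                 const std::vector<uint8_t>& b) {
  std::vector<uint8_t> out(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) {
    out[i] = a[i] & b[i];  // a row is valid only when it is valid in both inputs
  }
  return out;
}
```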
ComputeBitMapsForExpr(*compiled_expr, *eval_batch); } + return Status::OK(); } @@ -233,8 +232,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, FieldDescriptorPtr out engine_->AddFunctionToCompile(func_name); *fn = llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage, func_name, module()); - ARROW_RETURN_FAILURE_IF_FALSE((*fn != nullptr), - Status::CodeGenError("Error creating function.")); + ARROW_RETURN_IF((*fn == nullptr), Status::CodeGenError("Error creating function.")); + // Name the arguments llvm::Function::arg_iterator args = (*fn)->arg_begin(); llvm::Value* arg_addrs = &*args; @@ -396,6 +395,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, value = ir_builder()->CreateCall(fn, args, full_name); DCHECK(value->getType() == ret_type); } + return value; } diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 40fdc201133a4..d5902fc72f16d 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -45,12 +45,10 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, std::shared_ptr configuration, std::shared_ptr* projector) { - ARROW_RETURN_FAILURE_IF_FALSE(schema != nullptr, - Status::Invalid("schema cannot be null")); - ARROW_RETURN_FAILURE_IF_FALSE(!exprs.empty(), - Status::Invalid("expressions need to be non-empty")); - ARROW_RETURN_FAILURE_IF_FALSE(configuration != nullptr, - Status::Invalid("configuration cannot be null")); + ARROW_RETURN_IF(schema == nullptr, Status::Invalid("Schema cannot be null")); + ARROW_RETURN_IF(exprs.empty(), Status::Invalid("Expressions cannot be empty")); + ARROW_RETURN_IF(configuration == nullptr, + Status::Invalid("Configuration cannot be null")); // see if equivalent projector was already built static Cache> cache; @@ -63,23 +61,21 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, // Build LLVM generator, and generate code for the specified expressions std::unique_ptr llvm_gen; - Status status = LLVMGenerator::Make(configuration, &llvm_gen); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, &llvm_gen)); // Run the validation on the expressions. // Return if any of the expression is invalid since // we will not be able to process further. ExprValidator expr_validator(llvm_gen->types(), schema); for (auto& expr : exprs) { - status = expr_validator.Validate(expr); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(expr_validator.Validate(expr)); } - status = llvm_gen->Build(exprs); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(llvm_gen->Build(exprs)); // save the output field types. Used for validation at Evaluate() time. 
std::vector output_fields; + output_fields.reserve(exprs.size()); for (auto& expr : exprs) { output_fields.push_back(expr->result()); } @@ -94,86 +90,70 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, Status Projector::Evaluate(const arrow::RecordBatch& batch, const ArrayDataVector& output_data_vecs) { - Status status = ValidateEvaluateArgsCommon(batch); - ARROW_RETURN_NOT_OK(status); - - if (output_data_vecs.size() != output_fields_.size()) { - std::stringstream ss; - ss << "number of buffers for output_data_vecs is " << output_data_vecs.size() - << ", expected " << output_fields_.size(); - return Status::Invalid(ss.str()); - } + ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); + ARROW_RETURN_IF( + output_data_vecs.size() != output_fields_.size(), + Status::Invalid("Number of output buffers must match number of fields")); int idx = 0; for (auto& array_data : output_data_vecs) { + const auto output_field = output_fields_[idx]; if (array_data == nullptr) { - std::stringstream ss; - ss << "array for output field " << output_fields_[idx]->name() << "is null."; - return Status::Invalid(ss.str()); + return Status::Invalid("Output array for field ", output_field->name(), + " should not be null"); } - Status status = - ValidateArrayDataCapacity(*array_data, *(output_fields_[idx]), batch.num_rows()); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK( + ValidateArrayDataCapacity(*array_data, *output_field, batch.num_rows())); ++idx; } + return llvm_generator_->Execute(batch, output_data_vecs); } Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* pool, arrow::ArrayVector* output) { - Status status = ValidateEvaluateArgsCommon(batch); - ARROW_RETURN_NOT_OK(status); - - if (output == nullptr) { - return Status::Invalid("output must be non-null."); - } - - if (pool == nullptr) { - return Status::Invalid("memory pool must be non-null."); - } + ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); + ARROW_RETURN_IF(output == nullptr, Status::Invalid("Output must be non-null.")); + ARROW_RETURN_IF(pool == nullptr, Status::Invalid("Memory pool must be non-null.")); // Allocate the output data vecs. ArrayDataVector output_data_vecs; + output_data_vecs.reserve(output_fields_.size()); for (auto& field : output_fields_) { ArrayDataPtr output_data; - status = AllocArrayData(field->type(), batch.num_rows(), pool, &output_data); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK( + AllocArrayData(field->type(), batch.num_rows(), pool, &output_data)); output_data_vecs.push_back(output_data); } // Execute the expression(s). - status = llvm_generator_->Execute(batch, output_data_vecs); - ARROW_RETURN_NOT_OK(status); + ARROW_RETURN_NOT_OK(llvm_generator_->Execute(batch, output_data_vecs)); // Create and return array arrays. 
output->clear(); for (auto& array_data : output_data_vecs) { output->push_back(arrow::MakeArray(array_data)); } + return Status::OK(); } // TODO : handle variable-len vectors Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, arrow::MemoryPool* pool, ArrayDataPtr* array_data) { - if (!arrow::is_primitive(type->id())) { - return Status::Invalid("Unsupported output data type " + type->ToString()); - } + ARROW_RETURN_IF(!arrow::is_primitive(type->id()), + Status::Invalid("Unsupported output data type ", type)); - arrow::Status astatus; std::shared_ptr null_bitmap; - int64_t size = arrow::BitUtil::BytesForBits(num_records); - astatus = arrow::AllocateBuffer(pool, size, &null_bitmap); - ARROW_RETURN_NOT_OK(astatus); + int64_t bitmap_bytes = arrow::BitUtil::BytesForBits(num_records); + ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, bitmap_bytes, &null_bitmap)); std::shared_ptr data; const auto& fw_type = dynamic_cast(*type); int64_t data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width()); - astatus = arrow::AllocateBuffer(pool, data_len, &data); - ARROW_RETURN_NOT_OK(astatus); + ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, data_len, &data)); // Valgrind detects unitialized memory at byte level. Boolean types use bits // and can leave buffer memory uninitialized in the last byte. @@ -186,47 +166,33 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } Status Projector::ValidateEvaluateArgsCommon(const arrow::RecordBatch& batch) { - if (!batch.schema()->Equals(*schema_)) { - return Status::Invalid("Schema in RecordBatch must match the schema in Make()"); - } - if (batch.num_rows() == 0) { - return Status::Invalid("RecordBatch must be non-empty."); - } + ARROW_RETURN_IF(!batch.schema()->Equals(*schema_), + Status::Invalid("Schema in RecordBatch must match schema in Make()")); + ARROW_RETURN_IF(batch.num_rows() == 0, + Status::Invalid("RecordBatch must be non-empty.")); + return Status::OK(); } Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, const arrow::Field& field, int64_t num_records) { - // verify that there are atleast two buffers (validity and data). - if (array_data.buffers.size() < 2) { - std::stringstream ss; - ss << "number of buffers for output field " << field.name() << "is " - << array_data.buffers.size() << ", must have minimum 2."; - return Status::Invalid(ss.str()); - } + ARROW_RETURN_IF(array_data.buffers.size() < 2, + Status::Invalid("ArrayData must have at least 2 buffers")); - // verify size of bitmap buffer. int64_t min_bitmap_len = arrow::BitUtil::BytesForBits(num_records); int64_t bitmap_len = array_data.buffers[0]->capacity(); - if (bitmap_len < min_bitmap_len) { - std::stringstream ss; - ss << "bitmap buffer for output field " << field.name() << "has size " << bitmap_len - << ", must have minimum size " << min_bitmap_len; - return Status::Invalid(ss.str()); - } + ARROW_RETURN_IF(bitmap_len < min_bitmap_len, + Status::Invalid("Bitmap buffer too small for ", field.name())); // verify size of data buffer. 
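The capacity checks in ValidateArrayDataCapacity reduce to bit-to-byte arithmetic; a sketch of the minimum sizes being enforced (helper names here are illustrative):

```
#include <cstdint>

// Bits to bytes, rounded up (what arrow::BitUtil::BytesForBits computes).
int64_t BytesForBits(int64_t bits) { return (bits + 7) / 8; }

// Minimum sizes checked for a fixed-width output column holding num_rows values.
int64_t MinBitmapBytes(int64_t num_rows) { return BytesForBits(num_rows); }
int64_t MinDataBytes(int64_t num_rows, int bit_width) {
  return BytesForBits(num_rows * static_cast<int64_t>(bit_width));
}
```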
// TODO : handle variable-len vectors const auto& fw_type = dynamic_cast(*field.type()); int64_t min_data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width()); int64_t data_len = array_data.buffers[1]->capacity(); - if (data_len < min_data_len) { - std::stringstream ss; - ss << "data buffer for output field " << field.name() << " has size " << data_len - << ", must have minimum size " << min_data_len; - return Status::Invalid(ss.str()); - } + ARROW_RETURN_IF(data_len < min_data_len, + Status::Invalid("Data buffer too small for ", field.name())); + return Status::OK(); } diff --git a/cpp/src/gandiva/regex_util.cc b/cpp/src/gandiva/regex_util.cc index 893af095a3dd2..1d3860615d57f 100644 --- a/cpp/src/gandiva/regex_util.cc +++ b/cpp/src/gandiva/regex_util.cc @@ -38,20 +38,16 @@ Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char esca if (cur == escape_char) { // escape char must be followed by '_', '%' or the escape char itself. ++idx; - if (idx == sql_pattern.size()) { - std::stringstream msg; - msg << "unexpected escape char at the end of pattern " << sql_pattern; - return Status::Invalid(msg.str()); - } + ARROW_RETURN_IF( + idx == sql_pattern.size(), + Status::Invalid("Unexpected escape char at the end of pattern ", sql_pattern)); cur = sql_pattern.at(idx); if (cur == '_' || cur == '%' || cur == escape_char) { pcre_pattern += cur; } else { - std::stringstream msg; - msg << "invalid escape sequence in pattern " << sql_pattern << " at offset " - << idx; - return Status::Invalid(msg.str()); + return Status::Invalid("Invalid escape sequence in pattern ", sql_pattern, + " at offset ", idx); } } else if (cur == '_') { pcre_pattern += '.'; diff --git a/cpp/src/gandiva/selection_vector.cc b/cpp/src/gandiva/selection_vector.cc index 9266ca7fe1056..f89b80c2b510f 100644 --- a/cpp/src/gandiva/selection_vector.cc +++ b/cpp/src/gandiva/selection_vector.cc @@ -28,22 +28,15 @@ namespace gandiva { Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap_size, int64_t max_bitmap_index) { - if (bitmap_size % 8 != 0) { - std::stringstream ss; - ss << "bitmap size " << bitmap_size << " must be padded to 64-bit size"; - return Status::Invalid(ss.str()); - } - if (max_bitmap_index < 0) { - std::stringstream ss; - ss << "max bitmap index " << max_bitmap_index << " must be positive"; - return Status::Invalid(ss.str()); - } - if (static_cast(max_bitmap_index) > GetMaxSupportedValue()) { - std::stringstream ss; - ss << "max_bitmap_index " << max_bitmap_index << " must be <= maxSupportedValue " - << GetMaxSupportedValue() << " in selection vector"; - return Status::Invalid(ss.str()); - } + const uint64_t max_idx = static_cast(max_bitmap_index); + ARROW_RETURN_IF(bitmap_size % 8, Status::Invalid("Bitmap size ", bitmap_size, + " must be aligned to 64-bit size")); + ARROW_RETURN_IF(max_bitmap_index < 0, + Status::Invalid("Max bitmap index must be positive")); + ARROW_RETURN_IF( + max_idx > GetMaxSupportedValue(), + Status::Invalid("max_bitmap_index ", max_idx, " must be <= maxSupportedValue ", + GetMaxSupportedValue(), " in selection vector")); int64_t max_slots = GetMaxSlots(); @@ -64,9 +57,9 @@ Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap break; } - if (selection_idx >= max_slots) { - return Status::Invalid("selection vector has no remaining slots"); - } + ARROW_RETURN_IF(selection_idx >= max_slots, + Status::Invalid("selection vector has no remaining slots")); + SetIndex(selection_idx, pos_in_bitmap); ++selection_idx; 
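A simplified model of the PopulateFromBitMap loop above: scan the intersected bitmap and record the position of every set bit, here into an int16-style selection vector (illustrative only; the real code also honors max_bitmap_index and the slot limit):

```
#include <cstdint>
#include <vector>

std::vector<uint16_t> PopulateFromBitMapSketch(const uint8_t* bitmap, int64_t num_rows) {
  std::vector<uint16_t> selection;
  for (int64_t pos = 0; pos < num_rows; ++pos) {
    const bool is_set = (bitmap[pos / 8] >> (pos % 8)) & 1;
    if (is_set) {
      selection.push_back(static_cast<uint16_t>(pos));  // this row passes the filter
    }
  }
  return selection;
}
```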
@@ -81,60 +74,54 @@ Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap Status SelectionVector::MakeInt16(int64_t max_slots, std::shared_ptr buffer, std::shared_ptr* selection_vector) { - auto status = SelectionVectorInt16::ValidateBuffer(max_slots, buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt16::ValidateBuffer(max_slots, buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt16(int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr* selection_vector) { std::shared_ptr buffer; - auto status = SelectionVectorInt16::AllocateBuffer(max_slots, pool, &buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt16::AllocateBuffer(max_slots, pool, &buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt32(int64_t max_slots, std::shared_ptr buffer, std::shared_ptr* selection_vector) { - auto status = SelectionVectorInt32::ValidateBuffer(max_slots, buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt32::ValidateBuffer(max_slots, buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt32(int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr* selection_vector) { std::shared_ptr buffer; - auto status = SelectionVectorInt32::AllocateBuffer(max_slots, pool, &buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt32::AllocateBuffer(max_slots, pool, &buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt64(int64_t max_slots, std::shared_ptr buffer, std::shared_ptr* selection_vector) { - auto status = SelectionVectorInt64::ValidateBuffer(max_slots, buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt64::ValidateBuffer(max_slots, buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } Status SelectionVector::MakeInt64(int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr* selection_vector) { std::shared_ptr buffer; - auto status = SelectionVectorInt64::AllocateBuffer(max_slots, pool, &buffer); - ARROW_RETURN_NOT_OK(status); - + ARROW_RETURN_NOT_OK(SelectionVectorInt64::AllocateBuffer(max_slots, pool, &buffer)); *selection_vector = std::make_shared(max_slots, buffer); + return Status::OK(); } @@ -142,8 +129,7 @@ template Status SelectionVectorImpl::AllocateBuffer( int64_t max_slots, arrow::MemoryPool* pool, std::shared_ptr* buffer) { auto buffer_len = max_slots * sizeof(C_TYPE); - auto astatus = arrow::AllocateBuffer(pool, buffer_len, buffer); - ARROW_RETURN_NOT_OK(astatus); + ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, buffer_len, buffer)); return Status::OK(); } @@ -151,19 +137,13 @@ Status SelectionVectorImpl::AllocateBuffer( template Status SelectionVectorImpl::ValidateBuffer( int64_t max_slots, std::shared_ptr buffer) { - // verify buffer is mutable - if (!buffer->is_mutable()) { - return Status::Invalid("buffer for selection vector must be mutable"); - } + ARROW_RETURN_IF(!buffer->is_mutable(), + Status::Invalid("buffer for selection vector must be mutable")); + + const int64_t min_len = max_slots * sizeof(C_TYPE); + ARROW_RETURN_IF(buffer->size() < min_len, + Status::Invalid("Buffer for selection vector is too small")); - // verify size of buffer. 
- int64_t min_len = max_slots * sizeof(C_TYPE); - if (buffer->size() < min_len) { - std::stringstream ss; - ss << "buffer for selection_data has size " << buffer->size() - << ", must have minimum size " << min_len; - return Status::Invalid(ss.str()); - } return Status::OK(); } diff --git a/cpp/src/gandiva/tests/projector_build_validation_test.cc b/cpp/src/gandiva/tests/projector_build_validation_test.cc index ddcb729b3bfee..18f02957fd479 100644 --- a/cpp/src/gandiva/tests/projector_build_validation_test.cc +++ b/cpp/src/gandiva/tests/projector_build_validation_test.cc @@ -191,8 +191,6 @@ TEST_F(TestProjector, TestIfNotMatchingReturnType) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Return type of if bool and then int32 not matching."; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestElseNotMatchingReturnType) { @@ -218,8 +216,6 @@ TEST_F(TestProjector, TestElseNotMatchingReturnType) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Return type of if int32 and else bool not matching."; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestElseNotSupportedType) { @@ -245,8 +241,7 @@ TEST_F(TestProjector, TestElseNotSupportedType) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Field c has unsupported data type list"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); + EXPECT_EQ(status.code(), StatusCode::ExpressionValidationError); } TEST_F(TestProjector, TestAndMinChildren) { @@ -266,8 +261,6 @@ TEST_F(TestProjector, TestAndMinChildren) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Boolean expression has 1 children, expected atleast two"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } TEST_F(TestProjector, TestAndBooleanArgType) { @@ -289,10 +282,6 @@ TEST_F(TestProjector, TestAndBooleanArgType) { std::shared_ptr projector; Status status = Projector::Make(schema, {expr}, &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = - "Boolean expression has a child with return type int32, expected return type " - "boolean"; - EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } } // namespace gandiva diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 7830b6abc75d1..b5905fddff489 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -690,10 +690,8 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice int max_num = num_row_groups(); for (auto row_group_index : row_group_indices) { if (row_group_index < 0 || row_group_index >= max_num) { - std::ostringstream ss; - ss << "Some index in row_group_indices is " << row_group_index - << ", which is either < 0 or >= num_row_groups(" << max_num << ")"; - return Status::Invalid(ss.str()); + return Status::Invalid("Some index in row_group_indices is ", row_group_index, + ", which is either < 0 or >= num_row_groups(", max_num, ")"); } } @@ -1495,9 +1493,8 @@ 
Status PrimitiveImpl::NextBatch(int64_t records_to_read, TRANSFER_CASE(TIME32, ::arrow::Time32Type, Int32Type) TRANSFER_CASE(TIME64, ::arrow::Time64Type, Int64Type) default: - std::stringstream ss; - ss << "No support for reading columns of type " << field_->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("No support for reading columns of type ", + field_->type()->ToString()); } DCHECK_NE(result.kind(), Datum::NONE); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index af9fbc91a5042..fed0e59dfa330 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -80,10 +80,9 @@ static Status FromFLBA(const PrimitiveNode& node, std::shared_ptr* ou *out = MakeDecimal128Type(node); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for fixed-length binary array"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for fixed-length binary array"); } return Status::OK(); @@ -122,10 +121,9 @@ static Status FromInt32(const PrimitiveNode& node, std::shared_ptr* o *out = MakeDecimal128Type(node); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for INT32"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for INT32"); } return Status::OK(); } @@ -154,10 +152,9 @@ static Status FromInt64(const PrimitiveNode& node, std::shared_ptr* o *out = ::arrow::time64(::arrow::TimeUnit::MICRO); break; default: - std::stringstream ss; - ss << "Unhandled logical type " << LogicalTypeToString(node.logical_type()) - << " for INT64"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unhandled logical type ", + LogicalTypeToString(node.logical_type()), + " for INT64"); } return Status::OK(); } @@ -613,10 +610,9 @@ Status FieldToNode(const std::shared_ptr& field, } default: { // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR - std::stringstream ss; - ss << "Unhandled type for Arrow to Parquet schema conversion: "; - ss << field->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented( + "Unhandled type for Arrow to Parquet schema conversion: ", + field->type()->ToString()); } } PARQUET_CATCH_NOT_OK(*out = diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index bce9f37026c97..a8153cac1ebea 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -676,10 +676,8 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_ auto DivideBy = [&](const int64_t factor) { for (int64_t i = 0; i < array.length(); i++) { if (!truncated_timestamps_allowed && !data.IsNull(i) && (values[i] % factor != 0)) { - std::stringstream ss; - ss << "Casting from " << type.ToString() << " to " << target_type->ToString() - << " would lose data: " << values[i]; - return Status::Invalid(ss.str()); + return Status::Invalid("Casting from ", type.ToString(), " to ", + target_type->ToString(), " would lose data: ", values[i]); } buffer[i] = values[i] / factor; } @@ -950,9 +948,8 @@ Status ArrowColumnWriter::Write(const Array& data) { default: break; } - std::stringstream ss; - ss << "Data type not supported as list value: " << 
values_array->type()->ToString(); - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Data type not supported as list value: ", + values_array->type()->ToString()); } } // namespace diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc index d63ceb6da24da..d2794e89d3ac0 100644 --- a/cpp/src/plasma/io.cc +++ b/cpp/src/plasma/io.cc @@ -49,7 +49,7 @@ Status WriteBytes(int fd, uint8_t* cursor, size_t length) { if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { continue; } - return Status::IOError(std::string(strerror(errno))); + return Status::IOError(strerror(errno)); } else if (nbytes == 0) { return Status::IOError("Encountered unexpected EOF"); } @@ -80,7 +80,7 @@ Status ReadBytes(int fd, uint8_t* cursor, size_t length) { if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { continue; } - return Status::IOError(std::string(strerror(errno))); + return Status::IOError(strerror(errno)); } else if (0 == nbytes) { return Status::IOError("Encountered unexpected EOF"); } @@ -171,12 +171,12 @@ Status ConnectIpcSocketRetry(const std::string& pathname, int num_retries, *fd = ConnectIpcSock(pathname); --num_retries; } + // If we could not connect to the socket, exit. if (*fd == -1) { - std::stringstream ss; - ss << "Could not connect to socket " << pathname; - return Status::IOError(ss.str()); + return Status::IOError("Could not connect to socket ", pathname); } + return Status::OK(); } From 2ab97bc3f9885fa95e8ad51aa3b119a5435440c2 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Thu, 20 Dec 2018 11:38:39 -0600 Subject: [PATCH 45/80] ARROW-4089: [Plasma] The tutorial is wrong regarding the parameter type of PlasmaClient.Create Plasma's tutorial says that the data type of one parameter is address of `uint8_t*` but it's actually address of `shared_ptr`. ``` uint8_t* data; <------------------------------- wrong data type here. // Create a Plasma object by specifying its ID and size. ARROW_CHECK_OK(client.Create(object_id, data_size, NULL, 0, &data)); ``` Author: Kousuke Saruta Closes #3235 from sarutak/fix-plasma-tutorial and squashes the following commits: a780a27cf Fix the data type of the pointer in the plasma's tutorial --- cpp/apidoc/tutorials/plasma.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/apidoc/tutorials/plasma.md b/cpp/apidoc/tutorials/plasma.md index b9046d50bc922..40c5a10603e71 100644 --- a/cpp/apidoc/tutorials/plasma.md +++ b/cpp/apidoc/tutorials/plasma.md @@ -182,7 +182,7 @@ was written by the `Create` command. int64_t data_size = 100; // The address of the buffer allocated by the Plasma store will be written at // this address. -uint8_t* data; +std::shared_ptr data; // Create a Plasma object by specifying its ID and size. ARROW_CHECK_OK(client.Create(object_id, data_size, NULL, 0, &data)); ``` @@ -194,7 +194,7 @@ metadata (as raw bytes) and the fourth argument is the size of the metadata. // Create a Plasma object with metadata. int64_t data_size = 100; std::string metadata = "{'author': 'john'}"; -uint8_t* data; +std::shared_ptr data; client.Create(object_id, data_size, (uint8_t*) metadata.data(), metadata.size(), &data); ``` From 398466e629bad593e72def8c892b030958a58a1a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 20 Dec 2018 14:04:18 -0600 Subject: [PATCH 46/80] ARROW-4079: [C++] Add machine benchmark Right now there is a single memory latency benchmark. 
Its output looks like this, showing the different cache levels up to main memory (this is on a CPU with 16 MB L3 cache): ``` ------------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------------ BM_memory_latency/2048 2 ns 2 ns 406878405 548.706M items/s BM_memory_latency/4096 2 ns 2 ns 395414303 557.74M items/s BM_memory_latency/8192 2 ns 2 ns 394141916 560.264M items/s BM_memory_latency/16384 2 ns 2 ns 401410292 535.202M items/s BM_memory_latency/32768 2 ns 2 ns 381828811 525.377M items/s BM_memory_latency/65536 4 ns 4 ns 189027575 262.929M items/s BM_memory_latency/131072 5 ns 5 ns 150798287 209.01M items/s BM_memory_latency/262144 5 ns 5 ns 129287045 185.606M items/s BM_memory_latency/524288 7 ns 7 ns 96543517 132.663M items/s BM_memory_latency/1048576 11 ns 11 ns 66380535 89.0397M items/s BM_memory_latency/2097152 12 ns 12 ns 55003164 76.6384M items/s BM_memory_latency/4194304 13 ns 13 ns 51559443 70.9488M items/s BM_memory_latency/8388608 28 ns 28 ns 25813875 33.6881M items/s BM_memory_latency/16777216 66 ns 66 ns 10463216 14.4577M items/s BM_memory_latency/33554432 90 ns 90 ns 7743594 10.5434M items/s ``` Author: Antoine Pitrou Closes #3225 from pitrou/ARROW-4079-machine-benchmark and squashes the following commits: 55f6de696 ARROW-4079: Add machine benchmark --- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/machine-benchmark.cc | 70 +++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 cpp/src/arrow/util/machine-benchmark.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index b13b2f367b022..ee64a32915f09 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -70,5 +70,6 @@ ADD_ARROW_BENCHMARK(decimal-benchmark) ADD_ARROW_BENCHMARK(hashing-benchmark) ADD_ARROW_BENCHMARK(int-util-benchmark) ADD_ARROW_BENCHMARK(lazy-benchmark) +ADD_ARROW_BENCHMARK(machine-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) diff --git a/cpp/src/arrow/util/machine-benchmark.cc b/cpp/src/arrow/util/machine-benchmark.cc new file mode 100644 index 0000000000000..ad3f413e7f0fd --- /dev/null +++ b/cpp/src/arrow/util/machine-benchmark.cc @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Non-Arrow system benchmarks, provided for convenience. + +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" + +namespace arrow { + +// Generate a vector of indices such as following the indices describes +// a path over the whole vector. The path is randomized to avoid triggering +// automatic prefetching in the CPU. 
+std::vector RandomPath(int32_t size) { + std::default_random_engine gen(42); + std::vector indices(size); + + for (int32_t i = 0; i < size; ++i) { + indices[i] = i; + } + std::shuffle(indices.begin(), indices.end(), gen); + std::vector path(size, -999999); + int32_t prev; + prev = indices[size - 1]; + for (int32_t i = 0; i < size; ++i) { + int32_t next = indices[i]; + path[prev] = next; + prev = next; + } + return path; +} + +// Cache / main memory latency, depending on the working set size +static void BM_memory_latency(benchmark::State& state) { + const auto niters = static_cast(state.range(0)); + const std::vector path = RandomPath(niters / 4); + + int32_t total = 0; + int32_t index = 0; + for (auto _ : state) { + total += index; + index = path[index]; + } + benchmark::DoNotOptimize(total); + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(BM_memory_latency)->RangeMultiplier(2)->Range(2 << 10, 2 << 24); + +} // namespace arrow From ff293196baa53a2608178b6d3768cb93f964f9f4 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 20 Dec 2018 16:18:29 -0600 Subject: [PATCH 47/80] ARROW-4087: [C++] Make CSV spellings of null values configurable Interestingly, there is no noticeable slowdown when reading CSV files (even though the trie is significantly slower than the hard-coded function in microbenchmarks). Author: Antoine Pitrou Closes #3236 from pitrou/ARROW-4087-csv-configure-nulls and squashes the following commits: 9a7596ddc ARROW-4087: Make CSV spellings of null values configurable --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/csv/converter-test.cc | 51 ++++- cpp/src/arrow/csv/converter.cc | 121 ++-------- cpp/src/arrow/csv/converter.h | 2 +- cpp/src/arrow/csv/options.cc | 9 +- cpp/src/arrow/csv/options.h | 3 + cpp/src/arrow/test-util.h | 12 +- cpp/src/arrow/util/CMakeLists.txt | 2 + cpp/src/arrow/util/hashing-benchmark.cc | 2 + cpp/src/arrow/util/trie-benchmark.cc | 221 ++++++++++++++++++ cpp/src/arrow/util/trie-test.cc | 283 ++++++++++++++++++++++++ cpp/src/arrow/util/trie.cc | 209 +++++++++++++++++ cpp/src/arrow/util/trie.h | 245 ++++++++++++++++++++ python/pyarrow/_csv.pyx | 18 +- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_csv.py | 38 +++- 16 files changed, 1103 insertions(+), 115 deletions(-) create mode 100644 cpp/src/arrow/util/trie-benchmark.cc create mode 100644 cpp/src/arrow/util/trie-test.cc create mode 100644 cpp/src/arrow/util/trie.cc create mode 100644 cpp/src/arrow/util/trie.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 8dd2ac082db0a..f2a811247287b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -110,6 +110,7 @@ set(ARROW_SRCS util/key_value_metadata.cc util/task-group.cc util/thread-pool.cc + util/trie.cc util/utf8.cc ) diff --git a/cpp/src/arrow/csv/converter-test.cc b/cpp/src/arrow/csv/converter-test.cc index 2534541d3154a..ea12c0b66a94b 100644 --- a/cpp/src/arrow/csv/converter-test.cc +++ b/cpp/src/arrow/csv/converter-test.cc @@ -176,13 +176,30 @@ TEST(IntegerConversion, Basics) { } TEST(IntegerConversion, Nulls) { - AssertConversion(int8(), {"12,34\n", ",-128\n"}, - {{12, 0}, {34, -128}}, - {{true, false}, {true, true}}); + AssertConversion(int8(), {"12,N/A\n", ",-128\n"}, + {{12, 0}, {0, -128}}, + {{true, false}, {false, true}}); AssertConversionAllNulls(int8()); } +TEST(IntegerConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(int8(), {"12,xxx\n", "zzz,-128\n"}, + {{12, 
0}, {0, -128}}, {{true, false}, {false, true}}, + options); + + AssertConversionError(int8(), {",xxx,N/A\n"}, {0, 2}, options); + + // Duplicate nulls allowed + options.null_values = {"xxx", "zzz", "xxx"}; + AssertConversion(int8(), {"12,xxx\n", "zzz,-128\n"}, + {{12, 0}, {0, -128}}, {{true, false}, {false, true}}, + options); +} + TEST(IntegerConversion, Whitespace) { AssertConversion(int32(), {" 12,34 \n", " 56 ,78\n"}, {{12, 56}, {34, 78}}); @@ -203,6 +220,15 @@ TEST(FloatingPointConversion, Nulls) { AssertConversionAllNulls(float64()); } +TEST(FloatingPointConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(float32(), {"1.5,xxx\n", "zzz,-1e10\n"}, + {{1.5, 0.}, {0., -1e10f}}, + {{true, false}, {false, true}}, options); +} + TEST(FloatingPointConversion, Whitespace) { AssertConversion(float64(), {" 12,34.5\n", " 0 ,-1e100 \n"}, {{12., 0.}, {34.5, -1e100}}); @@ -220,6 +246,15 @@ TEST(BooleanConversion, Nulls) { {{true, true}, {false, true}}); } +TEST(BooleanConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + AssertConversion(boolean(), {"true,xxx\n", "zzz,0\n"}, + {{true, false}, {false, false}}, + {{true, false}, {false, true}}, options); +} + TEST(TimestampConversion, Basics) { auto type = timestamp(TimeUnit::SECOND); @@ -243,6 +278,16 @@ TEST(TimestampConversion, Nulls) { {{true}, {false}, {false}}); } +TEST(TimestampConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + auto type = timestamp(TimeUnit::MILLI); + AssertConversion(type, {"1970-01-01 00:01:00,xxx,zzz\n"}, + {{60000}, {0}, {0}}, + {{true}, {false}, {false}}, options); +} + TEST(DecimalConversion, NotImplemented) { std::shared_ptr converter; ASSERT_RAISES(NotImplemented, diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 1018f8553860e..22be7d6e58f3b 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include "arrow/builder.h" #include "arrow/csv/parser.h" @@ -29,12 +30,15 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/parsing.h" // IWYU pragma: keep +#include "arrow/util/trie.h" #include "arrow/util/utf8.h" namespace arrow { namespace csv { using internal::StringConverter; +using internal::Trie; +using internal::TrieBuilder; namespace { @@ -57,115 +61,28 @@ class ConcreteConverter : public Converter { using Converter::Converter; protected: - Status Initialize() override { return Status::OK(); } + Status Initialize() override; inline bool IsNull(const uint8_t* data, uint32_t size, bool quoted); + + Trie null_trie_; }; -// Recognize various spellings of null values. The list of possible spellings -// is taken from Pandas read_csv() documentation. 
+Status ConcreteConverter::Initialize() { + // TODO no need to build a separate Trie for each Converter instance + TrieBuilder builder; + for (const auto& s : options_.null_values) { + RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */)); + } + null_trie_ = builder.Finish(); + return Status::OK(); +} + bool ConcreteConverter::IsNull(const uint8_t* data, uint32_t size, bool quoted) { if (quoted) { return false; } - if (size == 0) { - return true; - } - // No 1-character null value exists - if (size == 1) { - return false; - } - - // XXX if the CSV parser guaranteed enough excess bytes at the end of the - // parsed area, we wouldn't need to always check size before comparing characters. - - auto chars = reinterpret_cast(data); - auto first = chars[0]; - auto second = chars[1]; - switch (first) { - case 'N': { - // "NA", "N/A", "NaN", "NULL" - if (size == 2) { - return second == 'A'; - } - auto third = chars[2]; - if (size == 3) { - return (second == '/' && third == 'A') || (second == 'a' && third == 'N'); - } - if (size == 4) { - return (second == 'U' && third == 'L' && chars[3] == 'L'); - } - return false; - } - case 'n': { - // "n/a", "nan", "null" - if (size == 2) { - return false; - } - auto third = chars[2]; - if (size == 3) { - return (second == '/' && third == 'a') || (second == 'a' && third == 'n'); - } - if (size == 4) { - return (second == 'u' && third == 'l' && chars[3] == 'l'); - } - return false; - } - case '1': { - // '1.#IND', '1.#QNAN' - if (size == 6) { - // '#' is the most unlikely char here, check it first - return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'I' && - chars[4] == 'N' && chars[5] == 'D'); - } - if (size == 7) { - return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'Q' && - chars[4] == 'N' && chars[5] == 'A' && chars[6] == 'N'); - } - return false; - } - case '-': { - switch (second) { - case 'N': - // "-NaN" - return (size == 4 && chars[2] == 'a' && chars[3] == 'N'); - case 'n': - // "-nan" - return (size == 4 && chars[2] == 'a' && chars[3] == 'n'); - case '1': - // "-1.#IND", "-1.#QNAN" - if (size == 7) { - return (chars[3] == '#' && chars[2] == '.' && chars[4] == 'I' && - chars[5] == 'N' && chars[6] == 'D'); - } - if (size == 8) { - return (chars[3] == '#' && chars[2] == '.' 
&& chars[4] == 'Q' && - chars[5] == 'N' && chars[6] == 'A' && chars[7] == 'N'); - } - return false; - default: - return false; - } - } - case '#': { - // "#N/A", "#N/A N/A", "#NA" - if (size < 3 || chars[1] != 'N') { - return false; - } - auto third = chars[2]; - if (size == 3) { - return third == 'A'; - } - if (size == 4) { - return third == '/' && chars[3] == 'A'; - } - if (size == 8) { - return std::memcmp(data + 2, "/A N/A", 5) == 0; - } - return false; - } - default: - return false; - } + return null_trie_.Find(util::string_view(reinterpret_cast(data), size)) >= + 0; } ///////////////////////////////////////////////////////////////////////// diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h index 38ade1d21a846..d64fe695d0a26 100644 --- a/cpp/src/arrow/csv/converter.h +++ b/cpp/src/arrow/csv/converter.h @@ -57,7 +57,7 @@ class ARROW_EXPORT Converter { virtual Status Initialize() = 0; - ConvertOptions options_; + const ConvertOptions options_; MemoryPool* pool_; std::shared_ptr type_; }; diff --git a/cpp/src/arrow/csv/options.cc b/cpp/src/arrow/csv/options.cc index fccf0b67db98c..01e687b8342a3 100644 --- a/cpp/src/arrow/csv/options.cc +++ b/cpp/src/arrow/csv/options.cc @@ -22,7 +22,14 @@ namespace csv { ParseOptions ParseOptions::Defaults() { return ParseOptions(); } -ConvertOptions ConvertOptions::Defaults() { return ConvertOptions(); } +ConvertOptions ConvertOptions::Defaults() { + auto options = ConvertOptions(); + // The default list of possible null spellings is taken from Pandas' read_csv(). + options.null_values = {"", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", + "-NaN", "-nan", "1.#IND", "1.#QNAN", "N/A", "NA", + "NULL", "NaN", "n/a", "nan", "null"}; + return options; +} ReadOptions ReadOptions::Defaults() { return ReadOptions(); } diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 10232d45e8df4..2b4653ccdce81 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "arrow/util/visibility.h" @@ -66,6 +67,8 @@ struct ARROW_EXPORT ConvertOptions { bool check_utf8 = true; // Optional per-column types (disabling type inference on those columns) std::unordered_map> column_types; + // Recognized spellings for null values + std::vector null_values; static ConvertOptions Defaults(); }; diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 7fe7685f5a39f..33321633090af 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -69,12 +69,12 @@ ASSERT_EQ((message), s.ToString()); \ } while (false) -#define ASSERT_OK(expr) \ - do { \ - ::arrow::Status s = (expr); \ - if (!s.ok()) { \ - FAIL() << "'" STRINGIFY(expr) "' failed with " << s.ToString(); \ - } \ +#define ASSERT_OK(expr) \ + do { \ + ::arrow::Status _s = (expr); \ + if (!_s.ok()) { \ + FAIL() << "'" STRINGIFY(expr) "' failed with " << _s.ToString(); \ + } \ } while (false) #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index ee64a32915f09..b02dc113c5459 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -62,6 +62,7 @@ ADD_ARROW_TEST(rle-encoding-test) ADD_ARROW_TEST(stl-util-test) ADD_ARROW_TEST(task-group-test) ADD_ARROW_TEST(thread-pool-test) +ADD_ARROW_TEST(trie-test) ADD_ARROW_TEST(utf8-util-test) ADD_ARROW_BENCHMARK(bit-util-benchmark) @@ -72,4 +73,5 @@ ADD_ARROW_BENCHMARK(int-util-benchmark) 
ADD_ARROW_BENCHMARK(lazy-benchmark) ADD_ARROW_BENCHMARK(machine-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) +ADD_ARROW_BENCHMARK(trie-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) diff --git a/cpp/src/arrow/util/hashing-benchmark.cc b/cpp/src/arrow/util/hashing-benchmark.cc index 7d91f0f536ac1..09d00afd5fea4 100644 --- a/cpp/src/arrow/util/hashing-benchmark.cc +++ b/cpp/src/arrow/util/hashing-benchmark.cc @@ -74,6 +74,7 @@ static void BM_HashIntegers(benchmark::State& state) { // NOLINT non-const refe benchmark::DoNotOptimize(total); } state.SetBytesProcessed(2 * state.iterations() * values.size() * sizeof(int64_t)); + state.SetItemsProcessed(2 * state.iterations() * values.size()); } static void BenchmarkStringHashing(benchmark::State& state, // NOLINT non-const reference @@ -92,6 +93,7 @@ static void BenchmarkStringHashing(benchmark::State& state, // NOLINT non-const benchmark::DoNotOptimize(total); } state.SetBytesProcessed(2 * state.iterations() * total_size); + state.SetItemsProcessed(2 * state.iterations() * values.size()); } static void BM_HashSmallStrings(benchmark::State& state) { // NOLINT non-const reference diff --git a/cpp/src/arrow/util/trie-benchmark.cc b/cpp/src/arrow/util/trie-benchmark.cc new file mode 100644 index 0000000000000..acc2892689ff4 --- /dev/null +++ b/cpp/src/arrow/util/trie-benchmark.cc @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "benchmark/benchmark.h" + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/test-util.h" +#include "arrow/util/trie.h" + +namespace arrow { +namespace internal { + +static inline bool InlinedNullLookup(util::string_view s) { + // An inlined version of trie lookup for a specific set of strings + // (see AllNulls()) + auto size = s.length(); + auto data = s.data(); + if (size == 0) { + return false; + } + if (size == 1) { + return false; + } + + auto chars = reinterpret_cast(data); + auto first = chars[0]; + auto second = chars[1]; + switch (first) { + case 'N': { + // "NA", "N/A", "NaN", "NULL" + if (size == 2) { + return second == 'A'; + } + auto third = chars[2]; + if (size == 3) { + return (second == '/' && third == 'A') || (second == 'a' && third == 'N'); + } + if (size == 4) { + return (second == 'U' && third == 'L' && chars[3] == 'L'); + } + return false; + } + case 'n': { + // "n/a", "nan", "null" + if (size == 2) { + return false; + } + auto third = chars[2]; + if (size == 3) { + return (second == '/' && third == 'a') || (second == 'a' && third == 'n'); + } + if (size == 4) { + return (second == 'u' && third == 'l' && chars[3] == 'l'); + } + return false; + } + case '1': { + // '1.#IND', '1.#QNAN' + if (size == 6) { + // '#' is the most unlikely char here, check it first + return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'I' && + chars[4] == 'N' && chars[5] == 'D'); + } + if (size == 7) { + return (chars[2] == '#' && chars[1] == '.' && chars[3] == 'Q' && + chars[4] == 'N' && chars[5] == 'A' && chars[6] == 'N'); + } + return false; + } + case '-': { + switch (second) { + case 'N': + // "-NaN" + return (size == 4 && chars[2] == 'a' && chars[3] == 'N'); + case 'n': + // "-nan" + return (size == 4 && chars[2] == 'a' && chars[3] == 'n'); + case '1': + // "-1.#IND", "-1.#QNAN" + if (size == 7) { + return (chars[3] == '#' && chars[2] == '.' && chars[4] == 'I' && + chars[5] == 'N' && chars[6] == 'D'); + } + if (size == 8) { + return (chars[3] == '#' && chars[2] == '.' 
&& chars[4] == 'Q' && + chars[5] == 'N' && chars[6] == 'A' && chars[7] == 'N'); + } + return false; + default: + return false; + } + } + case '#': { + // "#N/A", "#N/A N/A", "#NA" + if (size < 3 || chars[1] != 'N') { + return false; + } + auto third = chars[2]; + if (size == 3) { + return third == 'A'; + } + if (size == 4) { + return third == '/' && chars[3] == 'A'; + } + if (size == 8) { + return std::memcmp(data + 2, "/A N/A", 5) == 0; + } + return false; + } + default: + return false; + } +} + +std::vector AllNulls() { + return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", + "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"}; +} + +Trie MakeNullsTrie() { + auto nulls = AllNulls(); + + TrieBuilder builder; + for (const auto& str : AllNulls()) { + ABORT_NOT_OK(builder.Append(str)); + } + return builder.Finish(); +} + +std::vector Expand(const std::vector& base, size_t n) { + std::vector result; + result.reserve(n); + + while (true) { + for (const auto& v : base) { + result.push_back(v); + if (result.size() == n) { + return result; + } + } + } +} + +static void BenchmarkTrieLookups(benchmark::State& state, // NOLINT non-const reference + const std::vector& strings) { + Trie trie = MakeNullsTrie(); + int32_t total = 0; + + auto lookups = Expand(strings, 100); + + for (auto _ : state) { + for (const auto& s : lookups) { + total += trie.Find(s); + } + } + benchmark::DoNotOptimize(total); + state.SetItemsProcessed(state.iterations() * lookups.size()); +} + +static void BenchmarkInlinedTrieLookups( + benchmark::State& state, // NOLINT non-const reference + const std::vector& strings) { + int32_t total = 0; + + auto lookups = Expand(strings, 100); + + for (auto _ : state) { + for (const auto& s : lookups) { + total += InlinedNullLookup(s); + } + } + benchmark::DoNotOptimize(total); + state.SetItemsProcessed(state.iterations() * lookups.size()); +} + +static void BM_TrieLookupFound(benchmark::State& state) { // NOLINT non-const reference + BenchmarkTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"}); +} + +static void BM_TrieLookupNotFound( + benchmark::State& state) { // NOLINT non-const reference + BenchmarkTrieLookups(state, {"None", "1.0", "", "abc"}); +} + +static void BM_InlinedTrieLookupFound( + benchmark::State& state) { // NOLINT non-const reference + BenchmarkInlinedTrieLookups(state, {"N/A", "null", "-1.#IND", "N/A"}); +} + +static void BM_InlinedTrieLookupNotFound( + benchmark::State& state) { // NOLINT non-const reference + BenchmarkInlinedTrieLookups(state, {"None", "1.0", "", "abc"}); +} + +static const int kRepetitions = 2; + +BENCHMARK(BM_TrieLookupFound)->Repetitions(kRepetitions); +BENCHMARK(BM_TrieLookupNotFound)->Repetitions(kRepetitions); +BENCHMARK(BM_InlinedTrieLookupFound)->Repetitions(kRepetitions); +BENCHMARK(BM_InlinedTrieLookupNotFound)->Repetitions(kRepetitions); + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/trie-test.cc b/cpp/src/arrow/util/trie-test.cc new file mode 100644 index 0000000000000..33eefa9d9335f --- /dev/null +++ b/cpp/src/arrow/util/trie-test.cc @@ -0,0 +1,283 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/test-util.h" +#include "arrow/util/trie.h" + +namespace arrow { +namespace internal { + +TEST(SmallString, Basics) { + using SS = SmallString<5>; + { + SS s; + ASSERT_EQ(s.length(), 0); + ASSERT_EQ(util::string_view(s), util::string_view("")); + ASSERT_EQ(s, ""); + ASSERT_NE(s, "x"); + ASSERT_EQ(sizeof(s), 6); + } + { + SS s("abc"); + ASSERT_EQ(s.length(), 3); + ASSERT_EQ(util::string_view(s), util::string_view("abc")); + ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0); + ASSERT_EQ(s, "abc"); + ASSERT_NE(s, "ab"); + } +} + +TEST(SmallString, Assign) { + using SS = SmallString<5>; + auto s = SS(); + + s = util::string_view("abc"); + ASSERT_EQ(s.length(), 3); + ASSERT_EQ(util::string_view(s), util::string_view("abc")); + ASSERT_EQ(std::memcmp(s.data(), "abc", 3), 0); + ASSERT_EQ(s, "abc"); + ASSERT_NE(s, "ab"); + + s = std::string("ghijk"); + ASSERT_EQ(s.length(), 5); + ASSERT_EQ(util::string_view(s), util::string_view("ghijk")); + ASSERT_EQ(std::memcmp(s.data(), "ghijk", 5), 0); + ASSERT_EQ(s, "ghijk"); + ASSERT_NE(s, ""); + + s = SS("xy"); + ASSERT_EQ(s.length(), 2); + ASSERT_EQ(util::string_view(s), util::string_view("xy")); + ASSERT_EQ(std::memcmp(s.data(), "xy", 2), 0); + ASSERT_EQ(s, "xy"); + ASSERT_NE(s, "xyz"); +} + +TEST(SmallString, Substr) { + using SS = SmallString<5>; + { + auto s = SS(); + ASSERT_EQ(s.substr(0), ""); + ASSERT_EQ(s.substr(0, 2), ""); + } + { + auto s = SS("abcd"); + ASSERT_EQ(s.substr(0), "abcd"); + ASSERT_EQ(s.substr(1), "bcd"); + ASSERT_EQ(s.substr(4), ""); + ASSERT_EQ(s.substr(0, 0), ""); + ASSERT_EQ(s.substr(0, 3), "abc"); + ASSERT_EQ(s.substr(0, 4), "abcd"); + ASSERT_EQ(s.substr(1, 0), ""); + ASSERT_EQ(s.substr(1, 2), "bc"); + ASSERT_EQ(s.substr(4, 0), ""); + ASSERT_EQ(s.substr(4, 1), ""); + } +} + +static std::vector AllNulls() { + return {"#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", + "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"}; +} + +static void TestTrieContents(const Trie& trie, const std::vector& entries) { + std::unordered_map control; + auto n_entries = static_cast(entries.size()); + + // Build control container + for (int32_t i = 0; i < n_entries; ++i) { + auto p = control.insert({entries[i], i}); + ASSERT_TRUE(p.second); + } + + // Check all existing entries in trie + for (int32_t i = 0; i < n_entries; ++i) { + ASSERT_EQ(i, trie.Find(entries[i])) << "for string '" << entries[i] << "'"; + } + + auto CheckNotExists = [&control, &trie](const std::string& s) { + auto p = control.find(s); + if (p == control.end()) { + ASSERT_EQ(-1, trie.Find(s)) << "for string '" << s << "'"; + } + }; + + // Check potentially non-existing strings + CheckNotExists(""); + CheckNotExists("X"); + CheckNotExists("abcdefxxxxxxxxxxxxxxx"); + + // Check potentially non-existing variations of existing entries + for (const auto& e : entries) { + CheckNotExists(e + "X"); + if (e.size() > 0) { + CheckNotExists(e.substr(0, 1)); + auto prefix = e.substr(0, e.size() - 1); + CheckNotExists(prefix); + CheckNotExists(prefix 
+ "X"); + auto split_at = e.size() / 2; + CheckNotExists(e.substr(0, split_at) + 'x' + e.substr(split_at + 1)); + } + } +} + +static void TestTrieContents(const std::vector& entries) { + TrieBuilder builder; + for (const auto& s : entries) { + ASSERT_OK(builder.Append(s)); + } + const Trie trie = builder.Finish(); + ASSERT_OK(trie.Validate()); + + TestTrieContents(trie, entries); +} + +TEST(Trie, Empty) { + TrieBuilder builder; + const Trie trie = builder.Finish(); + ASSERT_OK(trie.Validate()); + + ASSERT_EQ(-1, trie.Find("")); + ASSERT_EQ(-1, trie.Find("x")); +} + +TEST(Trie, EmptyString) { + TrieBuilder builder; + ASSERT_OK(builder.Append("")); + const Trie trie = builder.Finish(); + ASSERT_OK(trie.Validate()); + + ASSERT_EQ(0, trie.Find("")); + ASSERT_EQ(-1, trie.Find("x")); +} + +TEST(Trie, Basics1) { + TestTrieContents({"abc", "de", "f"}); + TestTrieContents({"abc", "de", "f", ""}); +} + +TEST(Trie, Basics2) { + TestTrieContents({"a", "abc", "abcd", "abcdef"}); + TestTrieContents({"", "a", "abc", "abcd", "abcdef"}); +} + +TEST(Trie, Basics3) { + TestTrieContents({"abcd", "ab", "a"}); + TestTrieContents({"abcd", "ab", "a", ""}); +} + +TEST(Trie, LongStrings) { + TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "defghijklmnopqrst"}); + TestTrieContents({"abcdefghijklmnopqr", "abcdefghijklmnoprq", "abcde"}); +} + +TEST(Trie, NullChars) { + const std::string empty; + const std::string nul(1, '\x00'); + std::string a, b, c, d; + a = "x" + nul + "y"; + b = "x" + nul + "z"; + c = nul + "y"; + d = nul; + ASSERT_EQ(a.length(), 3); + ASSERT_EQ(d.length(), 1); + + TestTrieContents({a, b, c, d}); + TestTrieContents({a, b, c}); + TestTrieContents({a, b, c, d, ""}); + TestTrieContents({a, b, c, ""}); + TestTrieContents({d, c, b, a}); + TestTrieContents({c, b, a}); + TestTrieContents({d, c, b, a, ""}); + TestTrieContents({c, b, a, ""}); +} + +TEST(Trie, NegativeChars) { + // Test with characters >= 0x80 (to check the absence of sign issues) + TestTrieContents({"\x7f\x80\x81\xff", "\x7f\x80\x81", "\x7f\xff\x81", "\xff\x80\x81"}); +} + +TEST(Trie, CSVNulls) { TestTrieContents(AllNulls()); } + +TEST(Trie, Duplicates) { + { + TrieBuilder builder; + ASSERT_OK(builder.Append("ab")); + ASSERT_OK(builder.Append("abc")); + ASSERT_RAISES(Invalid, builder.Append("abc")); + ASSERT_OK(builder.Append("abcd")); + ASSERT_RAISES(Invalid, builder.Append("ab")); + ASSERT_OK(builder.Append("abcde")); + const Trie trie = builder.Finish(); + + TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"}); + } + { + // With allow_duplicates = true + TrieBuilder builder; + ASSERT_OK(builder.Append("ab", true)); + ASSERT_OK(builder.Append("abc", true)); + ASSERT_OK(builder.Append("abc", true)); + ASSERT_OK(builder.Append("abcd", true)); + ASSERT_OK(builder.Append("ab", true)); + ASSERT_OK(builder.Append("abcde", true)); + const Trie trie = builder.Finish(); + + TestTrieContents(trie, {"ab", "abc", "abcd", "abcde"}); + } +} + +TEST(Trie, CapacityError) { + // A trie uses 16-bit indices into various internal structures and + // therefore has limited size available. 
+ TrieBuilder builder; + uint8_t first, second, third; + bool had_capacity_error = false; + uint8_t s[] = "\x00\x00\x00\x00"; + + for (first = 1; first < 125; ++first) { + s[0] = first; + for (second = 1; second < 125; ++second) { + s[1] = second; + for (third = 1; third < 125; ++third) { + s[2] = third; + auto st = builder.Append(reinterpret_cast(s)); + if (st.IsCapacityError()) { + DCHECK_GE(first, 2); + had_capacity_error = true; + break; + } else { + ASSERT_OK(st); + } + } + } + } + ASSERT_TRUE(had_capacity_error) << "Should have produced CapacityError"; +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/trie.cc b/cpp/src/arrow/util/trie.cc new file mode 100644 index 0000000000000..eaa02b7c5352e --- /dev/null +++ b/cpp/src/arrow/util/trie.cc @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/trie.h" + +#include +#include + +#include "arrow/util/logging.h" + +namespace arrow { +namespace internal { + +Status Trie::Validate() const { + const auto n_nodes = static_cast(nodes_.size()); + if (size_ > n_nodes) { + return Status::Invalid("Number of entries larger than number of nodes"); + } + for (const auto& node : nodes_) { + if (node.found_index_ >= size_) { + return Status::Invalid("Found index >= size"); + } + if (node.child_lookup_ != -1 && + node.child_lookup_ * 256 > + static_cast(lookup_table_.size() - 256)) { + return Status::Invalid("Child lookup base doesn't point to 256 valid indices"); + } + } + for (const auto index : lookup_table_) { + if (index >= n_nodes) { + return Status::Invalid("Child lookup index out of bounds"); + } + } + return Status::OK(); +} + +void Trie::Dump(const Node* node, const std::string& indent) const { + std::cerr << "[\"" << node->substring_ << "\"]"; + if (node->found_index_ >= 0) { + std::cerr << " *"; + } + std::cerr << "\n"; + if (node->child_lookup_ >= 0) { + auto child_indent = indent + " "; + std::cerr << child_indent << "|\n"; + for (fast_index_type i = 0; i < 256; ++i) { + auto child_index = lookup_table_[node->child_lookup_ * 256 + i]; + if (child_index >= 0) { + const Node* child = &nodes_[child_index]; + std::cerr << child_indent << "|-> '" << static_cast(i) << "' (" << i + << ") -> "; + Dump(child, child_indent); + } + } + } +} + +void Trie::Dump() const { Dump(&nodes_[0], ""); } + +TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, ""}); } + +Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) { + if (parent->child_lookup_ == -1) { + RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_)); + } + auto parent_lookup = parent->child_lookup_ * 256 + ch; + + DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1); + if (trie_.nodes_.size() >= static_cast(kMaxIndex)) { + return 
Status::CapacityError("Trie out of bounds"); + } + trie_.nodes_.push_back(std::move(node)); + trie_.lookup_table_[parent_lookup] = static_cast(trie_.nodes_.size() - 1); + return Status::OK(); +} + +Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch, + util::string_view substring) { + const auto kMaxSubstringLength = Trie::kMaxSubstringLength; + + while (substring.length() > kMaxSubstringLength) { + // Substring doesn't fit in node => create intermediate node + auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)}; + RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node))); + // Recurse + parent = &trie_.nodes_.back(); + ch = static_cast(substring[kMaxSubstringLength]); + substring = substring.substr(kMaxSubstringLength + 1); + } + + // Create final matching node + auto child_node = Trie::Node{trie_.size_, -1, substring}; + RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node))); + ++trie_.size_; + return Status::OK(); +} + +Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch, + util::string_view substring) { + return CreateChildNode(parent, static_cast(ch), substring); +} + +Status TrieBuilder::ExtendLookupTable(index_type* out_index) { + auto cur_size = trie_.lookup_table_.size(); + auto cur_index = cur_size / 256; + if (cur_index > static_cast(kMaxIndex)) { + return Status::CapacityError("Trie out of bounds"); + } + trie_.lookup_table_.resize(cur_size + 256, -1); + *out_index = static_cast(cur_index); + return Status::OK(); +} + +Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) { + Trie::Node* node = &trie_.nodes_[node_index]; + + DCHECK_LT(split_at, node->substring_length()); + + // Before: + // {node} -> [...] + // After: + // {node} -> [c] -> {out_node} -> [...] 
+ auto child_node = Trie::Node{node->found_index_, node->child_lookup_, + node->substring_.substr(split_at + 1)}; + auto ch = node->substring_[split_at]; + node->child_lookup_ = -1; + node->found_index_ = -1; + node->substring_ = node->substring_.substr(0, split_at); + RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node))); + + return Status::OK(); +} + +Status TrieBuilder::Append(util::string_view s, bool allow_duplicate) { + // Find or create node for string + fast_index_type node_index = 0; + fast_index_type pos = 0; + fast_index_type remaining = static_cast(s.length()); + + while (true) { + Trie::Node* node = &trie_.nodes_[node_index]; + const auto substring_length = node->substring_length(); + const auto substring_data = node->substring_data(); + + for (fast_index_type i = 0; i < substring_length; ++i) { + if (remaining == 0) { + // New string too short => need to split node + RETURN_NOT_OK(SplitNode(node_index, i)); + // Current node matches exactly + node = &trie_.nodes_[node_index]; + node->found_index_ = trie_.size_++; + return Status::OK(); + } + if (s[pos] != substring_data[i]) { + // Mismatching substring => need to split node + RETURN_NOT_OK(SplitNode(node_index, i)); + // Create new node for mismatching char + node = &trie_.nodes_[node_index]; + return CreateChildNode(node, s[pos], s.substr(pos + 1)); + } + ++pos; + --remaining; + } + if (remaining == 0) { + // Node matches exactly + if (node->found_index_ >= 0) { + if (allow_duplicate) { + return Status::OK(); + } else { + return Status::Invalid("Duplicate entry in trie"); + } + } + node->found_index_ = trie_.size_++; + return Status::OK(); + } + // Lookup child using next input character + if (node->child_lookup_ == -1) { + // Need to extend lookup table for this node + RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_)); + } + auto c = static_cast(s[pos++]); + --remaining; + node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c]; + if (node_index == -1) { + // Child not found => need to create child node + return CreateChildNode(node, c, s.substr(pos)); + } + node = &trie_.nodes_[node_index]; + } +} + +Trie TrieBuilder::Finish() { return std::move(trie_); } + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/trie.h b/cpp/src/arrow/util/trie.h new file mode 100644 index 0000000000000..3e82bfd8ee28f --- /dev/null +++ b/cpp/src/arrow/util/trie.h @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_TRIE_H +#define ARROW_UTIL_TRIE_H + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +// A non-zero-terminated small string class. +// std::string usually has a small string optimization +// (see review at https://shaharmike.com/cpp/std-string/) +// but this one allows tight control and optimization of memory layout. +template +class SmallString { + public: + SmallString() : length_(0) {} + + template + SmallString(const T& v) { // NOLINT implicit constructor + *this = util::string_view(v); + } + + SmallString& operator=(const util::string_view s) { +#ifndef NDEBUG + CheckSize(s.size()); +#endif + length_ = static_cast(s.size()); + std::memcpy(data_, s.data(), length_); + return *this; + } + + SmallString& operator=(const std::string& s) { + *this = util::string_view(s); + return *this; + } + + SmallString& operator=(const char* s) { + *this = util::string_view(s); + return *this; + } + + explicit operator util::string_view() const { + return util::string_view(data_, length_); + } + + const char* data() const { return data_; } + size_t length() const { return length_; } + bool empty() const { return length_ == 0; } + char operator[](size_t pos) const { +#ifdef NDEBUG + assert(pos <= length_); +#endif + return data_[pos]; + } + + SmallString substr(size_t pos) const { + return SmallString(util::string_view(*this).substr(pos)); + } + + SmallString substr(size_t pos, size_t count) const { + return SmallString(util::string_view(*this).substr(pos, count)); + } + + template + bool operator==(T&& other) const { + return util::string_view(*this) == util::string_view(std::forward(other)); + } + + template + bool operator!=(T&& other) const { + return util::string_view(*this) != util::string_view(std::forward(other)); + } + + protected: + uint8_t length_; + char data_[N]; + +#ifndef NDEBUG + void CheckSize(size_t n) { assert(n <= N); } +#endif +}; + +template +std::ostream& operator<<(std::ostream& os, const SmallString& str) { + return os << util::string_view(str); +} + +// A trie class for byte strings, optimized for small sets of short strings. +// This class is immutable by design, use a TrieBuilder to construct it. 
+class ARROW_EXPORT Trie { + using index_type = int16_t; + using fast_index_type = int_fast16_t; + + public: + Trie() : size_(0) {} + Trie(Trie&&) = default; + Trie& operator=(Trie&&) = default; + + int32_t Find(util::string_view s) const { + const Node* node = &nodes_[0]; + fast_index_type pos = 0; + fast_index_type remaining = static_cast(s.length()); + + while (remaining > 0) { + auto substring_length = node->substring_length(); + if (substring_length > 0) { + auto substring_data = node->substring_data(); + if (remaining < substring_length) { + // Input too short + return -1; + } + for (fast_index_type i = 0; i < substring_length; ++i) { + if (s[pos++] != substring_data[i]) { + // Mismatching substring + return -1; + } + --remaining; + } + if (remaining == 0) { + // Matched node exactly + return node->found_index_; + } + } + // Lookup child using next input character + if (node->child_lookup_ == -1) { + // Input too long + return -1; + } + auto c = static_cast(s[pos++]); + --remaining; + auto child_index = lookup_table_[node->child_lookup_ * 256 + c]; + if (child_index == -1) { + // Child not found + return -1; + } + node = &nodes_[child_index]; + } + + // Input exhausted + if (node->substring_.empty()) { + // Matched node exactly + return node->found_index_; + } else { + return -1; + } + } + + Status Validate() const; + + void Dump() const; + + protected: + static constexpr size_t kNodeSize = 16; + static constexpr auto kMaxSubstringLength = + kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t); + + struct Node { + // If this node is a valid end of string, index of found string, otherwise -1 + index_type found_index_; + // Base index for child lookup in lookup_table_ (-1 if no child nodes) + index_type child_lookup_; + // The substring for this node. 
+ SmallString substring_; + + fast_index_type substring_length() const { + return static_cast(substring_.length()); + } + const char* substring_data() const { return substring_.data(); } + }; + + static_assert(sizeof(Node) == kNodeSize, "Unexpected node size"); + + ARROW_DISALLOW_COPY_AND_ASSIGN(Trie); + + void Dump(const Node* node, const std::string& indent) const; + + // Node table: entry 0 is the root node + std::vector nodes_; + + // Indexed lookup structure: gives index in node table, or -1 if not found + std::vector lookup_table_; + + // Number of entries + index_type size_; + + friend class TrieBuilder; +}; + +class ARROW_EXPORT TrieBuilder { + using index_type = Trie::index_type; + using fast_index_type = Trie::fast_index_type; + + public: + TrieBuilder(); + Status Append(util::string_view s, bool allow_duplicate = false); + Trie Finish(); + + protected: + // Extend the lookup table by 256 entries, return the index of the new span + Status ExtendLookupTable(index_type* out_lookup_index); + // Split the node given by the index at the substring index `split_at` + Status SplitNode(fast_index_type node_index, fast_index_type split_at); + // Append an already constructed child node to the parent + Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node); + // Create a matching child node from this parent + Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring); + Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring); + + Trie trie_; + + static constexpr auto kMaxIndex = std::numeric_limits::max(); +}; + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_TRIE_H diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 91d1b08deefad..db8104659884b 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -252,6 +252,9 @@ cdef class ConvertOptions: column_types: dict, optional Map column names to column types (disabling type inference on those columns). + null_values: list, optional + A sequence of strings that denote nulls in the data + (defaults are appropriate in most cases). """ cdef: CCSVConvertOptions options @@ -259,12 +262,14 @@ cdef class ConvertOptions: # Avoid mistakingly creating attributes __slots__ = () - def __init__(self, check_utf8=None, column_types=None): + def __init__(self, check_utf8=None, column_types=None, null_values=None): self.options = CCSVConvertOptions.Defaults() if check_utf8 is not None: self.check_utf8 = check_utf8 if column_types is not None: self.column_types = column_types + if null_values is not None: + self.null_values = null_values @property def check_utf8(self): @@ -306,6 +311,17 @@ cdef class ConvertOptions: assert typ != NULL self.options.column_types[tobytes(k)] = typ + @property + def null_values(self): + """ + A sequence of strings that denote nulls in the data. 
+ """ + return [frombytes(x) for x in self.options.null_values] + + @null_values.setter + def null_values(self, value): + self.options.null_values = [tobytes(x) for x in value] + cdef _get_reader(input_file, shared_ptr[InputStream]* out): use_memory_map = False diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index f4629af0617fb..7ce03bf6eb80c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -954,6 +954,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions": c_bool check_utf8 unordered_map[c_string, shared_ptr[CDataType]] column_types + vector[c_string] null_values @staticmethod CCSVConvertOptions Defaults() diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index c5816de8a4203..14ba999fea77b 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -151,9 +151,17 @@ def test_convert_options(): with pytest.raises(TypeError): opts.column_types = 0 - opts = cls(check_utf8=False, column_types={'a': pa.null()}) + assert isinstance(opts.null_values, list) + assert '' in opts.null_values + assert 'N/A' in opts.null_values + opts.null_values = ['xxx', 'yyy'] + assert opts.null_values == ['xxx', 'yyy'] + + opts = cls(check_utf8=False, column_types={'a': pa.null()}, + null_values=['xxx', 'yyy']) assert opts.check_utf8 is False assert opts.column_types == {'a': pa.null()} + assert opts.null_values == ['xxx', 'yyy'] class BaseTestCSVRead: @@ -233,6 +241,34 @@ def test_simple_timestamps(self): 'b': [datetime(1970, 1, 1), datetime(1989, 7, 14)], } + def test_custom_nulls(self): + # Infer nulls with custom values + opts = ConvertOptions(null_values=['Xxx', 'Zzz']) + rows = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n" + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.null()), + ('b', pa.string()), + ('c', pa.string()), + ('d', pa.int64())]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [None, None], + 'b': [u"Xxx", u"#N/A"], + 'c': [u"1", u""], + 'd': [2, None], + } + + opts = ConvertOptions(null_values=[]) + rows = b"a,b\n#N/A,\n" + table = self.read_bytes(rows, convert_options=opts) + schema = pa.schema([('a', pa.string()), + ('b', pa.string())]) + assert table.schema == schema + assert table.to_pydict() == { + 'a': [u"#N/A"], + 'b': [u""], + } + def test_column_types(self): # Ask for specific column types in ConvertOptions opts = ConvertOptions(column_types={'b': 'float32', From 1a86ab51d8ee86e132645c9671f5355774b8f71b Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 20 Dec 2018 16:19:42 -0600 Subject: [PATCH 48/80] ARROW-3982: [C++] Allow "binary" input in simple JSON format Since rapidjson doesn't validate UTF8 by default, we can represent arbitrary binary bytes in the JSON input (bytes < 0x20 have to be represented as unicode escapes). 
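A minimal sketch of what this enables, modeled on the new cases in json-simple-test.cc (the ArrayFromJSON helper, its header location, and the gtest-style macros are assumptions taken from the tests in this patch, not a documented public API):

```cpp
#include "arrow/ipc/json-simple.h"  // assumed header for the test helper used below

// Inside a test body: binary() arrays can now be built from the simple JSON
// format. Bytes >= 0x20 may appear verbatim in the JSON string, while bytes
// below 0x20 must be written as \uXXXX unicode escapes.
std::shared_ptr<Array> array;
ASSERT_OK(ArrayFromJSON(binary(), "[\"foo\", \"\\u0000\\u001f\", null]", &array));
ASSERT_EQ(array->length(), 3);
```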
Author: Antoine Pitrou Closes #3222 from pitrou/ARROW-3982-json-simple-binary and squashes the following commits: 5aaa5edc8 ARROW-3982: Allow "binary" input in simple JSON format --- cpp/src/arrow/ipc/json-simple-test.cc | 40 ++++++++++++++++++++++++ cpp/src/arrow/ipc/json-simple.cc | 45 +++++++++++++++++++++++++-- cpp/src/arrow/pretty_print-test.cc | 11 ++----- 3 files changed, 84 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 84a2210157f53..2e80a0ca85822 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -289,6 +289,7 @@ TEST(TestDouble, Errors) { } TEST(TestString, Basics) { + // String type std::shared_ptr type = utf8(); std::shared_ptr expected, actual; @@ -300,6 +301,20 @@ TEST(TestString, Basics) { s += '\x00'; s += "char"; AssertJSONArray(type, "[\"\", \"some\\u0000char\"]", {"", s}); + // UTF8 sequence in string + AssertJSONArray(type, "[\"\xc3\xa9\"]", {"\xc3\xa9"}); + + // Binary type + type = binary(); + AssertJSONArray(type, "[\"\", \"foo\", null]", + {true, true, false}, {"", "foo", ""}); + // Arbitrary binary (non-UTF8) sequence in string + s = "\xff\x9f"; + AssertJSONArray(type, "[\"" + s + "\"]", {s}); + // Bytes < 0x20 can be represented as JSON unicode escapes + s = '\x00'; + s += "\x1f"; + AssertJSONArray(type, "[\"\\u0000\\u001f\"]", {s}); } TEST(TestString, Errors) { @@ -310,6 +325,31 @@ TEST(TestString, Errors) { ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); } +TEST(TestFixedSizeBinary, Basics) { + std::shared_ptr type = fixed_size_binary(3); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[\"foo\", \"bar\"]", + {"foo", "bar"}); + AssertJSONArray(type, "[null, \"foo\"]", + {false, true}, {"", "foo"}); + // Arbitrary binary (non-UTF8) sequence in string + std::string s = "\xff\x9f\xcc"; + AssertJSONArray(type, "[\"" + s + "\"]", {s}); +} + +TEST(TestFixedSizeBinary, Errors) { + std::shared_ptr type = fixed_size_binary(3); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); + // Invalid length + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"abcd\"]", &array)); +} + TEST(TestDecimal, Basics) { std::shared_ptr type = decimal(10, 4); std::shared_ptr expected, actual; diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index d812f841d9353..7a78fe4986cd5 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -41,7 +41,8 @@ using ::arrow::internal::checked_cast; static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; static Status JSONTypeError(const char* expected_type, rj::Type json_type) { - return Status::Invalid("Expected ", expected_type, " or null, got type ", json_type); + return Status::Invalid("Expected ", expected_type, " or null, got JSON type ", + json_type); } class Converter { @@ -91,7 +92,6 @@ class ConcreteConverter : public Converter { }; // TODO : dates and times? -// TODO : binary / fixed size binary? 
// ------------------------------------------------------------------------ // Converter for null arrays @@ -284,7 +284,7 @@ class DecimalConverter final : public ConcreteConverter { }; // ------------------------------------------------------------------------ -// Converter for string arrays +// Converter for binary and string arrays class StringConverter final : public ConcreteConverter { public: @@ -313,6 +313,43 @@ class StringConverter final : public ConcreteConverter { std::shared_ptr builder_; }; +// ------------------------------------------------------------------------ +// Converter for fixed-size binary arrays + +class FixedSizeBinaryConverter final + : public ConcreteConverter { + public: + explicit FixedSizeBinaryConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared(type, default_memory_pool()); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + if (json_obj.IsString()) { + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + if (view.length() != static_cast(builder_->byte_width())) { + std::stringstream ss; + ss << "Invalid string length " << view.length() << " in JSON input for " + << this->type_->ToString(); + return Status::Invalid(ss.str()); + } + return builder_->Append(view); + } else { + return JSONTypeError("string", json_obj.GetType()); + } + } + + std::shared_ptr builder() override { return builder_; } + + protected: + std::shared_ptr builder_; +}; + // ------------------------------------------------------------------------ // Converter for list arrays @@ -449,6 +486,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) default: { return Status::NotImplemented("JSON conversion to ", type->ToString(), diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index a1acfb81aeff1..8696efc735b8a 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -277,18 +277,11 @@ TEST_F(TestPrettyPrint, ListType) { TEST_F(TestPrettyPrint, FixedSizeBinaryType) { std::vector is_valid = {true, true, false, true, false}; - std::vector values = {"foo", "bar", "baz"}; - std::shared_ptr array; auto type = fixed_size_binary(3); - FixedSizeBinaryBuilder builder(type); - - ASSERT_OK(builder.Append(values[0])); - ASSERT_OK(builder.Append(values[1])); - ASSERT_OK(builder.Append(values[2])); - ASSERT_OK(builder.Finish(&array)); + auto array = ArrayFromJSON(type, "[\"foo\", \"bar\", null, \"baz\"]"); - static const char* ex = "[\n 666F6F,\n 626172,\n 62617A\n]"; + static const char* ex = "[\n 666F6F,\n 626172,\n null,\n 62617A\n]"; CheckArray(*array, {0, 10}, ex); static const char* ex_2 = " [\n 666F6F,\n ...\n 62617A\n ]"; CheckArray(*array, {2, 1}, ex_2); From 700bd40afab973d00229a43dff5ce764ed996873 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Thu, 20 Dec 2018 16:55:09 -0600 Subject: [PATCH 49/80] ARROW-4052: [C++] Linker errors with glog and gflags After #3196, a potential bug appears. 
If we use glog installed instead of downloading one at build time and the installed glog is linked to gflags, linker error can be occurred. I modified ThirdpartyToolchain.cmake to add a dependency from glog to gflag. Author: Kousuke Saruta Closes #3234 from sarutak/ARROW-4052 and squashes the following commits: 3c65cbee6 Modified ThirdpartyToolchain.cmake to add a dependency from glog to gflag --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 112 +++++++++++--------- 1 file changed, 61 insertions(+), 51 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 3381b5cda16b4..d8b34862eeaab 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -534,56 +534,11 @@ message(STATUS "double-conversion include dir: ${DOUBLE_CONVERSION_INCLUDE_DIR}" message(STATUS "double-conversion static library: ${DOUBLE_CONVERSION_STATIC_LIB}") # ---------------------------------------------------------------------- -# Google gtest & gflags - -if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS - OR ARROW_BUILD_BENCHMARKS) - if("${GTEST_HOME}" STREQUAL "") - if(APPLE) - set(GTEST_CMAKE_CXX_FLAGS "-fPIC -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") - elseif(NOT MSVC) - set(GTEST_CMAKE_CXX_FLAGS "-fPIC") - endif() - string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) - set(GTEST_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}} ${GTEST_CMAKE_CXX_FLAGS}") - - set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep") - set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include") - set(GTEST_STATIC_LIB - "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GTEST_MAIN_STATIC_LIB - "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GTEST_VENDORED 1) - set(GTEST_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} - -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}) - if (MSVC AND NOT ARROW_USE_STATIC_CRT) - set(GTEST_CMAKE_ARGS ${GTEST_CMAKE_ARGS} -Dgtest_force_shared_crt=ON) - endif() - - ExternalProject_Add(googletest_ep - URL ${GTEST_SOURCE_URL} - BUILD_BYPRODUCTS ${GTEST_STATIC_LIB} ${GTEST_MAIN_STATIC_LIB} - CMAKE_ARGS ${GTEST_CMAKE_ARGS} - ${EP_LOG_OPTIONS}) - else() - find_package(GTest REQUIRED) - set(GTEST_VENDORED 0) - endif() - - message(STATUS "GTest include dir: ${GTEST_INCLUDE_DIR}") - message(STATUS "GTest static library: ${GTEST_STATIC_LIB}") - include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(gtest - STATIC_LIB ${GTEST_STATIC_LIB}) - ADD_THIRDPARTY_LIB(gtest_main - STATIC_LIB ${GTEST_MAIN_STATIC_LIB}) - - if(GTEST_VENDORED) - add_dependencies(gtest_static googletest_ep) - add_dependencies(gtest_main_static googletest_ep) - endif() +# gflags +if(ARROW_BUILD_TESTS OR + ARROW_BUILD_BENCHMARKS OR + (ARROW_USE_GLOG AND GLOG_HOME)) # gflags (formerly Googleflags) command line parsing if("${GFLAGS_HOME}" STREQUAL "") set(GFLAGS_CMAKE_CXX_FLAGS ${EP_CXX_FLAGS}) @@ -636,6 +591,57 @@ if(ARROW_BUILD_TESTS OR ARROW_GANDIVA_BUILD_TESTS endif() endif() +# ---------------------------------------------------------------------- +# Google gtest + +if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) + if("${GTEST_HOME}" STREQUAL "") + if(APPLE) + set(GTEST_CMAKE_CXX_FLAGS "-fPIC -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") + elseif(NOT MSVC) + set(GTEST_CMAKE_CXX_FLAGS 
"-fPIC") + endif() + string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) + set(GTEST_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}} ${GTEST_CMAKE_CXX_FLAGS}") + + set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep") + set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include") + set(GTEST_STATIC_LIB + "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GTEST_MAIN_STATIC_LIB + "${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GTEST_VENDORED 1) + set(GTEST_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${GTEST_PREFIX} + -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS}) + if (MSVC AND NOT ARROW_USE_STATIC_CRT) + set(GTEST_CMAKE_ARGS ${GTEST_CMAKE_ARGS} -Dgtest_force_shared_crt=ON) + endif() + + ExternalProject_Add(googletest_ep + URL ${GTEST_SOURCE_URL} + BUILD_BYPRODUCTS ${GTEST_STATIC_LIB} ${GTEST_MAIN_STATIC_LIB} + CMAKE_ARGS ${GTEST_CMAKE_ARGS} + ${EP_LOG_OPTIONS}) + else() + find_package(GTest REQUIRED) + set(GTEST_VENDORED 0) + endif() + + message(STATUS "GTest include dir: ${GTEST_INCLUDE_DIR}") + message(STATUS "GTest static library: ${GTEST_STATIC_LIB}") + include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(gtest + STATIC_LIB ${GTEST_STATIC_LIB}) + ADD_THIRDPARTY_LIB(gtest_main + STATIC_LIB ${GTEST_MAIN_STATIC_LIB}) + + if(GTEST_VENDORED) + add_dependencies(gtest_static googletest_ep) + add_dependencies(gtest_main_static googletest_ep) + endif() +endif() + if(ARROW_BUILD_BENCHMARKS) if("$ENV{GBENCHMARK_HOME}" STREQUAL "") if(CMAKE_VERSION VERSION_LESS 3.6) @@ -1506,10 +1512,14 @@ if (ARROW_USE_GLOG) message(STATUS "Glog static library: ${GLOG_STATIC_LIB}") include_directories(SYSTEM ${GLOG_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(glog - STATIC_LIB ${GLOG_STATIC_LIB}) if (GLOG_VENDORED) + ADD_THIRDPARTY_LIB(glog + STATIC_LIB ${GLOG_STATIC_LIB}) add_dependencies(glog_static glog_ep) + else() + ADD_THIRDPARTY_LIB(glog + STATIC_LIB ${GLOG_STATIC_LIB} + DEPS gflags_static) endif() endif() From c6d97c59ef047cc9d5e2836b1945df26cd7c4622 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 20 Dec 2018 18:19:29 -0600 Subject: [PATCH 50/80] ARROW-4093: [C++] Fix wrong suggested method name Author: Kouhei Sutou Closes #3238 from kou/cpp-fix-typo and squashes the following commits: b5b880af9 Fix wrong suggested method name --- cpp/src/arrow/type.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 95b5189de0343..eb00f43caa172 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -525,7 +525,7 @@ class ARROW_EXPORT StructType : public NestedType { ARROW_DEPRECATED("Use GetFieldByName") std::shared_ptr GetChildByName(const std::string& name) const; - ARROW_DEPRECATED("Use GetChildIndex") + ARROW_DEPRECATED("Use GetFieldIndex") int GetChildIndex(const std::string& name) const; private: From 747590afc84481f61ead4d4c14e25ff9b79213f6 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sat, 22 Dec 2018 00:31:47 +0900 Subject: [PATCH 51/80] ARROW-4011: [Gandiva] Install irhelpers.bc and use it If we don't install irhelpers.bc, users need to keep build directory that has irhelpers.bc. 
Author: Kouhei Sutou Closes #3232 from kou/gandiva-use-installed-bc and squashes the following commits: 5a0c6228 Adjust irhelper.bc path in Java 829212c4 Adjust irhelper.bc path in Java ea9c6b36 Adjust irhelper.bc path in Java cb3d473b Adjust irhelper.bc path in Java ab60eda9 Remove "gandiva_" prefix and put built file to current binary dir 934e258c Add "gandiva_" prefix 7ff4cf24 Define GANDIVA_BYTE_COMPILE_FILE_PATH for all build ad615b4b Install irhelpers.bc and use it --- cpp/src/gandiva/CMakeLists.txt | 9 +- cpp/src/gandiva/bc_file_path.cc.in | 2 +- cpp/src/gandiva/engine_llvm_test.cc | 5 +- cpp/src/gandiva/llvm_generator_test.cc | 7 +- cpp/src/gandiva/tests/binary_test.cc | 2 +- cpp/src/gandiva/tests/boolean_expr_test.cc | 12 +-- cpp/src/gandiva/tests/date_time_test.cc | 16 ++-- cpp/src/gandiva/tests/filter_test.cc | 20 ++-- cpp/src/gandiva/tests/hash_test.cc | 6 +- cpp/src/gandiva/tests/huge_table_test.cc | 4 +- cpp/src/gandiva/tests/if_expr_test.cc | 12 +-- cpp/src/gandiva/tests/in_expr_test.cc | 6 +- cpp/src/gandiva/tests/literal_test.cc | 12 +-- cpp/src/gandiva/tests/micro_benchmarks.cc | 16 ++-- cpp/src/gandiva/tests/null_validity_test.cc | 6 +- .../tests/projector_build_validation_test.cc | 22 ++--- cpp/src/gandiva/tests/projector_test.cc | 95 ++++++------------- cpp/src/gandiva/tests/test_util.h | 6 ++ cpp/src/gandiva/tests/utf8_test.cc | 19 ++-- java/gandiva/pom.xml | 2 +- 20 files changed, 131 insertions(+), 148 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 23ad93e201e71..d28c372a9e6ab 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -27,14 +27,18 @@ find_package(LLVM) # Set the path where the byte-code files will be installed. set(GANDIVA_BC_INSTALL_DIR - ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gandiva) + ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/gandiva) set(GANDIVA_BC_FILE_NAME irhelpers.bc) set(GANDIVA_BC_INSTALL_PATH ${GANDIVA_BC_INSTALL_DIR}/${GANDIVA_BC_FILE_NAME}) -set(GANDIVA_BC_OUTPUT_PATH ${BUILD_OUTPUT_ROOT_DIRECTORY}/${GANDIVA_BC_FILE_NAME}) +set(GANDIVA_BC_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR}/${GANDIVA_BC_FILE_NAME}) +install(FILES + ${GANDIVA_BC_OUTPUT_PATH} + DESTINATION ${GANDIVA_BC_INSTALL_DIR}) set(BC_FILE_PATH_CC "${CMAKE_CURRENT_BINARY_DIR}/bc_file_path.cc") configure_file(bc_file_path.cc.in ${BC_FILE_PATH_CC}) +add_definitions(-DGANDIVA_BYTE_COMPILE_FILE_PATH="${GANDIVA_BC_OUTPUT_PATH}") set(SRC_FILES annotator.cc bitmap_accumulator.cc @@ -59,7 +63,6 @@ set(SRC_FILES annotator.cc selection_vector.cc tree_expr_builder.cc to_date_holder.cc - ${SHARED_HELPER_FILES} ${BC_FILE_PATH_CC}) set(GANDIVA_SHARED_PRIVATE_LINK_LIBS diff --git a/cpp/src/gandiva/bc_file_path.cc.in b/cpp/src/gandiva/bc_file_path.cc.in index d6b4e342b6714..54e81ca2bfa18 100644 --- a/cpp/src/gandiva/bc_file_path.cc.in +++ b/cpp/src/gandiva/bc_file_path.cc.in @@ -18,6 +18,6 @@ namespace gandiva { // Path to the byte-code file. 
-extern const char kByteCodeFilePath[] = "${GANDIVA_BC_OUTPUT_PATH}"; +extern const char kByteCodeFilePath[] = "${GANDIVA_BC_INSTALL_PATH}"; } // namespace gandiva diff --git a/cpp/src/gandiva/engine_llvm_test.cc b/cpp/src/gandiva/engine_llvm_test.cc index fe4f82e19320c..627c385f97363 100644 --- a/cpp/src/gandiva/engine_llvm_test.cc +++ b/cpp/src/gandiva/engine_llvm_test.cc @@ -19,6 +19,7 @@ #include #include "gandiva/llvm_types.h" +#include "gandiva/tests/test_util.h" namespace gandiva { @@ -100,7 +101,7 @@ llvm::Function* TestEngine::BuildVecAdd(Engine* engine, LLVMTypes* types) { TEST_F(TestEngine, TestAddUnoptimised) { std::unique_ptr engine; - Status status = Engine::Make(ConfigurationBuilder::DefaultConfiguration(), &engine); + auto status = Engine::Make(TestConfiguration(), &engine); EXPECT_TRUE(status.ok()) << status.message(); LLVMTypes types(*engine->context()); llvm::Function* ir_func = BuildVecAdd(engine.get(), &types); @@ -115,7 +116,7 @@ TEST_F(TestEngine, TestAddUnoptimised) { TEST_F(TestEngine, TestAddOptimised) { std::unique_ptr engine; - Status status = Engine::Make(ConfigurationBuilder::DefaultConfiguration(), &engine); + auto status = Engine::Make(TestConfiguration(), &engine); EXPECT_TRUE(status.ok()) << status.message(); LLVMTypes types(*engine->context()); llvm::Function* ir_func = BuildVecAdd(engine.get(), &types); diff --git a/cpp/src/gandiva/llvm_generator_test.cc b/cpp/src/gandiva/llvm_generator_test.cc index 818c7912150a9..fed6339314850 100644 --- a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -26,6 +26,7 @@ #include "gandiva/expression.h" #include "gandiva/func_descriptor.h" #include "gandiva/function_registry.h" +#include "gandiva/tests/test_util.h" namespace gandiva { @@ -39,8 +40,7 @@ class TestLLVMGenerator : public ::testing::Test { // Verify that a valid pc function exists for every function in the registry. TEST_F(TestLLVMGenerator, VerifyPCFunctions) { std::unique_ptr generator; - Status status = - LLVMGenerator::Make(ConfigurationBuilder::DefaultConfiguration(), &generator); + auto status = LLVMGenerator::Make(TestConfiguration(), &generator); EXPECT_TRUE(status.ok()) << status.message(); llvm::Module* module = generator->module(); @@ -54,8 +54,7 @@ TEST_F(TestLLVMGenerator, VerifyPCFunctions) { TEST_F(TestLLVMGenerator, TestAdd) { // Setup LLVM generator to do an arithmetic add of two vectors std::unique_ptr generator; - Status status = - LLVMGenerator::Make(ConfigurationBuilder::DefaultConfiguration(), &generator); + auto status = LLVMGenerator::Make(TestConfiguration(), &generator); EXPECT_TRUE(status.ok()); Annotator annotator; diff --git a/cpp/src/gandiva/tests/binary_test.cc b/cpp/src/gandiva/tests/binary_test.cc index d5d99db910b9d..6ac3c5155196e 100644 --- a/cpp/src/gandiva/tests/binary_test.cc +++ b/cpp/src/gandiva/tests/binary_test.cc @@ -61,7 +61,7 @@ TEST_F(TestBinary, TestSimple) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/boolean_expr_test.cc b/cpp/src/gandiva/tests/boolean_expr_test.cc index 3351ab3ccf3ff..9226f357159c6 100644 --- a/cpp/src/gandiva/tests/boolean_expr_test.cc +++ b/cpp/src/gandiva/tests/boolean_expr_test.cc @@ -60,7 +60,7 @@ TEST_F(TestBooleanExpr, SimpleAnd) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // FALSE_VALID && ? => FALSE_VALID @@ -133,7 +133,7 @@ TEST_F(TestBooleanExpr, SimpleOr) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // TRUE_VALID && ? => TRUE_VALID @@ -210,7 +210,7 @@ TEST_F(TestBooleanExpr, AndThree) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 8; @@ -257,7 +257,7 @@ TEST_F(TestBooleanExpr, OrThree) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 8; @@ -317,7 +317,7 @@ TEST_F(TestBooleanExpr, BooleanAndInsideIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 4; @@ -368,7 +368,7 @@ TEST_F(TestBooleanExpr, IfInsideBooleanAnd) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); int num_records = 4; diff --git a/cpp/src/gandiva/tests/date_time_test.cc b/cpp/src/gandiva/tests/date_time_test.cc index 3914558d716c7..643b8c8dda3ce 100644 --- a/cpp/src/gandiva/tests/date_time_test.cc +++ b/cpp/src/gandiva/tests/date_time_test.cc @@ -73,7 +73,8 @@ TEST_F(TestProjector, TestIsNull) { auto isnotnull_expr = TreeExprBuilder::MakeExpression("isnotnull", {t0}, b0); std::shared_ptr projector; - Status status = Projector::Make(schema, {isnull_expr, isnotnull_expr}, &projector); + auto status = Projector::Make(schema, {isnull_expr, isnotnull_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); int num_records = 4; @@ -126,8 +127,9 @@ TEST_F(TestProjector, TestDateTime) { auto ts2day_expr = TreeExprBuilder::MakeExpression("extractDay", {field2}, field_day); std::shared_ptr projector; - Status status = Projector::Make( - schema, {date2year_expr, date2month_expr, ts2month_expr, ts2day_expr}, &projector); + auto status = Projector::Make( + schema, {date2year_expr, date2month_expr, ts2month_expr, ts2day_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); struct tm y1970; @@ -196,7 +198,8 @@ TEST_F(TestProjector, TestTime) { TreeExprBuilder::MakeExpression("extractHour", {field0}, field_hour); std::shared_ptr projector; - Status status = Projector::Make(schema, {time2min_expr, time2hour_expr}, &projector); + auto status = Projector::Make(schema, {time2min_expr, time2hour_expr}, + TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); // create input data @@ -264,7 +267,7 @@ TEST_F(TestProjector, TestTimestampDiff) { std::shared_ptr projector; auto exprs = {diff_secs_expr, diff_mins_expr, diff_hours_expr, diff_days_expr, diff_weeks_expr, diff_months_expr, diff_quarters_expr, diff_years_expr}; - Status status = Projector::Make(schema, exprs, &projector); + auto status = Projector::Make(schema, exprs, TestConfiguration(), &projector); ASSERT_TRUE(status.ok()); struct tm y1970; @@ -337,7 +340,8 @@ TEST_F(TestProjector, TestMonthsBetween) { TreeExprBuilder::MakeExpression("months_between", {f0, f1}, output); std::shared_ptr projector; - Status status = Projector::Make(schema, {months_between_expr}, &projector); + auto status = + Projector::Make(schema, {months_between_expr}, TestConfiguration(), &projector); std::cout << status.message(); ASSERT_TRUE(status.ok()); diff --git a/cpp/src/gandiva/tests/filter_test.cc b/cpp/src/gandiva/tests/filter_test.cc index f95cdcc3fef9c..ee60388d5dc1f 100644 --- a/cpp/src/gandiva/tests/filter_test.cc +++ b/cpp/src/gandiva/tests/filter_test.cc @@ -50,14 +50,15 @@ TEST_F(TestFilter, TestFilterCache) { auto less_than_10 = TreeExprBuilder::MakeFunction("less_than", {sum_func, literal_10}, arrow::boolean()); auto condition = TreeExprBuilder::MakeCondition(less_than_10); + auto configuration = TestConfiguration(); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, configuration, &filter); EXPECT_TRUE(status.ok()); // same schema and condition, should return the same filter as above. 
std::shared_ptr cached_filter; - status = Filter::Make(schema, condition, &cached_filter); + status = Filter::Make(schema, condition, configuration, &cached_filter); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() == filter.get()); @@ -65,7 +66,8 @@ TEST_F(TestFilter, TestFilterCache) { auto field2 = field("f2", int32()); auto different_schema = arrow::schema({field0, field1, field2}); std::shared_ptr should_be_new_filter; - status = Filter::Make(different_schema, condition, &should_be_new_filter); + status = + Filter::Make(different_schema, condition, configuration, &should_be_new_filter); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() != should_be_new_filter.get()); @@ -74,7 +76,7 @@ TEST_F(TestFilter, TestFilterCache) { "greater_than", {sum_func, literal_10}, arrow::boolean()); auto new_condition = TreeExprBuilder::MakeCondition(greater_than_10); std::shared_ptr should_be_new_filter1; - status = Filter::Make(schema, new_condition, &should_be_new_filter1); + status = Filter::Make(schema, new_condition, configuration, &should_be_new_filter1); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_filter.get() != should_be_new_filter1.get()); } @@ -96,7 +98,7 @@ TEST_F(TestFilter, TestSimple) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -134,7 +136,7 @@ TEST_F(TestFilter, TestSimpleCustomConfig) { std::shared_ptr config = config_builder.build(); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -168,7 +170,7 @@ TEST_F(TestFilter, TestZeroCopy) { auto condition = TreeExprBuilder::MakeCondition("isnotnull", {field0}); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -208,7 +210,7 @@ TEST_F(TestFilter, TestZeroCopyNegative) { auto condition = TreeExprBuilder::MakeCondition("isnotnull", {field0}); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -265,7 +267,7 @@ TEST_F(TestFilter, TestSimpleSVInt32) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/hash_test.cc b/cpp/src/gandiva/tests/hash_test.cc index 96f92284a5ca1..afaa885dfe26b 100644 --- a/cpp/src/gandiva/tests/hash_test.cc +++ b/cpp/src/gandiva/tests/hash_test.cc @@ -61,7 +61,8 @@ TEST_F(TestHash, TestSimple) { // Build a projector for the expression. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_0, expr_1}, &projector); + auto status = + Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -113,7 +114,8 @@ TEST_F(TestHash, TestBuf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_0, expr_1}, &projector); + auto status = + Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/huge_table_test.cc b/cpp/src/gandiva/tests/huge_table_test.cc index bffcb1994707f..cecf290a1439f 100644 --- a/cpp/src/gandiva/tests/huge_table_test.cc +++ b/cpp/src/gandiva/tests/huge_table_test.cc @@ -58,7 +58,7 @@ TEST_F(DISABLED_TestHugeProjector, SimpleTestSumHuge) { // Build expression auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum); std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr}, &projector); + auto status = Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -136,7 +136,7 @@ TEST_F(DISABLED_TestHugeFilter, TestSimpleHugeFilter) { auto condition = TreeExprBuilder::MakeCondition(less_than_50); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // prepare input record batch diff --git a/cpp/src/gandiva/tests/if_expr_test.cc b/cpp/src/gandiva/tests/if_expr_test.cc index 93b35673b9467..54b6d43b4df1c 100644 --- a/cpp/src/gandiva/tests/if_expr_test.cc +++ b/cpp/src/gandiva/tests/if_expr_test.cc @@ -61,7 +61,7 @@ TEST_F(TestIfExpr, TestSimple) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -110,7 +110,7 @@ TEST_F(TestIfExpr, TestSimpleArithmetic) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -165,7 +165,7 @@ TEST_F(TestIfExpr, TestNested) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -228,7 +228,7 @@ TEST_F(TestIfExpr, TestNestedInIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -296,7 +296,7 @@ TEST_F(TestIfExpr, TestNestedInCondition) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -353,7 +353,7 @@ TEST_F(TestIfExpr, TestBigNested) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/in_expr_test.cc b/cpp/src/gandiva/tests/in_expr_test.cc index 13ef97cfb8814..2103874cb1e2c 100644 --- a/cpp/src/gandiva/tests/in_expr_test.cc +++ b/cpp/src/gandiva/tests/in_expr_test.cc @@ -51,7 +51,7 @@ TEST_F(TestIn, TestInSimple) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -88,7 +88,7 @@ TEST_F(TestIn, TestInString) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -125,7 +125,7 @@ TEST_F(TestIn, TestInStringValidationError) { auto condition = TreeExprBuilder::MakeCondition(in_expr); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Evaluation expression for IN clause returns "; diff --git a/cpp/src/gandiva/tests/literal_test.cc b/cpp/src/gandiva/tests/literal_test.cc index ced66452a2d45..53323cb4e7cbb 100644 --- a/cpp/src/gandiva/tests/literal_test.cc +++ b/cpp/src/gandiva/tests/literal_test.cc @@ -88,8 +88,8 @@ TEST_F(TestLiteral, TestSimpleArithmetic) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = - Projector::Make(schema, {expr_a, expr_b, expr_c, expr_d, expr_e}, &projector); + auto status = Projector::Make(schema, {expr_a, expr_b, expr_c, expr_d, expr_e}, + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -133,7 +133,7 @@ TEST_F(TestLiteral, TestLiteralHash) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); auto res1 = field("a", int64()); @@ -142,7 +142,7 @@ TEST_F(TestLiteral, TestLiteralHash) { // Build a projector for the expressions. std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = Projector::Make(schema, {expr1}, TestConfiguration(), &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector.get() != projector1.get()); } @@ -165,7 +165,7 @@ TEST_F(TestLiteral, TestNullLiteral) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -207,7 +207,7 @@ TEST_F(TestLiteral, TestNullLiteralInIf) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/micro_benchmarks.cc b/cpp/src/gandiva/tests/micro_benchmarks.cc index 7d844eb378bf8..ce86bf0612402 100644 --- a/cpp/src/gandiva/tests/micro_benchmarks.cc +++ b/cpp/src/gandiva/tests/micro_benchmarks.cc @@ -56,7 +56,7 @@ static void TimedTestAdd3(benchmark::State& state) { auto sum_expr = TreeExprBuilder::MakeExpression(sum, field_sum); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {sum_expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {sum_expr}, TestConfiguration(), &projector)); Int64DataGenerator data_generator; ProjectEvaluator evaluator(projector); @@ -99,7 +99,7 @@ static void TimedTestBigNested(benchmark::State& state) { // Build a projector for the expressions. std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); BoundedInt32DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); @@ -122,7 +122,7 @@ static void TimedTestExtractYear(benchmark::State& state) { auto expr = TreeExprBuilder::MakeExpression("extractYear", {field0}, field_res); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); Int64DataGenerator data_generator; ProjectEvaluator evaluator(projector); @@ -149,7 +149,7 @@ static void TimedTestFilterAdd2(benchmark::State& state) { auto condition = TreeExprBuilder::MakeCondition(less_than); std::shared_ptr filter; - ASSERT_OK(Filter::Make(schema, condition, &filter)); + ASSERT_OK(Filter::Make(schema, condition, TestConfiguration(), &filter)); Int64DataGenerator data_generator; FilterEvaluator evaluator(filter); @@ -173,7 +173,7 @@ static void TimedTestFilterLike(benchmark::State& state) { auto condition = TreeExprBuilder::MakeCondition(like_yellow); std::shared_ptr filter; - ASSERT_OK(Filter::Make(schema, condition, &filter)); + ASSERT_OK(Filter::Make(schema, condition, TestConfiguration(), &filter)); FastUtf8DataGenerator data_generator(32); FilterEvaluator evaluator(filter); @@ -199,7 +199,7 @@ static void TimedTestAllocs(benchmark::State& state) { auto expr = TreeExprBuilder::MakeExpression(length, field_res); std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(64); ProjectEvaluator evaluator(projector); @@ -237,7 +237,7 @@ static void TimedTestMultiOr(benchmark::State& state) { // Build a projector for the expressions. 
std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); @@ -269,7 +269,7 @@ static void TimedTestInExpr(benchmark::State& state) { // Build a projector for the expressions. std::shared_ptr projector; - ASSERT_OK(Projector::Make(schema, {expr}, &projector)); + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); FastUtf8DataGenerator data_generator(250); ProjectEvaluator evaluator(projector); diff --git a/cpp/src/gandiva/tests/null_validity_test.cc b/cpp/src/gandiva/tests/null_validity_test.cc index 06cfdc08ba906..0374b68d46288 100644 --- a/cpp/src/gandiva/tests/null_validity_test.cc +++ b/cpp/src/gandiva/tests/null_validity_test.cc @@ -60,7 +60,7 @@ TEST_F(TestNullValidity, TestFunc) { auto condition = TreeExprBuilder::MakeCondition(less_than_10); std::shared_ptr filter; - Status status = Filter::Make(schema, condition, &filter); + auto status = Filter::Make(schema, condition, TestConfiguration(), &filter); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -111,7 +111,7 @@ TEST_F(TestNullValidity, TestIfElse) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -148,7 +148,7 @@ TEST_F(TestNullValidity, TestUtf8) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/projector_build_validation_test.cc b/cpp/src/gandiva/tests/projector_build_validation_test.cc index 18f02957fd479..6c4eef53ded68 100644 --- a/cpp/src/gandiva/tests/projector_build_validation_test.cc +++ b/cpp/src/gandiva/tests/projector_build_validation_test.cc @@ -50,7 +50,7 @@ TEST_F(TestProjector, TestNonExistentFunction) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Function bool non_existent_function(float, float) not supported yet."; @@ -71,7 +71,7 @@ TEST_F(TestProjector, TestNotMatchingDataType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Return type of root node float does not match that of expression bool"; @@ -92,7 +92,7 @@ TEST_F(TestProjector, TestNotSupportedDataType) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field f0 has unsupported data type list"; EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); @@ -113,7 +113,7 @@ TEST_F(TestProjector, TestIncorrectSchemaMissingField) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field f2 not in schema"; EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); @@ -135,7 +135,7 @@ TEST_F(TestProjector, TestIncorrectSchemaTypeNotMatching) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); std::string expected_error = "Field definition in schema f2: int32 different from field in expression f2: float"; @@ -166,7 +166,7 @@ TEST_F(TestProjector, TestIfNotSupportedFunction) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -189,7 +189,7 @@ TEST_F(TestProjector, TestIfNotMatchingReturnType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -214,7 +214,7 @@ TEST_F(TestProjector, TestElseNotMatchingReturnType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -239,7 +239,7 @@ TEST_F(TestProjector, TestElseNotSupportedType) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); EXPECT_EQ(status.code(), StatusCode::ExpressionValidationError); } @@ -259,7 +259,7 @@ TEST_F(TestProjector, TestAndMinChildren) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } @@ -280,7 +280,7 @@ TEST_F(TestProjector, TestAndBooleanArgType) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); } diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 61d9dc3ad1629..1aeb43b49b0dc 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -50,14 +50,17 @@ TEST_F(TestProjector, TestProjectCache) { auto sub_expr = TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); + auto configuration = TestConfiguration(); + std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr, sub_expr}, &projector); + auto status = Projector::Make(schema, {sum_expr, sub_expr}, configuration, &projector); EXPECT_TRUE(status.ok()); // everything is same, should return the same projector. auto schema_same = arrow::schema({field0, field1}); std::shared_ptr cached_projector; - status = Projector::Make(schema_same, {sum_expr, sub_expr}, &cached_projector); + status = Projector::Make(schema_same, {sum_expr, sub_expr}, configuration, + &cached_projector); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_projector.get() == projector.get()); @@ -65,14 +68,14 @@ TEST_F(TestProjector, TestProjectCache) { auto field2 = field("f2", int32()); auto different_schema = arrow::schema({field0, field1, field2}); std::shared_ptr should_be_new_projector; - status = - Projector::Make(different_schema, {sum_expr, sub_expr}, &should_be_new_projector); + status = Projector::Make(different_schema, {sum_expr, sub_expr}, configuration, + &should_be_new_projector); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_projector.get() != should_be_new_projector.get()); // expression list is different should return a new projector. 
std::shared_ptr should_be_new_projector1; - status = Projector::Make(schema, {sum_expr}, &should_be_new_projector1); + status = Projector::Make(schema, {sum_expr}, configuration, &should_be_new_projector1); EXPECT_TRUE(status.ok()); EXPECT_TRUE(cached_projector.get() != should_be_new_projector1.get()); } @@ -90,12 +93,13 @@ TEST_F(TestProjector, TestProjectCacheFieldNames) { auto sum_expr_01 = TreeExprBuilder::MakeExpression("add", {field0, field1}, sum_01); std::shared_ptr projector_01; - Status status = Projector::Make(schema, {sum_expr_01}, &projector_01); + auto status = + Projector::Make(schema, {sum_expr_01}, TestConfiguration(), &projector_01); EXPECT_TRUE(status.ok()); auto sum_expr_12 = TreeExprBuilder::MakeExpression("add", {field1, field2}, sum_12); std::shared_ptr projector_12; - status = Projector::Make(schema, {sum_expr_12}, &projector_12); + status = Projector::Make(schema, {sum_expr_12}, TestConfiguration(), &projector_12); EXPECT_TRUE(status.ok()); // add(f0, f1) != add(f1, f2) @@ -111,14 +115,16 @@ TEST_F(TestProjector, TestProjectCacheDouble) { auto literal0 = TreeExprBuilder::MakeLiteral(d0); auto expr0 = TreeExprBuilder::MakeExpression(literal0, res); + auto configuration = TestConfiguration(); + std::shared_ptr projector0; - auto status = Projector::Make(schema, {expr0}, &projector0); + auto status = Projector::Make(schema, {expr0}, configuration, &projector0); EXPECT_TRUE(status.ok()) << status.message(); auto literal1 = TreeExprBuilder::MakeLiteral(d1); auto expr1 = TreeExprBuilder::MakeExpression(literal1, res); std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = Projector::Make(schema, {expr1}, configuration, &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector0.get() != projector1.get()); @@ -134,13 +140,13 @@ TEST_F(TestProjector, TestProjectCacheFloat) { auto literal0 = TreeExprBuilder::MakeLiteral(f0); auto expr0 = TreeExprBuilder::MakeExpression(literal0, res); std::shared_ptr projector0; - auto status = Projector::Make(schema, {expr0}, &projector0); + auto status = Projector::Make(schema, {expr0}, TestConfiguration(), &projector0); EXPECT_TRUE(status.ok()) << status.message(); auto literal1 = TreeExprBuilder::MakeLiteral(f1); auto expr1 = TreeExprBuilder::MakeExpression(literal1, res); std::shared_ptr projector1; - status = Projector::Make(schema, {expr1}, &projector1); + status = Projector::Make(schema, {expr1}, TestConfiguration(), &projector1); EXPECT_TRUE(status.ok()) << status.message(); EXPECT_TRUE(projector0.get() != projector1.get()); @@ -162,50 +168,8 @@ TEST_F(TestProjector, TestIntSumSub) { TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); std::shared_ptr projector; - Status status = Projector::Make(schema, {sum_expr, sub_expr}, &projector); - EXPECT_TRUE(status.ok()); - - // Create a row-batch with some sample data - int num_records = 4; - auto array0 = MakeArrowArrayInt32({1, 2, 3, 4}, {true, true, true, false}); - auto array1 = MakeArrowArrayInt32({11, 13, 15, 17}, {true, true, false, true}); - // expected output - auto exp_sum = MakeArrowArrayInt32({12, 15, 0, 0}, {true, true, false, false}); - auto exp_sub = MakeArrowArrayInt32({-10, -11, 0, 0}, {true, true, false, false}); - - // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1}); - - // Evaluate expression - arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); - EXPECT_TRUE(status.ok()); - - // Validate 
results - EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0)); - EXPECT_ARROW_ARRAY_EQUALS(exp_sub, outputs.at(1)); -} - -TEST_F(TestProjector, TestIntSumSubCustomConfig) { - // schema for input fields - auto field0 = field("f0", int32()); - auto field1 = field("f2", int32()); - auto schema = arrow::schema({field0, field1}); - - // output fields - auto field_sum = field("add", int32()); - auto field_sub = field("subtract", int32()); - - // Build expression - auto sum_expr = TreeExprBuilder::MakeExpression("add", {field0, field1}, field_sum); - auto sub_expr = - TreeExprBuilder::MakeExpression("subtract", {field0, field1}, field_sub); - - std::shared_ptr projector; - ConfigurationBuilder config_builder; - std::shared_ptr config = config_builder.build(); - - Status status = Projector::Make(schema, {sum_expr, sub_expr}, config, &projector); + auto status = + Projector::Make(schema, {sum_expr, sub_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -257,8 +221,9 @@ static void TestArithmeticOpsForType(arrow::MemoryPool* pool) { auto lt_expr = TreeExprBuilder::MakeExpression("less_than", {field0, field1}, field_lt); std::shared_ptr projector; - Status status = Projector::Make( - schema, {sum_expr, sub_expr, mul_expr, div_expr, eq_expr, lt_expr}, &projector); + auto status = + Projector::Make(schema, {sum_expr, sub_expr, mul_expr, div_expr, eq_expr, lt_expr}, + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -344,9 +309,9 @@ TEST_F(TestProjector, TestExtendedMath) { TreeExprBuilder::MakeExpression("power", {field0, field1}, field_power); std::shared_ptr projector; - Status status = Projector::Make( + auto status = Projector::Make( schema, {cbrt_expr, exp_expr, log_expr, log10_expr, logb_expr, power_expr}, - &projector); + TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -412,7 +377,7 @@ TEST_F(TestProjector, TestFloatLessThan) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {lt_expr}, &projector); + auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -447,7 +412,7 @@ TEST_F(TestProjector, TestIsNotNull) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {myexpr}, &projector); + auto status = Projector::Make(schema, {myexpr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -480,7 +445,7 @@ TEST_F(TestProjector, TestZeroCopy) { auto cast_expr = TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res); std::shared_ptr projector; - Status status = Projector::Make(schema, {cast_expr}, &projector); + auto status = Projector::Make(schema, {cast_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -527,7 +492,7 @@ TEST_F(TestProjector, TestZeroCopyNegative) { auto cast_expr = TreeExprBuilder::MakeExpression("castFLOAT4", {field0}, res); std::shared_ptr projector; - Status status = Projector::Make(schema, {cast_expr}, &projector); + auto status = Projector::Make(schema, {cast_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data @@ -597,7 +562,7 @@ TEST_F(TestProjector, TestDivideZero) { auto div_expr = TreeExprBuilder::MakeExpression("divide", {field0, field1}, field_div); std::shared_ptr projector; - Status status = Projector::Make(schema, {div_expr}, &projector); + auto status = Projector::Make(schema, {div_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -646,7 +611,7 @@ TEST_F(TestProjector, TestModZero) { auto mod_expr = TreeExprBuilder::MakeExpression("mod", {field0, field1}, field_div); std::shared_ptr projector; - Status status = Projector::Make(schema, {mod_expr}, &projector); + auto status = Projector::Make(schema, {mod_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/cpp/src/gandiva/tests/test_util.h b/cpp/src/gandiva/tests/test_util.h index d24448727bd83..72b45b124b8dd 100644 --- a/cpp/src/gandiva/tests/test_util.h +++ b/cpp/src/gandiva/tests/test_util.h @@ -73,6 +73,12 @@ static ArrayPtr MakeArrowTypeArray(const std::shared_ptr& type, EXPECT_TRUE((a)->Equals(b)) << "expected array: " << (a)->ToString() \ << " actual array: " << (b)->ToString(); +std::shared_ptr TestConfiguration() { + auto builder = ConfigurationBuilder(); + builder.set_byte_code_file_path(GANDIVA_BYTE_COMPILE_FILE_PATH); + return builder.build(); +} + } // namespace gandiva #endif // GANDIVA_TEST_UTIL_H diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index 8b09b72f32d03..925ceea836280 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -67,7 +67,8 @@ TEST_F(TestUtf8, TestSimple) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr_a, expr_b, expr_c}, &projector); + auto status = + Projector::Make(schema, {expr_a, expr_b, expr_c}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -113,7 +114,7 @@ TEST_F(TestUtf8, TestLiteral) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -155,7 +156,7 @@ TEST_F(TestUtf8, TestNullLiteral) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -197,7 +198,7 @@ TEST_F(TestUtf8, TestLike) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -245,7 +246,7 @@ TEST_F(TestUtf8, TestBeginsEnds) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr1, expr2}, &projector); + auto status = Projector::Make(schema, {expr1, expr2}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -291,7 +292,7 @@ TEST_F(TestUtf8, TestInternalAllocs) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -334,7 +335,7 @@ TEST_F(TestUtf8, TestCastDate) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -389,7 +390,7 @@ TEST_F(TestUtf8, TestToDateNoError) { // Build a projector for the expressions. std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data @@ -444,7 +445,7 @@ TEST_F(TestUtf8, TestToDateError) { // Build a projector for the expressions. 
std::shared_ptr projector; - Status status = Projector::Make(schema, {expr}, &projector); + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 39752e2d36913..d365eb9193ac1 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -133,7 +133,7 @@ - ${gandiva.cpp.build.dir} + ${gandiva.cpp.build.dir}/../src/gandiva irhelpers.bc From 48dd1724ddf2354391f6b7b3fdb043ab780b2c27 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 21 Dec 2018 10:36:55 -0600 Subject: [PATCH 52/80] ARROW-2970: [Python] Support conversions of NumPy string arrays requiring chunked binary output Author: Wes McKinney Closes #3240 from wesm/ARROW-2970 and squashes the following commits: 8b04eb3c4 Make the test data a bit more diverse 60d35f0e4 Use internal::ChunkedBinaryBuilder for converting NumPy string/binary array to Arrow --- cpp/src/arrow/python/CMakeLists.txt | 2 ++ cpp/src/arrow/python/numpy_to_arrow.cc | 12 ++++++++--- python/pyarrow/tests/test_array.py | 30 ++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index cccbf09d4fb4d..0f037ad4b0571 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -77,6 +77,8 @@ ADD_ARROW_LIB(arrow_python EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" ) +add_dependencies(arrow_python ${ARROW_PYTHON_LIBRARIES}) + foreach(LIB_TARGET ${ARROW_PYTHON_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_PYTHON_EXPORTING) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 461a085722243..aa28b6e870834 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -534,8 +534,11 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d return Status::OK(); } +// Create 16MB chunks for binary data +constexpr int32_t kBinaryChunksize = 1 << 24; + Status NumPyConverter::Visit(const BinaryType& type) { - BinaryBuilder builder(pool_); + ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_); auto data = reinterpret_cast(PyArray_DATA(arr_)); @@ -564,9 +567,12 @@ Status NumPyConverter::Visit(const BinaryType& type) { } } - std::shared_ptr result; + ArrayVector result; RETURN_NOT_OK(builder.Finish(&result)); - return PushArray(result->data()); + for (auto arr : result) { + RETURN_NOT_OK(PushArray(arr->data())); + } + return Status::OK(); } Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 95a60435e3460..352c8558c881b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1268,3 +1268,33 @@ def test_array_from_numpy_str_utf8(): with pytest.raises(ValueError): pa.array(vec, pa.string(), mask=np.array([False])) + + +@pytest.mark.large_memory +def test_numpy_string_overflow_to_chunked(): + # ARROW-3762 + + # 2^31 + 1 bytes + values = [b'x'] + + # Make 10 unique 1MB strings then repeat then 2048 times + unique_strings = { + i: b'x' * ((1 << 20) - 1) + str(i % 10).encode('utf8') + for i in range(10) + } + values += [unique_strings[i % 10] for i in range(1 << 11)] + + arr = np.array(values) + arrow_arr = pa.array(arr) + + assert isinstance(arrow_arr, pa.ChunkedArray) + + # Split up into 16MB chunks. 
128 * 16 = 2048, so 129 + assert arrow_arr.num_chunks == 129 + + value_index = 0 + for i in range(arrow_arr.num_chunks): + chunk = arrow_arr.chunk(i) + for val in chunk: + assert val.as_py() == values[value_index] + value_index += 1 From 7ebd7b3aaa5646af8bf9707a590daf29d384cf1d Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sat, 22 Dec 2018 21:09:48 +0100 Subject: [PATCH 53/80] ARROW-4105: [Rust] Add rust-toolchain to enforce user to use nightly toolchain for building The Rust binding needs to be built by nightly toolchain so if we supply rust-toolchain file, user can build without changing the toolchain explicitly. Author: Kousuke Saruta Closes #3247 from sarutak/add-rust-toolchain and squashes the following commits: 6ab619b8 Add rust-toolchain to rat_exclude_files.txt c3fb2aba Add rust-toolchain to enforce to use nightly toolchain for building --- dev/release/rat_exclude_files.txt | 1 + rust/rust-toolchain | 1 + 2 files changed, 2 insertions(+) create mode 100644 rust/rust-toolchain diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 66d62c6257570..bcb474b79b060 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -179,3 +179,4 @@ r/README.Rmd r/man/*.Rd .gitattributes rust/test/data/*.csv +rust/rust-toolchain diff --git a/rust/rust-toolchain b/rust/rust-toolchain new file mode 100644 index 0000000000000..07ade694b1a3c --- /dev/null +++ b/rust/rust-toolchain @@ -0,0 +1 @@ +nightly \ No newline at end of file From b23cedd12f7638cf7d6c042970090e248de95f80 Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Sat, 22 Dec 2018 21:56:16 +0100 Subject: [PATCH 54/80] ARROW-4075: [Rust] Reuse array builder after calling finish() Currently a buffer/array builder is consumed after `finish()` is called. This may not be very convenient as one may want to use the same builder for multiple arrays. This changes the behavior of it to reset the builder instead. Author: Chao Sun Closes #3221 from sunchao/ARROW-4075 and squashes the following commits: 49f6c4c6 ARROW-4075: Reuse array builder after calling finish() --- rust/src/builder.rs | 172 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 137 insertions(+), 35 deletions(-) diff --git a/rust/src/builder.rs b/rust/src/builder.rs index d5d222d006fe8..a4c8666233877 100644 --- a/rust/src/builder.rs +++ b/rust/src/builder.rs @@ -60,7 +60,7 @@ pub trait BufferBuilderTrait { fn reserve(&mut self, n: usize) -> Result<()>; fn push(&mut self, v: T::Native) -> Result<()>; fn push_slice(&mut self, slice: &[T::Native]) -> Result<()>; - fn finish(self) -> Buffer; + fn finish(&mut self) -> Buffer; } impl BufferBuilderTrait for BufferBuilder { @@ -114,9 +114,11 @@ impl BufferBuilderTrait for BufferBuilder { self.write_bytes(slice.to_byte_slice(), array_slots) } - /// Consumes this builder and returns an immutable `Buffer`. - default fn finish(self) -> Buffer { - self.buffer.freeze() + /// Reset this builder and returns an immutable `Buffer`. + default fn finish(&mut self) -> Buffer { + let buf = ::std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); + self.len = 0; + buf.freeze() } } @@ -196,13 +198,15 @@ impl BufferBuilderTrait for BufferBuilder { Ok(()) } - /// Consumes this and returns an immutable `Buffer`. - fn finish(mut self) -> Buffer { + /// Reset this builder and returns an immutable `Buffer`. + fn finish(&mut self) -> Buffer { // `push` does not update the buffer's `len` so do it before `freeze` is called. 
let new_buffer_len = bit_util::ceil(self.len, 8); debug_assert!(new_buffer_len >= self.buffer.len()); - self.buffer.resize(new_buffer_len).unwrap(); - self.buffer.freeze() + let mut buf = ::std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); + self.len = 0; + buf.resize(new_buffer_len).unwrap(); + buf.freeze() } } @@ -211,15 +215,25 @@ pub trait ArrayBuilder { /// The type of array that this builder creates type ArrayType: Array; - /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific - /// implementation before calling it's `finish` method - fn into_any(self) -> Box; - /// Returns the number of array slots in the builder fn len(&self) -> usize; /// Builds the array - fn finish(self) -> Self::ArrayType; + fn finish(&mut self) -> Self::ArrayType; + + /// Returns the builder as an non-mutable `Any` reference. + /// + /// This is most useful when one wants to call non-mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_ref` to get a reference on the specific builder. + fn as_any(&self) -> &Any; + + /// Returns the builder as an mutable `Any` reference. + /// + /// This is most useful when one wants to call mutable APIs on a specific builder + /// type. In this case, one can first cast this into a `Any`, and then use + /// `downcast_mut` to get a reference on the specific builder. + fn as_any_mut(&mut self) -> &mut Any; } /// Array builder for fixed-width primitive types @@ -243,10 +257,14 @@ pub type Float64Builder = PrimitiveArrayBuilder; impl ArrayBuilder for PrimitiveArrayBuilder { type ArrayType = PrimitiveArray; - /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific - /// implementation before calling it's `finish` method - fn into_any(self) -> Box { - Box::new(self) + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut Any { + self } /// Returns the number of array slots in the builder @@ -254,8 +272,8 @@ impl ArrayBuilder for PrimitiveArrayBuilder { self.values_builder.len } - /// Builds the PrimitiveArray - fn finish(self) -> PrimitiveArray { + /// Builds the `PrimitiveArray` and reset this builder. + fn finish(&mut self) -> PrimitiveArray { let len = self.len(); let null_bit_buffer = self.bitmap_builder.finish(); let data = ArrayData::builder(T::get_data_type()) @@ -341,10 +359,14 @@ where { type ArrayType = ListArray; - /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific - /// implementation before calling it's `finish` method. - fn into_any(self) -> Box { - Box::new(self) + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut Any { + self } /// Returns the number of array slots in the builder @@ -352,22 +374,25 @@ where self.len } - /// Builds the `ListArray` - fn finish(self) -> ListArray { + /// Builds the `ListArray` and reset this builder. 
+ fn finish(&mut self) -> ListArray { let len = self.len(); + self.len = 0; let values_arr = self .values_builder - .into_any() - .downcast::() + .as_any_mut() + .downcast_mut::() .unwrap() .finish(); let values_data = values_arr.data(); + let offset_buffer = self.offsets_builder.finish(); let null_bit_buffer = self.bitmap_builder.finish(); + self.offsets_builder.push(0).unwrap(); let data = ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone()))) .len(len) .null_count(len - bit_util::count_set_bits(null_bit_buffer.data())) - .add_buffer(self.offsets_builder.finish()) + .add_buffer(offset_buffer) .add_child_data(values_data) .null_bit_buffer(null_bit_buffer) .build(); @@ -403,10 +428,14 @@ pub struct BinaryArrayBuilder { impl ArrayBuilder for BinaryArrayBuilder { type ArrayType = BinaryArray; - /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific - /// implementation before calling it's `finish` method. - fn into_any(self) -> Box { - Box::new(self) + /// Returns the builder as an non-mutable `Any` reference. + fn as_any(&self) -> &Any { + self + } + + /// Returns the builder as an mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut Any { + self } /// Returns the number of array slots in the builder @@ -414,8 +443,8 @@ impl ArrayBuilder for BinaryArrayBuilder { self.builder.len() } - /// Builds the `BinaryArray` - fn finish(self) -> BinaryArray { + /// Builds the `BinaryArray` and reset this builder. + fn finish(&mut self) -> BinaryArray { BinaryArray::from(self.builder.finish()) } } @@ -462,7 +491,7 @@ mod tests { #[test] fn test_builder_i32_empty() { - let b = Int32BufferBuilder::new(5); + let mut b = Int32BufferBuilder::new(5); assert_eq!(0, b.len()); assert_eq!(16, b.capacity()); let a = b.finish(); @@ -500,6 +529,27 @@ mod tests { assert_eq!(80, a.len()); } + #[test] + fn test_builder_finish() { + let mut b = Int32BufferBuilder::new(5); + assert_eq!(16, b.capacity()); + for i in 0..10 { + b.push(i).unwrap(); + } + let mut a = b.finish(); + assert_eq!(40, a.len()); + assert_eq!(0, b.len()); + assert_eq!(0, b.capacity()); + + // Try build another buffer after cleaning up. 
+ for i in 0..20 { + b.push(i).unwrap() + } + assert_eq!(32, b.capacity()); + a = b.finish(); + assert_eq!(80, a.len()); + } + #[test] fn test_reserve() { let mut b = UInt8BufferBuilder::new(2); @@ -702,6 +752,20 @@ mod tests { } } + #[test] + fn test_primitive_array_builder_finish() { + let mut builder = Int32Builder::new(5); + builder.push_slice(&[2, 4, 6, 8]).unwrap(); + let mut arr = builder.finish(); + assert_eq!(4, arr.len()); + assert_eq!(0, builder.len()); + + builder.push_slice(&[1, 3, 5, 7, 9]).unwrap(); + arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } + #[test] fn test_list_array_builder() { let values_builder = Int32Builder::new(10); @@ -768,6 +832,27 @@ mod tests { assert_eq!(3, list_array.value_length(2)); } + #[test] + fn test_list_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListArrayBuilder::new(values_builder); + + builder.values().push_slice(&[1, 2, 3]).unwrap(); + builder.append(true).unwrap(); + builder.values().push_slice(&[4, 5, 6]).unwrap(); + builder.append(true).unwrap(); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert_eq!(0, builder.len()); + + builder.values().push_slice(&[7, 8, 9]).unwrap(); + builder.append(true).unwrap(); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert_eq!(0, builder.len()); + } + #[test] fn test_list_list_array_builder() { let primitive_builder = Int32Builder::new(10); @@ -857,6 +942,23 @@ mod tests { assert_eq!(5, binary_array.value_length(2)); } + #[test] + fn test_binary_array_builder_finish() { + let mut builder = BinaryArrayBuilder::new(10); + + builder.push_string("hello").unwrap(); + builder.push_string("world").unwrap(); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert_eq!(0, builder.len()); + + builder.push_string("arrow").unwrap(); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert_eq!(0, builder.len()); + } + #[test] fn test_binary_array_builder_push_string() { let mut builder = BinaryArrayBuilder::new(20); From ddc5e9a721451d8492dfdf797402b2ab7e5e3845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 22 Dec 2018 23:23:08 +0100 Subject: [PATCH 55/80] ARROW-4106: [Python] Tests fail to run because hypothesis update broke its API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Closes #3250 from kszucs/ARROW-4106 and squashes the following commits: d87cc14c don't use defines_strategy --- python/pyarrow/tests/strategies.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index bc8ded2e896d0..c95b75b270e56 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -89,43 +89,35 @@ metadata = st.dictionaries(st.text(), st.text()) -@st.defines_strategy def fields(type_strategy=primitive_types): return st.builds(pa.field, name=custom_text, type=type_strategy, nullable=st.booleans(), metadata=metadata) -@st.defines_strategy def list_types(item_strategy=primitive_types): return st.builds(pa.list_, item_strategy) -@st.defines_strategy def struct_types(item_strategy=primitive_types): return st.builds(pa.struct, st.lists(fields(item_strategy))) -@st.defines_strategy def complex_types(inner_strategy=primitive_types): return list_types(inner_strategy) | struct_types(inner_strategy) -@st.defines_strategy def nested_list_types(item_strategy=primitive_types): return 
st.recursive(item_strategy, list_types) -@st.defines_strategy def nested_struct_types(item_strategy=primitive_types): return st.recursive(item_strategy, struct_types) -@st.defines_strategy def nested_complex_types(inner_strategy=primitive_types): return st.recursive(inner_strategy, complex_types) -@st.defines_strategy def schemas(type_strategy=primitive_types): return st.builds(pa.schema, st.lists(fields(type_strategy))) From ffc8877aa6c2d80418cb805076fc0545e6b0204c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 23 Dec 2018 00:43:38 +0100 Subject: [PATCH 56/80] ARROW-4101: [C++] Identity BinaryType cast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Author: François Saint-Jacques Closes #3245 from fsaintjacques/ARROW-4101-cast-identity and squashes the following commits: 4bb2fb7b parametrize 4319bace ARROW-4101: Identity BinaryType cast --- cpp/src/arrow/compute/kernels/cast.cc | 4 +++- python/pyarrow/tests/test_array.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 2ce0702f20c32..7976ef0beffc6 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -1258,7 +1258,9 @@ class CastKernel : public UnaryKernel { FN(TimestampType, Date64Type); \ FN(TimestampType, Int64Type); -#define BINARY_CASES(FN, IN_TYPE) FN(BinaryType, StringType); +#define BINARY_CASES(FN, IN_TYPE) \ + FN(BinaryType, BinaryType); \ + FN(BinaryType, StringType); #define STRING_CASES(FN, IN_TYPE) \ FN(StringType, StringType); \ diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 352c8558c881b..3d3402139cb43 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -755,6 +755,26 @@ def test_cast_date64_to_int(): assert result.equals(expected) +@pytest.mark.parametrize(('ty', 'values'), [ + ('bool', [True, False, True]), + ('uint8', range(0, 255)), + ('int8', range(0, 128)), + ('uint16', range(0, 10)), + ('int16', range(0, 10)), + ('uint32', range(0, 10)), + ('int32', range(0, 10)), + ('uint64', range(0, 10)), + ('int64', range(0, 10)), + ('float', [0.0, 0.1, 0.2]), + ('double', [0.0, 0.1, 0.2]), + ('string', ['a', 'b', 'c']), + ('binary', [b'a', b'b', b'c']) +]) +def test_cast_identities(ty, values): + arr = pa.array(values, type=ty) + assert arr.cast(ty).equals(arr) + + pickle_test_parametrize = pytest.mark.parametrize( ('data', 'typ'), [ From e179dda432e1f67020a0c832a11fc496eec67e7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 23 Dec 2018 00:59:06 +0100 Subject: [PATCH 57/80] ARROW-4098: [Python] Deprecate open_file/open_stream top level APIs in favor of using ipc namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will mean some user code will have to change (e.g. https://github.com/apache/spark/blob/8edae94fa7ec1a1cc2c69e0924da0da85d4aac83/python/pyspark/serializers.py#L240) but it is the most maintainable option for the long term. 
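For illustration, a minimal sketch of the migration downstream callers would make (assuming `pa` is the imported pyarrow module and `buf` is an existing readable source; these names are placeholders, not part of the patch):

    # before: top-level API, still works but warns that it is deprecated
    reader = pa.open_stream(buf)
    # after: ipc-namespaced API
    reader = pa.ipc.open_stream(buf)

The same substitution applies to pa.open_file -> pa.ipc.open_file.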
We should not remove the deprecated APIs until we are confident that at least our open source downstream dependencies are taken care of Author: Krisztián Szűcs Author: Wes McKinney Closes #3244 from wesm/ARROW-4098 and squashes the following commits: ec3c54be update ipc doc 9017e7ff remove accidentally committed file 36b6a861 Fix up API docs 7ed5343e Deprecate pyarrow.open_stream/open_file in favor of ipc-namespaced versions --- docs/source/python/api.rst | 4 ++-- docs/source/python/ipc.rst | 10 ++++---- python/pyarrow/__init__.py | 22 ++++++++++++++++++ python/pyarrow/tests/test_ipc.py | 40 ++++++++++++++++---------------- 4 files changed, 49 insertions(+), 27 deletions(-) diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst index 40ccb68c36f38..0bad76ff0bf63 100644 --- a/docs/source/python/api.rst +++ b/docs/source/python/api.rst @@ -259,14 +259,14 @@ Serialization and IPC .. autosummary:: :toctree: generated/ + ipc.open_file + ipc.open_stream Message MessageReader RecordBatchFileReader RecordBatchFileWriter RecordBatchStreamReader RecordBatchStreamWriter - open_file - open_stream read_message read_record_batch get_record_batch_size diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst index 3f7e787cd0c2f..812d843b0df56 100644 --- a/docs/source/python/ipc.rst +++ b/docs/source/python/ipc.rst @@ -84,11 +84,11 @@ particular stream. Now we can do: Now ``buf`` contains the complete stream as an in-memory byte buffer. We can read such a stream with :class:`~pyarrow.RecordBatchStreamReader` or the -convenience function ``pyarrow.open_stream``: +convenience function ``pyarrow.ipc.open_stream``: .. ipython:: python - reader = pa.open_stream(buf) + reader = pa.ipc.open_stream(buf) reader.schema batches = [b for b in reader] @@ -125,11 +125,11 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as The difference between :class:`~pyarrow.RecordBatchFileReader` and :class:`~pyarrow.RecordBatchStreamReader` is that the input source must have a ``seek`` method for random access. The stream reader only requires read -operations. We can also use the ``pyarrow.open_file`` method to open a file: +operations. We can also use the ``pyarrow.ipc.open_file`` method to open a file: .. ipython:: python - reader = pa.open_file(buf) + reader = pa.ipc.open_file(buf) Because we have access to the entire payload, we know the number of record batches in the file, and can read any at random: @@ -149,7 +149,7 @@ DataFrame output: .. 
ipython:: python - df = pa.open_file(buf).read_pandas() + df = pa.ipc.open_file(buf).read_pandas() df[:5] Arbitrary Object Serialization diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 7f0a371b4bfd2..0d1c1bef87a1c 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -146,6 +146,28 @@ def parse_git(root, **kwargs): open_stream, open_file, serialize_pandas, deserialize_pandas) +import pyarrow.ipc as ipc + + +def open_stream(source): + """ + pyarrow.open_stream deprecated since 0.12, use pyarrow.ipc.open_stream + """ + import warnings + warnings.warn("pyarrow.open_stream is deprecated, please use " + "pyarrow.ipc.open_stream") + return ipc.open_stream(source) + + +def open_file(source): + """ + pyarrow.open_file deprecated since 0.12, use pyarrow.ipc.open_file + """ + import warnings + warnings.warn("pyarrow.open_file is deprecated, please use " + "pyarrow.ipc.open_file") + return ipc.open_file(source) + localfs = LocalFileSystem.get_instance() diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 0fb66f8fa4d43..67a91b9ddd440 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -80,7 +80,7 @@ def _check_roundtrip(self, as_table=False): _, batches = self.write_batches(as_table=as_table) file_contents = pa.BufferReader(self.get_source()) - reader = pa.open_file(file_contents) + reader = pa.ipc.open_file(file_contents) assert reader.num_record_batches == len(batches) @@ -121,7 +121,7 @@ def stream_fixture(): def test_empty_file(): buf = b'' with pytest.raises(pa.ArrowInvalid): - pa.open_file(pa.BufferReader(buf)) + pa.ipc.open_file(pa.BufferReader(buf)) def test_file_simple_roundtrip(file_fixture): @@ -142,7 +142,7 @@ def test_file_read_all(sink_factory): _, batches = fixture.write_batches() file_contents = pa.BufferReader(fixture.get_source()) - reader = pa.open_file(file_contents) + reader = pa.ipc.open_file(file_contents) result = reader.read_all() expected = pa.Table.from_batches(batches) @@ -154,8 +154,8 @@ def test_open_file_from_buffer(file_fixture): _, batches = file_fixture.write_batches() source = file_fixture.get_source() - reader1 = pa.open_file(source) - reader2 = pa.open_file(pa.BufferReader(source)) + reader1 = pa.ipc.open_file(source) + reader2 = pa.ipc.open_file(pa.BufferReader(source)) reader3 = pa.RecordBatchFileReader(source) result1 = reader1.read_all() @@ -170,7 +170,7 @@ def test_file_read_pandas(file_fixture): frames, _ = file_fixture.write_batches() file_contents = pa.BufferReader(file_fixture.get_source()) - reader = pa.open_file(file_contents) + reader = pa.ipc.open_file(file_contents) result = reader.read_pandas() expected = pd.concat(frames) @@ -189,8 +189,8 @@ def test_file_pathlib(file_fixture, tmpdir): with open(path, 'wb') as f: f.write(source) - t1 = pa.open_file(pathlib.Path(path)).read_all() - t2 = pa.open_file(pa.OSFile(path)).read_all() + t1 = pa.ipc.open_file(pathlib.Path(path)).read_all() + t2 = pa.ipc.open_file(pa.OSFile(path)).read_all() assert t1.equals(t2) @@ -198,7 +198,7 @@ def test_file_pathlib(file_fixture, tmpdir): def test_empty_stream(): buf = io.BytesIO(b'') with pytest.raises(pa.ArrowInvalid): - pa.open_stream(buf) + pa.ipc.open_stream(buf) def test_stream_categorical_roundtrip(stream_fixture): @@ -213,7 +213,7 @@ def test_stream_categorical_roundtrip(stream_fixture): writer.write_batch(pa.RecordBatch.from_pandas(df)) writer.close() - table = (pa.open_stream(pa.BufferReader(stream_fixture.get_source())) + table = 
(pa.ipc.open_stream(pa.BufferReader(stream_fixture.get_source())) .read_all()) assert_frame_equal(table.to_pandas(), df) @@ -223,8 +223,8 @@ def test_open_stream_from_buffer(stream_fixture): _, batches = stream_fixture.write_batches() source = stream_fixture.get_source() - reader1 = pa.open_stream(source) - reader2 = pa.open_stream(pa.BufferReader(source)) + reader1 = pa.ipc.open_stream(source) + reader2 = pa.ipc.open_stream(pa.BufferReader(source)) reader3 = pa.RecordBatchStreamReader(source) result1 = reader1.read_all() @@ -250,7 +250,7 @@ def test_stream_write_dispatch(stream_fixture): writer.write(batch) writer.close() - table = (pa.open_stream(pa.BufferReader(stream_fixture.get_source())) + table = (pa.ipc.open_stream(pa.BufferReader(stream_fixture.get_source())) .read_all()) assert_frame_equal(table.to_pandas(), pd.concat([df, df], ignore_index=True)) @@ -271,7 +271,7 @@ def test_stream_write_table_batches(stream_fixture): writer.write_table(table, chunksize=15) writer.close() - batches = list(pa.open_stream(stream_fixture.get_source())) + batches = list(pa.ipc.open_stream(stream_fixture.get_source())) assert list(map(len, batches)) == [10, 15, 5, 10] result_table = pa.Table.from_batches(batches) @@ -283,7 +283,7 @@ def test_stream_write_table_batches(stream_fixture): def test_stream_simple_roundtrip(stream_fixture): _, batches = stream_fixture.write_batches() file_contents = pa.BufferReader(stream_fixture.get_source()) - reader = pa.open_stream(file_contents) + reader = pa.ipc.open_stream(file_contents) assert reader.schema.equals(batches[0].schema) @@ -301,7 +301,7 @@ def test_stream_simple_roundtrip(stream_fixture): def test_stream_read_all(stream_fixture): _, batches = stream_fixture.write_batches() file_contents = pa.BufferReader(stream_fixture.get_source()) - reader = pa.open_stream(file_contents) + reader = pa.ipc.open_stream(file_contents) result = reader.read_all() expected = pa.Table.from_batches(batches) @@ -311,7 +311,7 @@ def test_stream_read_all(stream_fixture): def test_stream_read_pandas(stream_fixture): frames, _ = stream_fixture.write_batches() file_contents = stream_fixture.get_source() - reader = pa.open_stream(file_contents) + reader = pa.ipc.open_stream(file_contents) result = reader.read_pandas() expected = pd.concat(frames) @@ -393,7 +393,7 @@ def run(self): connection, client_address = self._sock.accept() try: source = connection.makefile(mode='rb') - reader = pa.open_stream(source) + reader = pa.ipc.open_stream(source) self._schema = reader.schema if self._do_read_all: self._table = reader.read_all() @@ -494,7 +494,7 @@ def test_ipc_stream_no_batches(): writer.close() source = sink.getvalue() - reader = pa.open_stream(source) + reader = pa.ipc.open_stream(source) result = reader.read_all() assert result.schema.equals(table.schema) @@ -636,7 +636,7 @@ def write_file(batch, sink): def read_file(source): - reader = pa.open_file(source) + reader = pa.ipc.open_file(source) return [reader.get_batch(i) for i in range(reader.num_record_batches)] From 6578089472958b20126d5c56fe8f8737b02b5544 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 23 Dec 2018 01:37:13 +0100 Subject: [PATCH 58/80] ARROW-2592: [Python] Add "ignore_metadata" option to Table.to_pandas This option circumvents the index reconstruction logic if there is `'pandas'` metadata. This can also be achieved using `table.cast(table.schema.remove_metadata()).to_pandas()`, but this makes it more obvious and discoverable to users. 
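For illustration, a minimal sketch of both spellings (assuming `table` is an existing pyarrow.Table; the name is a placeholder, not part of the patch):

    # existing workaround: strip the schema metadata before converting
    df = table.cast(table.schema.remove_metadata()).to_pandas()
    # with this change: skip the 'pandas' metadata reconstruction explicitly
    df = table.to_pandas(ignore_metadata=True)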
A user had an issue reading a Parquet file with some old metadata that we are no longer able to correctly process. Author: Wes McKinney Closes #3239 from wesm/ARROW-2592 and squashes the following commits: 82ac7a01 Unit test for ignore_metadata option 6c4246ef Test stub 8cf45a7a Add ignore_metadata option to Table.to_pandas --- python/pyarrow/pandas_compat.py | 6 ++++-- python/pyarrow/table.pxi | 16 ++++++++++++---- python/pyarrow/tests/test_convert_pandas.py | 11 +++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 0eebcf6e1eec3..6acca0c35cf40 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -548,7 +548,8 @@ def _make_datetimetz(tz): # Converting pyarrow.Table efficiently to pandas.DataFrame -def table_to_blockmanager(options, table, memory_pool, categories=None): +def table_to_blockmanager(options, table, memory_pool, categories=None, + ignore_metadata=False): from pyarrow.compat import DatetimeTZDtype index_columns = [] @@ -560,7 +561,8 @@ def table_to_blockmanager(options, table, memory_pool, categories=None): row_count = table.num_rows metadata = schema.metadata - has_pandas_metadata = metadata is not None and b'pandas' in metadata + has_pandas_metadata = (not ignore_metadata and metadata is not None + and b'pandas' in metadata) if has_pandas_metadata: pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8')) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 4d52f26e749fc..29a784d60f5a8 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -890,7 +890,7 @@ cdef class RecordBatch: def to_pandas(self, MemoryPool memory_pool=None, categories=None, bint strings_to_categorical=False, bint zero_copy_only=False, bint integer_object_nulls=False, bint date_as_object=False, - bint use_threads=True): + bint use_threads=True, bint ignore_metadata=False): """ Convert the arrow::RecordBatch to a pandas DataFrame @@ -911,6 +911,9 @@ cdef class RecordBatch: Cast dates to objects use_threads: boolean, default True Whether to parallelize the conversion using multiple threads + ignore_metadata : boolean, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present Returns ------- @@ -921,7 +924,8 @@ cdef class RecordBatch: strings_to_categorical=strings_to_categorical, zero_copy_only=zero_copy_only, integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, use_threads=use_threads + date_as_object=date_as_object, use_threads=use_threads, + ignore_metadata=ignore_metadata ) @classmethod @@ -1385,7 +1389,7 @@ cdef class Table: def to_pandas(self, MemoryPool memory_pool=None, categories=None, bint strings_to_categorical=False, bint zero_copy_only=False, bint integer_object_nulls=False, bint date_as_object=False, - bint use_threads=True): + bint use_threads=True, bint ignore_metadata=False): """ Convert the arrow::Table to a pandas DataFrame @@ -1406,6 +1410,9 @@ cdef class Table: Cast dates to objects use_threads: boolean, default True Whether to parallelize the conversion using multiple threads + ignore_metadata : boolean, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present Returns ------- @@ -1422,7 +1429,8 @@ cdef class Table: use_threads=use_threads) mgr = pdcompat.table_to_blockmanager(options, self, memory_pool, - categories) + categories, + ignore_metadata=ignore_metadata) return pd.DataFrame(mgr) def 
to_pydict(self): diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 41bcae83db516..12214847f3e53 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -376,6 +376,17 @@ def test_metadata_with_mixed_types(self): assert data_column['pandas_type'] == 'bytes' assert data_column['numpy_type'] == 'object' + def test_ignore_metadata(self): + df = pd.DataFrame({'a': [1, 2, 3], 'b': ['foo', 'bar', 'baz']}, + index=['one', 'two', 'three']) + table = pa.Table.from_pandas(df) + + result = table.to_pandas(ignore_metadata=True) + expected = (table.cast(table.schema.remove_metadata()) + .to_pandas()) + + assert result.equals(expected) + def test_list_metadata(self): df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]}) schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))]) From 79d8bf2de3c4d7f6e17d6bea5d5d477310e58668 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 23 Dec 2018 16:16:46 +0100 Subject: [PATCH 59/80] =?UTF-8?q?ARROW-4107:=20[Python]=C2=A0Use=20ninja?= =?UTF-8?q?=20in=20pyarrow=20manylinux1=20build?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Uwe L. Korn Closes #3253 from xhochy/ARROW-4107 and squashes the following commits: 6ed02454 ARROW-4107:  Use ninja in pyarrow manylinux1 build --- python/manylinux1/build_arrow.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index b1d8f8588dfc5..902bcb3eff360 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -35,6 +35,7 @@ cd /arrow/python # PyArrow build configuration export PYARROW_BUILD_TYPE='release' +export PYARROW_CMAKE_GENERATOR='Ninja' export PYARROW_WITH_ORC=1 export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PLASMA=1 From 6b798875c0e5a328e007f7ce634a8b4ce50eb553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 24 Dec 2018 10:12:30 -0600 Subject: [PATCH 60/80] ARROW-4109: [Packaging] Missing glog dependency from arrow-cpp conda recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up of https://github.com/apache/arrow/pull/3234 Crossbow builds: [kszucs/crossbow/build-386](https://github.com/kszucs/crossbow/branches/all?utf8=%E2%9C%93&query=build-386) Author: Krisztián Szűcs Closes #3255 from kszucs/conda_recipe_glogs and squashes the following commits: ed110abb6 add glog to arrow-cpp --- dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index 725fd2291e75a..129136e2580ea 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -44,6 +44,7 @@ requirements: - rapidjson - zlib - glog + - gflags - snappy - brotli - zstd From 385c4384eb0dcc384b443f24765c64e9d6d88d28 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Mon, 24 Dec 2018 10:26:37 -0600 Subject: [PATCH 61/80] ARROW-3938: [Packaging] Stop to refer java/pom.xml to get version information Author: Kouhei Sutou Closes #3259 from kou/stop-to-refer-pom-xml-for-version and squashes the following commits: 3dce0a035 Stop to refer java/pom.xml to get version information --- c_glib/Makefile.am | 3 +- c_glib/configure.ac | 8 +- c_glib/doc/parquet-glib/parquet-glib-docs.xml | 4 + c_glib/meson.build | 3 +- c_glib/test/test-cuda.rb | 2 +- 
cpp/CMakeLists.txt | 8 +- cpp/src/gandiva/CMakeLists.txt | 2 + cpp/src/plasma/CMakeLists.txt | 1 + dev/release/00-prepare.sh | 92 ++++++++++++++++--- matlab/CMakeLists.txt | 6 +- python/setup.py | 30 +----- r/DESCRIPTION | 2 +- ruby/red-arrow-cuda/.gitignore | 2 - .../red-arrow-cuda/lib/arrow-cuda/version.rb | 19 ++-- ruby/red-arrow-cuda/red-arrow-cuda.gemspec | 6 +- ruby/red-arrow-cuda/test/helper.rb | 3 - ruby/red-arrow-cuda/version.rb | 71 -------------- ruby/red-arrow/.gitignore | 2 - .../red-arrow/lib/arrow/version.rb | 11 ++- ruby/red-arrow/red-arrow.gemspec | 6 +- ruby/red-arrow/test/helper.rb | 2 - ruby/red-arrow/version.rb | 71 -------------- ruby/red-gandiva/.gitignore | 2 - ruby/red-gandiva/lib/gandiva/version.rb | 26 ++++++ ruby/red-gandiva/red-gandiva.gemspec | 6 +- ruby/red-gandiva/test/helper.rb | 3 - ruby/red-gandiva/version.rb | 71 -------------- ruby/red-parquet/.gitignore | 2 - ruby/red-parquet/lib/parquet/version.rb | 26 ++++++ ruby/red-parquet/red-parquet.gemspec | 6 +- ruby/red-parquet/test/helper.rb | 3 - ruby/red-parquet/version.rb | 71 -------------- ruby/red-plasma/.gitignore | 2 - ruby/red-plasma/lib/plasma/version.rb | 26 ++++++ ruby/red-plasma/red-plasma.gemspec | 6 +- ruby/red-plasma/test/helper.rb | 3 - ruby/red-plasma/version.rb | 71 -------------- 37 files changed, 212 insertions(+), 466 deletions(-) rename c_glib/tool/get-version.py => ruby/red-arrow-cuda/lib/arrow-cuda/version.rb (69%) mode change 100755 => 100644 delete mode 100644 ruby/red-arrow-cuda/version.rb rename c_glib/tool/Makefile.am => ruby/red-arrow/lib/arrow/version.rb (80%) delete mode 100644 ruby/red-arrow/version.rb create mode 100644 ruby/red-gandiva/lib/gandiva/version.rb delete mode 100644 ruby/red-gandiva/version.rb create mode 100644 ruby/red-parquet/lib/parquet/version.rb delete mode 100644 ruby/red-parquet/version.rb create mode 100644 ruby/red-plasma/lib/plasma/version.rb delete mode 100644 ruby/red-plasma/version.rb diff --git a/c_glib/Makefile.am b/c_glib/Makefile.am index 149894c8241c2..53bb57e411b0c 100644 --- a/c_glib/Makefile.am +++ b/c_glib/Makefile.am @@ -24,8 +24,7 @@ SUBDIRS = \ parquet-glib \ plasma-glib \ doc \ - example \ - tool + example EXTRA_DIST = \ Gemfile \ diff --git a/c_glib/configure.ac b/c_glib/configure.ac index a6d8ed8e1d185..c63bfffa1d7f8 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -17,12 +17,7 @@ AC_PREREQ(2.65) -m4_define([arrow_glib_version], - m4_esyscmd(grep "^ " "$(dirname $0)/../java/pom.xml" | \ - sed -E \ - -e 's/(^ )//g' \ - -e 's/(<\/version>$)//g' | \ - tr -d '\n')) +m4_define([arrow_glib_version], 0.12.0-SNAPSHOT) AC_INIT([arrow-glib], arrow_glib_version, [https://issues.apache.org/jira/browse/ARROW], @@ -283,7 +278,6 @@ AC_CONFIG_FILES([ doc/plasma-glib/entities.xml example/Makefile example/lua/Makefile - tool/Makefile ]) AC_OUTPUT diff --git a/c_glib/doc/parquet-glib/parquet-glib-docs.xml b/c_glib/doc/parquet-glib/parquet-glib-docs.xml index 0f2c30ba7863f..4485a6765cb6b 100644 --- a/c_glib/doc/parquet-glib/parquet-glib-docs.xml +++ b/c_glib/doc/parquet-glib/parquet-glib-docs.xml @@ -57,6 +57,10 @@ Index of deprecated API + + Index of new symbols in 0.12.0 + + Index of new symbols in 0.11.0 diff --git a/c_glib/meson.build b/c_glib/meson.build index 194421c13d316..c2cf36c5d7c02 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -23,8 +23,7 @@ project('arrow-glib', 'c', 'cpp', 'cpp_std=c++11', ]) -python = find_program('python', 'python3', 'python2') -version = run_command(python, 
'tool/get-version.py').stdout().strip() +version = '0.12.0-SNAPSHOT' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/c_glib/test/test-cuda.rb b/c_glib/test/test-cuda.rb index 32d486ef8ba97..ae915307b70f0 100644 --- a/c_glib/test/test-cuda.rb +++ b/c_glib/test/test-cuda.rb @@ -58,7 +58,7 @@ def test_export Arrow = GI.load("Arrow") ArrowCUDA = GI.load("ArrowCUDA") -manager = ArrowCUDA::ADeviceManager.new +manager = ArrowCUDA::DeviceManager.new context = manager.get_context(0) serialized_handle = #{serialized_handle.to_s.dump} handle = ArrowCUDA::IPCMemoryHandle.new(serialized_handle) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1672245924fb5..006b406ba0762 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -18,12 +18,8 @@ cmake_minimum_required(VERSION 3.2) message(STATUS "Building using CMake version: ${CMAKE_VERSION}") -# Extract Arrow version number -file(READ "${CMAKE_CURRENT_SOURCE_DIR}/../java/pom.xml" POM_XML) -string(REGEX MATCHALL - "\n [^<]+" ARROW_VERSION_TAG "${POM_XML}") -string(REGEX REPLACE - "(\n |)" "" ARROW_VERSION "${ARROW_VERSION_TAG}") +set(ARROW_VERSION "0.12.0-SNAPSHOT") + string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index d28c372a9e6ab..b574c67af3811 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +set(GANDIVA_VERSION "${ARROW_VERSION}") + # For "make gandiva" to build everything Gandiva-related add_custom_target(gandiva-all) add_custom_target(gandiva) diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index a71acf8ae43d8..2be5740bdd670 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -23,6 +23,7 @@ add_dependencies(plasma-all plasma plasma-tests plasma-benchmarks) # For the moment, Plasma is versioned like Arrow project(plasma VERSION "${ARROW_BASE_VERSION}") +set(PLASMA_VERSION "${ARROW_VERSION}") find_package(Threads) diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 5ff4ddc8f28a6..35d1998496fe0 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -21,6 +21,78 @@ set -e SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +update_versions() { + local version=$1 + + cd "${SOURCE_DIR}/../../cpp" + sed -i.bak -r -e \ + "s/^set\(ARROW_VERSION \".+\"\)/set(ARROW_VERSION \"${version}\")/" \ + CMakeLists.txt + rm -f CMakeLists.txt.bak + git add CMakeLists.txt + cd - + + cd "${SOURCE_DIR}/../../c_glib" + sed -i.bak -r -e \ + "s/^m4_define\(\[arrow_glib_version\], .+\)/m4_define([arrow_glib_version], ${version})/" \ + configure.ac + sed -i.bak -r -e \ + "s/^version = '.+'/version = '${version}'/" \ + meson.build + rm -f configure.ac.bak meson.build.bak + git add configure.ac meson.build + cd - + + # We can enable this when Arrow JS uses the same version. 
+ # cd "${SOURCE_DIR}/../../js" + # sed -i.bak -r -e \ + # "s/^ \"version\": \".+\"/ \"version\": \"${version}\"/" \ + # package.json + # rm -f package.json + # git add package.json + # cd - + + cd "${SOURCE_DIR}/../../matlab" + sed -i.bak -r -e \ + "s/^set\(MLARROW_VERSION \".+\"\)/set(MLARROW_VERSION \"${version}\")/" \ + CMakeLists.txt + rm -f CMakeLists.txt.bak + git add CMakeLists.txt + cd - + + cd "${SOURCE_DIR}/../../python" + sed -i.bak -r -e \ + "s/^default_version: '.+'/default_version = '${version}'/" \ + setup.py + rm -f setup.py.bak + git add setup.py + cd - + + cd "${SOURCE_DIR}/../../r" + sed -i.bak -r -e \ + "s/^Version: .+/Version: ${version}/" \ + DESCRIPTION + rm -f DESCRIPTION.bak + git add DESCRIPTION + cd - + + cd "${SOURCE_DIR}/../../ruby" + sed -i.bak -r -e \ + "s/^ VERSION = \".+\"/ VERSION = \"${version}\"/g" \ + */*/*/version.rb + rm -f */*/*/version.rb.bak + git add */*/*/version.rb + cd - + + cd "${SOURCE_DIR}/../../rust" + sed -i.bak -r -e \ + "s/^version = \".+\"/version = \"${version}\"/g" \ + Cargo.toml + rm -f Cargo.toml.bak + git add Cargo.toml + cd - +} + if [ "$#" -eq 2 ]; then version=$1 nextVersion=$2 @@ -43,14 +115,19 @@ if [ "$#" -eq 2 ]; then echo "prepare release ${version} on tag ${tag} then reset to version ${nextVersionSNAPSHOT}" - cd "${SOURCE_DIR}/../../java" + update_versions "${version}" + git commit -m "[Release] Update versions for ${version}" + cd "${SOURCE_DIR}/../../java" mvn release:clean mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${nextVersionSNAPSHOT} - cd - - echo "Updating .deb package names for $nextVersion" + echo "Updating versions for ${nextVersionSNAPSHOT}" + update_versions "${nextVersionSNAPSHOT}" + git commit -m "[Release] Update versions for ${nextVersionSNAPSHOT}" + + echo "Updating .deb package names for ${nextVersion}" deb_lib_suffix=$(echo $version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') next_deb_lib_suffix=$(echo $nextVersion | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') cd $SOURCE_DIR/../tasks/linux-packages/ @@ -76,15 +153,6 @@ if [ "$#" -eq 2 ]; then git commit -m "[Release] Update .deb package names for $nextVersion" cd - - echo "prepare release ${version} in Rust crate" - - cd "${SOURCE_DIR}/../../rust" - sed -i.bak -r -e "s/version = \"$version\"/version = \"$nextVersion\"/g" Cargo.toml - rm -f Cargo.toml.bak - git add Cargo.toml - git commit -m "[Release] Update Rust Cargo.toml version for $nextVersion" - cd - - echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh" else diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index 897086637beaf..a6371d1dee4fa 100755 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -18,7 +18,11 @@ cmake_minimum_required(VERSION 3.2) set(CMAKE_CXX_STANDARD 11) -project(mlarrow) +set(MLARROW_VERSION "0.12.0-SNAPSHOT") +string(REGEX MATCH + "^[0-9]+\\.[0-9]+\\.[0-9]+" MLARROW_BASE_VERSION "${MLARROW_VERSION}") + +project(mlarrow VERSION "${MLARROW_BASE_VERSION}") # Grab CMAKE Modules from the CPP interface set(CPP_CMAKE_MODULES "${CMAKE_SOURCE_DIR}/../cpp/cmake_modules") diff --git a/python/setup.py b/python/setup.py index b8d192ddaec45..742851918c124 100755 --- a/python/setup.py +++ b/python/setup.py @@ -483,39 +483,15 @@ def _move_shared_libs_unix(build_prefix, build_lib, lib_name): # If the event of not running from a git clone (e.g. 
from a git archive # or a Python sdist), see if we can set the version number ourselves +default_version = '0.12.0-SNAPSHOT' if (not os.path.exists('../.git') and not os.environ.get('SETUPTOOLS_SCM_PRETEND_VERSION')): if os.path.exists('PKG-INFO'): # We're probably in a Python sdist, setuptools_scm will handle fine pass - elif os.path.exists('../java/pom.xml'): - # We're probably in a git archive - import xml.etree.ElementTree as ET - tree = ET.parse('../java/pom.xml') - version_tag = list(tree.getroot().findall( - '{http://maven.apache.org/POM/4.0.0}version'))[0] - use_setuptools_scm = False - os.environ['SETUPTOOLS_SCM_PRETEND_VERSION'] = \ - version_tag.text.replace("-SNAPSHOT", "a0") else: - raise RuntimeError("""\ - No reliable source available to get Arrow version. - - This is either because you copied the python/ directory yourself - outside of a git clone or source archive, or because you ran - `pip install` on the python/ directory. - - * Recommended workaround: first run `python sdist`, then - `pip install` the resulting source distribution. - - * If you're looking for an editable (in-place) install, - `python setup.py develop` should work fine in place of - `pip install -e .`. - - * If you really want to `pip install` the python/ directory, - set the SETUPTOOLS_SCM_PRETEND_VERSION environment variable - to force the Arrow version to the given value. - """) + os.environ['SETUPTOOLS_SCM_PRETEND_VERSION'] = \ + default_version.replace('-SNAPSHOT', 'a0') def parse_git(root, **kwargs): diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 5f93c83f236eb..10c28c3e7c42e 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: R Integration to 'Apache' 'Arrow' -Version: 0.0.0.9000 +Version: 0.12.0-SNAPSHOT Authors@R: c( person("Romain", "François", email = "romain@rstudio.com", role = c("aut", "cre")), person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")), diff --git a/ruby/red-arrow-cuda/.gitignore b/ruby/red-arrow-cuda/.gitignore index 3ec5511596306..779545d9026f1 100644 --- a/ruby/red-arrow-cuda/.gitignore +++ b/ruby/red-arrow-cuda/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/arrow-cuda/version.rb - /pkg/ diff --git a/c_glib/tool/get-version.py b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb old mode 100755 new mode 100644 similarity index 69% rename from c_glib/tool/get-version.py rename to ruby/red-arrow-cuda/lib/arrow-cuda/version.rb index aacea6da3e865..6426d2db7a471 --- a/c_glib/tool/get-version.py +++ b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb @@ -1,5 +1,3 @@ -#!/usr/bin/env python -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,13 +15,12 @@ # specific language governing permissions and limitations # under the License. 
-import os -import re +module ArrowCUDA + VERSION = "0.12.0-SNAPSHOT" -root = os.environ.get("MESON_SOURCE_ROOT", ".") -pom_xml = os.path.join(root, "..", "java", "pom.xml") -with open(pom_xml) as pom: - version_tag = re.search('^ (.+)', - pom.read(), - re.MULTILINE) - print(version_tag.group(1)) + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-arrow-cuda/red-arrow-cuda.gemspec b/ruby/red-arrow-cuda/red-arrow-cuda.gemspec index b2ee982945605..0c593ff37aa3a 100644 --- a/ruby/red-arrow-cuda/red-arrow-cuda.gemspec +++ b/ruby/red-arrow-cuda/red-arrow-cuda.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -require_relative "version" +require_relative "lib/arrow-cuda/version" Gem::Specification.new do |spec| spec.name = "red-arrow-cuda" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| ArrowCUDA::Version::MAJOR.to_s, ArrowCUDA::Version::MINOR.to_s, ArrowCUDA::Version::MICRO.to_s, - # "beta1", + ArrowCUDA::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-arrow-cuda/test/helper.rb b/ruby/red-arrow-cuda/test/helper.rb index 4d018332677ec..045eb10eea5d0 100644 --- a/ruby/red-arrow-cuda/test/helper.rb +++ b/ruby/red-arrow-cuda/test/helper.rb @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -require_relative "../../red-arrow/version" -require_relative "../version" - require "arrow-cuda" require "test-unit" diff --git a/ruby/red-arrow-cuda/version.rb b/ruby/red-arrow-cuda/version.rb deleted file mode 100644 index c8bbbc7165f29..0000000000000 --- a/ruby/red-arrow-cuda/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "arrow-cuda", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? 
and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module ArrowCUDA - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/arrow-cuda/version" diff --git a/ruby/red-arrow/.gitignore b/ruby/red-arrow/.gitignore index 9fcc9cdc16527..779545d9026f1 100644 --- a/ruby/red-arrow/.gitignore +++ b/ruby/red-arrow/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/arrow/version.rb - /pkg/ diff --git a/c_glib/tool/Makefile.am b/ruby/red-arrow/lib/arrow/version.rb similarity index 80% rename from c_glib/tool/Makefile.am rename to ruby/red-arrow/lib/arrow/version.rb index 5d7498b957520..8ff0779f0851f 100644 --- a/c_glib/tool/Makefile.am +++ b/ruby/red-arrow/lib/arrow/version.rb @@ -15,5 +15,12 @@ # specific language governing permissions and limitations # under the License. -EXTRA_DIST = \ - get-version.py +module Arrow + VERSION = "0.12.0-SNAPSHOT" + + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec index 3f0f68aa332cf..9db755fc67ccc 100644 --- a/ruby/red-arrow/red-arrow.gemspec +++ b/ruby/red-arrow/red-arrow.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -require_relative "version" +require_relative "lib/arrow/version" Gem::Specification.new do |spec| spec.name = "red-arrow" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| Arrow::Version::MAJOR.to_s, Arrow::Version::MINOR.to_s, Arrow::Version::MICRO.to_s, - # "beta1", + Arrow::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-arrow/test/helper.rb b/ruby/red-arrow/test/helper.rb index 2aa868bfa7c01..12f12d3a192e9 100644 --- a/ruby/red-arrow/test/helper.rb +++ b/ruby/red-arrow/test/helper.rb @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-require_relative "../version" - require "arrow" require "pathname" diff --git a/ruby/red-arrow/version.rb b/ruby/red-arrow/version.rb deleted file mode 100644 index e8f043f897d1f..0000000000000 --- a/ruby/red-arrow/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "arrow", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Arrow - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/arrow/version" diff --git a/ruby/red-gandiva/.gitignore b/ruby/red-gandiva/.gitignore index 99c64a5d3dd52..779545d9026f1 100644 --- a/ruby/red-gandiva/.gitignore +++ b/ruby/red-gandiva/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/gandiva/version.rb - /pkg/ diff --git a/ruby/red-gandiva/lib/gandiva/version.rb b/ruby/red-gandiva/lib/gandiva/version.rb new file mode 100644 index 0000000000000..dbdaf36857bd8 --- /dev/null +++ b/ruby/red-gandiva/lib/gandiva/version.rb @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Gandiva + VERSION = "0.12.0-SNAPSHOT" + + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-gandiva/red-gandiva.gemspec b/ruby/red-gandiva/red-gandiva.gemspec index 7f84faf2ec035..857559e021183 100644 --- a/ruby/red-gandiva/red-gandiva.gemspec +++ b/ruby/red-gandiva/red-gandiva.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -require_relative "version" +require_relative "lib/gandiva/version" Gem::Specification.new do |spec| spec.name = "red-gandiva" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| Gandiva::Version::MAJOR.to_s, Gandiva::Version::MINOR.to_s, Gandiva::Version::MICRO.to_s, - # "beta1", + Gandiva::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-gandiva/test/helper.rb b/ruby/red-gandiva/test/helper.rb index 2f4e7dc46b1e3..9c291f7aebf42 100644 --- a/ruby/red-gandiva/test/helper.rb +++ b/ruby/red-gandiva/test/helper.rb @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -require_relative "../../red-arrow/version" -require_relative "../version" - require "gandiva" require "test-unit" diff --git a/ruby/red-gandiva/version.rb b/ruby/red-gandiva/version.rb deleted file mode 100644 index ba769796accad..0000000000000 --- a/ruby/red-gandiva/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "gandiva", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? 
and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Gandiva - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/gandiva/version" diff --git a/ruby/red-parquet/.gitignore b/ruby/red-parquet/.gitignore index 542f54c56a5ca..779545d9026f1 100644 --- a/ruby/red-parquet/.gitignore +++ b/ruby/red-parquet/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/parquet/version.rb - /pkg/ diff --git a/ruby/red-parquet/lib/parquet/version.rb b/ruby/red-parquet/lib/parquet/version.rb new file mode 100644 index 0000000000000..997a92e4c321d --- /dev/null +++ b/ruby/red-parquet/lib/parquet/version.rb @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Parquet + VERSION = "0.12.0-SNAPSHOT" + + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-parquet/red-parquet.gemspec b/ruby/red-parquet/red-parquet.gemspec index 491648b7af97f..7688dcb5708f9 100644 --- a/ruby/red-parquet/red-parquet.gemspec +++ b/ruby/red-parquet/red-parquet.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
-require_relative "version" +require_relative "lib/parquet/version" Gem::Specification.new do |spec| spec.name = "red-parquet" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| Parquet::Version::MAJOR.to_s, Parquet::Version::MINOR.to_s, Parquet::Version::MICRO.to_s, - # "beta1", + Parquet::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-parquet/test/helper.rb b/ruby/red-parquet/test/helper.rb index 43013ab5686d6..169d1df424ea7 100644 --- a/ruby/red-parquet/test/helper.rb +++ b/ruby/red-parquet/test/helper.rb @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -require_relative "../../red-arrow/version" -require_relative "../version" - require "parquet" require "tempfile" diff --git a/ruby/red-parquet/version.rb b/ruby/red-parquet/version.rb deleted file mode 100644 index 06045167e9495..0000000000000 --- a/ruby/red-parquet/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "parquet", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Parquet - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? 
tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/parquet/version" diff --git a/ruby/red-plasma/.gitignore b/ruby/red-plasma/.gitignore index bd50ff8187f6d..779545d9026f1 100644 --- a/ruby/red-plasma/.gitignore +++ b/ruby/red-plasma/.gitignore @@ -15,6 +15,4 @@ # specific language governing permissions and limitations # under the License. -/lib/plasma/version.rb - /pkg/ diff --git a/ruby/red-plasma/lib/plasma/version.rb b/ruby/red-plasma/lib/plasma/version.rb new file mode 100644 index 0000000000000..e88f2def82ec1 --- /dev/null +++ b/ruby/red-plasma/lib/plasma/version.rb @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module Plasma + VERSION = "0.12.0-SNAPSHOT" + + module Version + numbers, TAG = VERSION.split("-") + MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i) + STRING = VERSION + end +end diff --git a/ruby/red-plasma/red-plasma.gemspec b/ruby/red-plasma/red-plasma.gemspec index 53b4d1ec0dade..09b4a551ab571 100644 --- a/ruby/red-plasma/red-plasma.gemspec +++ b/ruby/red-plasma/red-plasma.gemspec @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -require_relative "version" +require_relative "lib/plasma/version" Gem::Specification.new do |spec| spec.name = "red-plasma" @@ -25,9 +25,9 @@ Gem::Specification.new do |spec| Plasma::Version::MAJOR.to_s, Plasma::Version::MINOR.to_s, Plasma::Version::MICRO.to_s, - # "beta1", + Plasma::Version::TAG, ] - spec.version = version_components.join(".") + spec.version = version_components.compact.join(".") spec.homepage = "https://arrow.apache.org/" spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] diff --git a/ruby/red-plasma/test/helper.rb b/ruby/red-plasma/test/helper.rb index d66d43ecc94c0..255cad2870044 100644 --- a/ruby/red-plasma/test/helper.rb +++ b/ruby/red-plasma/test/helper.rb @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -require_relative "../../red-arrow/version" -require_relative "../version" - require "plasma" require "tempfile" diff --git a/ruby/red-plasma/version.rb b/ruby/red-plasma/version.rb deleted file mode 100644 index 015aac9594d26..0000000000000 --- a/ruby/red-plasma/version.rb +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "pathname" - -version_rb_path = Pathname.new(__FILE__) -base_dir = version_rb_path.dirname -pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "plasma", "version.rb") - -need_update = false -if not lib_version_rb_path.exist? - need_update = true -elsif version_rb_path.mtime > lib_version_rb_path.mtime - need_update = true -elsif pom_xml_path.exist? and pom_xml_path.mtime > lib_version_rb_path.mtime - need_update = true -end - -if need_update - version = pom_xml_path.read.scan(/^ (.+?)<\/version>/)[0][0] - major, minor, micro, tag = version.split(/[.-]/) - lib_version_rb_path.open("w") do |lib_version_rb| - lib_version_rb.puts(<<-RUBY) -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Plasma - module Version - MAJOR = #{major} - MINOR = #{minor} - MICRO = #{micro} - TAG = #{tag ? tag.dump : nil} - STRING = #{version.dump} - end - - VERSION = Version::STRING -end - RUBY - end -end - -require_relative "lib/plasma/version" From cd543b9756d602ebabda749c60a14d629db7a35a Mon Sep 17 00:00:00 2001 From: Tanya Schlusser Date: Mon, 24 Dec 2018 15:19:26 -0600 Subject: [PATCH 62/80] ARROW-2504: [Website] Add ApacheCon NA link Place a 234x60 link in the navbar next to the Apache Software Foundation link. 
Screenshot for full width: ![image](https://user-images.githubusercontent.com/7432951/48995065-1213f700-f10c-11e8-944f-e5d26f1bfe8b.png) Screenshot for medium width: ![image](https://user-images.githubusercontent.com/7432951/48995076-1b04c880-f10c-11e8-8abf-a3d2ef204596.png) Disappears in small width -- screenshot: ![image](https://user-images.githubusercontent.com/7432951/48995108-31128900-f10c-11e8-96a7-066e377081fa.png) Author: Tanya Schlusser Closes #3030 from tanyaschlusser/master and squashes the following commits: 77c6cd323 ARROW-2504: Add ApacheCon NA link --- site/_includes/header.html | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/site/_includes/header.html b/site/_includes/header.html index e0f23ecd24e30..5344501acfe29 100644 --- a/site/_includes/header.html +++ b/site/_includes/header.html @@ -77,9 +77,14 @@ - - - + From cfaea429d0f2d3d9baa2a10d6da759ffd0f9d7f8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Dec 2018 16:49:22 -0600 Subject: [PATCH 63/80] PARQUET-1481: [C++] Throw exception when encountering bad Thrift metadata in RecordReader Author: Wes McKinney Closes #3242 from wesm/PARQUET-1481 and squashes the following commits: b074227ba Add test case with example corrupt data file 59400a2f1 Throw exception when encountering bad Thrift metadata in RecordReader --- .../parquet/arrow/arrow-reader-writer-test.cc | 29 ++++++++++++++----- cpp/src/parquet/arrow/record_reader.cc | 8 +++-- cpp/submodules/parquet-testing | 2 +- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 4e62a22c350ff..bb9763224f3ba 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -2291,21 +2291,34 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { ASSERT_EQ(expected, calculated); } -TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { - // PARQUET-995 +void TryReadDataFile(const std::string& testing_file_path, bool should_succeed = true) { std::string dir_string(test::get_data_dir()); std::stringstream ss; - ss << dir_string << "/" - << "alltypes_plain.parquet"; + ss << dir_string << "/" << testing_file_path; auto path = ss.str(); auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; - ASSERT_NO_THROW( - arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false)))); - std::shared_ptr<::arrow::Table> table; - ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table)); + try { + arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false))); + std::shared_ptr<::arrow::Table> table; + ASSERT_OK(arrow_reader->ReadTable(&table)); + } catch (const ParquetException& e) { + if (should_succeed) { + FAIL() << "Exception thrown when reading file: " << e.what(); + } + } +} + +TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { + // PARQUET-995 + TryReadDataFile("alltypes_plain.parquet"); +} + +TEST(TestArrowReaderAdHoc, CorruptedSchema) { + // PARQUET-1481 + TryReadDataFile("bad_data/PARQUET-1481.parquet", false /* should_succeed */); } class TestArrowReaderAdHocSparkAndHvr diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index d1bf2c5cdfdc6..4a988dacdd9aa 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -850,8 +850,12 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, case Type::FIXED_LEN_BYTE_ARRAY: return 
std::shared_ptr( new RecordReader(new TypedRecordReader(descr, pool))); - default: - DCHECK(false); + default: { + // PARQUET-1481: This can occur if the file is corrupt + std::stringstream ss; + ss << "Invalid physical column type: " << static_cast(descr->physical_type()); + throw ParquetException(ss.str()); + } } // Unreachable code, but supress compiler warning return nullptr; diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 92a8e6c2efdce..8eb0213c49175 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 92a8e6c2efdce1925c605d6313994db2c94478fb +Subproject commit 8eb0213c491752c9bbb1b884fcbb21deb548e464 From 49f93e0dc06023d664ecc82b625ad4d72f0fc0cd Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 26 Dec 2018 10:43:21 -0600 Subject: [PATCH 64/80] ARROW-4114: [C++] Add python to requirements list for running on ubuntu Author: Micah Kornfield Closes #3260 from emkornfield/update_build_instructions and squashes the following commits: 80c112b25 Add python to requirements list for running on ubuntu --- cpp/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/README.md b/cpp/README.md index b602bef1c7710..7e92648dc37aa 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -33,6 +33,10 @@ Building Arrow requires: * CMake 3.2 or higher * Boost +Testing arrow with ctest requires: + +* python + On Ubuntu/Debian you can install the requirements with: ```shell @@ -43,7 +47,8 @@ sudo apt-get install \ libboost-dev \ libboost-filesystem-dev \ libboost-regex-dev \ - libboost-system-dev + libboost-system-dev \ + python ``` On Alpine Linux: From 91c585d54b635212c78106790cf0ebed020fc758 Mon Sep 17 00:00:00 2001 From: Praveen Date: Wed, 26 Dec 2018 11:27:16 -0600 Subject: [PATCH 65/80] ARROW-4100: [Gandiva][C++] Fix regex for special character dot. Make dot a special character that needs to be escaped, else it does not match the sql standards. Author: Praveen Closes #3241 from praveenbingo/regex and squashes the following commits: 7792fec23 ARROW-4100: Add more valgrind suppressions. 12fb046e2 ARROW-4050: Fix valgrind suppressions. e97d38375 ARROW-4050: Fix regex for special character dot. 
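For context, a minimal standalone sketch of the escaping rule this fixes (illustrative only: it uses std::regex and a simplified translation, whereas Gandiva's real implementation is RegexUtil::SqlLikePatternToPcre on top of RE2). With '.' missing from the special set, the LIKE pattern 'abc.' turned into the regex "abc." and wrongly matched "abcd"; escaping the dot restores the SQL semantics of a literal '.'.

#include <iostream>
#include <regex>
#include <set>
#include <string>

// Translate a SQL LIKE pattern into a regex, escaping regex metacharacters.
// The special set mirrors pcre_regex_specials_ in regex_util.cc, now with '.'.
std::string SqlLikeToRegex(const std::string& pattern) {
  static const std::set<char> kSpecials = {'[', ']', '(', ')', '|', '^', '-', '+',
                                           '*', '?', '{', '}', '$', '\\', '.'};
  std::string result;
  for (char c : pattern) {
    if (c == '%') {
      result += ".*";  // SQL '%': any sequence of characters
    } else if (c == '_') {
      result += '.';   // SQL '_': exactly one character
    } else if (kSpecials.count(c) > 0) {
      result += '\\';  // escape metacharacters so they match literally
      result += c;
    } else {
      result += c;
    }
  }
  return result;
}

int main() {
  std::regex re(SqlLikeToRegex("abc."));
  std::cout << std::regex_match("abc.", re) << std::endl;  // 1: literal dot matches
  std::cout << std::regex_match("abcd", re) << std::endl;  // 0: rejected after the fix
  return 0;
}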
--- cpp/src/gandiva/CMakeLists.txt | 1 + cpp/src/gandiva/like_holder_test.cc | 10 ++++++++++ cpp/src/gandiva/regex_util.cc | 2 +- cpp/valgrind.supp | 16 ++++++++++++++-- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index b574c67af3811..6b67c8699c511 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -168,6 +168,7 @@ ADD_GANDIVA_TEST(selection_vector_test) ADD_GANDIVA_TEST(lru_cache_test) ADD_GANDIVA_TEST(to_date_holder_test) ADD_GANDIVA_TEST(simple_arena_test) +ADD_GANDIVA_TEST(like_holder_test) if (ARROW_GANDIVA_JAVA) add_subdirectory(jni) diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 3e3cd37c4fed1..d0ce8bb595021 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -84,6 +84,16 @@ TEST_F(TestLikeHolder, TestRegexEscape) { EXPECT_EQ(res, "%hello_abc.def#"); } +TEST_F(TestLikeHolder, TestDot) { + std::shared_ptr like_holder; + + auto status = LikeHolder::Make("abc.", &like_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_FALSE(like("abcd")); +} + TEST_F(TestLikeHolder, TestOptimise) { // optimise for 'starts_with' auto fnode = LikeHolder::TryOptimize(BuildLike("xy 123z%")); diff --git a/cpp/src/gandiva/regex_util.cc b/cpp/src/gandiva/regex_util.cc index 1d3860615d57f..abdd579d1f5e4 100644 --- a/cpp/src/gandiva/regex_util.cc +++ b/cpp/src/gandiva/regex_util.cc @@ -20,7 +20,7 @@ namespace gandiva { const std::set RegexUtil::pcre_regex_specials_ = { - '[', ']', '(', ')', '|', '^', '-', '+', '*', '?', '{', '}', '$', '\\'}; + '[', ']', '(', ')', '|', '^', '-', '+', '*', '?', '{', '}', '$', '\\', '.'}; Status RegexUtil::SqlLikePatternToPcre(const std::string& sql_pattern, char escape_char, std::string& pcre_pattern) { diff --git a/cpp/valgrind.supp b/cpp/valgrind.supp index d8bc8fb28f2d5..08076aade4d9e 100644 --- a/cpp/valgrind.supp +++ b/cpp/valgrind.supp @@ -25,11 +25,23 @@ :Conditional jump or move depends on uninitialised value(s) Memcheck:Cond ... - fun:_ZN3re23RE2C1E* + fun:*re2*RE2* } { :Use of uninitialised value of size 8 Memcheck:Value8 ... - fun:_ZN3re23RE2C1E* + fun:*re2*RE2* +} +{ + :Conditional jump or move depends on uninitialised value(s) + Memcheck:Cond + ... + fun:*re2*Prog* +} +{ + :Use of uninitialised value of size 8 + Memcheck:Value8 + ... + fun:*re2*Prog* } From 2849f46fcc203e4c9c5e09b3065ffb92cd133dce Mon Sep 17 00:00:00 2001 From: Pindikura Ravindra Date: Wed, 26 Dec 2018 13:44:49 -0600 Subject: [PATCH 66/80] ARROW-4115: [Gandiva] zero-init boolean data bufs Author: Pindikura Ravindra Closes #3263 from pravindra/arrow-4115 and squashes the following commits: d6b7834e3 ARROW-4115: zero-init boolean data bufs --- cpp/src/gandiva/projector.cc | 6 +++--- cpp/src/gandiva/tests/projector_test.cc | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index d5902fc72f16d..4cb352f2ad3c1 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -155,10 +155,10 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, int64_t data_len = arrow::BitUtil::BytesForBits(num_records * fw_type.bit_width()); ARROW_RETURN_NOT_OK(arrow::AllocateBuffer(pool, data_len, &data)); - // Valgrind detects unitialized memory at byte level. Boolean types use bits - // and can leave buffer memory uninitialized in the last byte. 
+ // This is not strictly required but valgrind gets confused and detects this + // as uninitialized memory access. See arrow::util::SetBitTo(). if (type->id() == arrow::Type::BOOL) { - data->mutable_data()[data_len - 1] = 0; + memset(data->mutable_data(), 0, data_len); } *array_data = arrow::ArrayData::Make(type, num_records, {null_bitmap, data}); diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 1aeb43b49b0dc..33cdce07ae6f7 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -227,10 +227,11 @@ static void TestArithmeticOpsForType(arrow::MemoryPool* pool) { EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data - int num_records = 4; - std::vector input0 = {1, 2, 53, 84}; - std::vector input1 = {10, 15, 23, 84}; - std::vector validity = {true, true, true, true}; + int num_records = 12; + std::vector input0 = {1, 2, 53, 84, 5, 15, 0, 1, 52, 83, 4, 120}; + std::vector input1 = {10, 15, 23, 84, 4, 51, 68, 9, 16, 18, 19, 37}; + std::vector validity = {true, true, true, true, true, true, + true, true, true, true, true, true}; auto array0 = MakeArrowArray(input0, validity); auto array1 = MakeArrowArray(input1, validity); From 46ecbb64e13d942803a21b23e5d7b7eff46bc752 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 26 Dec 2018 14:50:40 -0600 Subject: [PATCH 67/80] ARROW-4103: [Docs] Move documentation build instructions from source/python/development.rst to docs/README.md Author: Wes McKinney Closes #3243 from wesm/ARROW-4103 and squashes the following commits: 6873ac1c0 Direct user to project build instructions in docs/README.md --- docs/README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 docs/README.md diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000000..e20b59df109cb --- /dev/null +++ b/docs/README.md @@ -0,0 +1,30 @@ + + +# Apache Arrow Documentation + +This directory contains source files for building the main project +documentation. This includes the [Arrow columnar format specification][2]. + +Instructions for building the documentation site are found in +[docs/source/python/development.rst][1]. The build depends on the API +documentation for some of the project subcomponents. + +[1]: https://github.com/apache/arrow/blob/master/docs/source/python/development.rst#building-the-documentation +[2]: https://github.com/apache/arrow/tree/master/docs/source/format \ No newline at end of file From 0c2f3541efc86923fc2aff30efe664fb48ba1efd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 26 Dec 2018 18:00:15 -0600 Subject: [PATCH 68/80] ARROW-4116: [Python] Add warning to development instructions to avoid virtualenv when using Anaconda/miniconda Author: Wes McKinney Closes #3264 from wesm/ARROW-4116 and squashes the following commits: 1a2d8c590 Add warning to avoid virtualenv when using Anaconda/miniconda --- docs/source/python/development.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst index 1dcfda862817f..63e6051a7b864 100644 --- a/docs/source/python/development.rst +++ b/docs/source/python/development.rst @@ -113,6 +113,13 @@ about our build toolchain: Using pip ~~~~~~~~~ +.. warning:: + + If you installed Python using the Anaconda distribution or `Miniconda + `_, you cannot currently use ``virtualenv`` + to manage your development. Please follow the conda-based development + instructions instead. 
+ On macOS, install all dependencies through Homebrew that are required for building Arrow C++: From 9c76600af968d6f22642ae06fab13d16813fc009 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 27 Dec 2018 09:52:36 +0900 Subject: [PATCH 69/80] ARROW-4112: [Packaging] Add support for Gandiva .deb Author: Kouhei Sutou Closes #3258 from kou/linux-packages-gandiva and squashes the following commits: fa621931 Add support for Gandiva .deb/.rpm --- dev/release/00-prepare.sh | 2 +- dev/release/rat_exclude_files.txt | 8 ++ .../apt/debian-stretch/Dockerfile | 4 + .../apt/ubuntu-bionic/Dockerfile | 1 + .../apt/ubuntu-cosmic/Dockerfile | 1 + .../apt/ubuntu-xenial/Dockerfile | 1 + .../linux-packages/debian.ubuntu-trusty/rules | 1 - dev/tasks/linux-packages/debian/control | 82 +++++++++++++++++++ .../debian/gir1.2-gandiva-1.0.install | 1 + .../debian/libgandiva-dev.install | 3 + .../debian/libgandiva-glib-dev.install | 5 ++ .../debian/libgandiva-glib-doc.doc-base | 9 ++ .../debian/libgandiva-glib-doc.install | 1 + .../debian/libgandiva-glib-doc.links | 3 + .../debian/libgandiva-glib12.install | 1 + .../debian/libgandiva12.install | 2 + dev/tasks/linux-packages/debian/rules | 3 +- dev/tasks/linux-packages/yum/arrow.spec.in | 3 +- .../linux-packages/yum/centos-6/Dockerfile | 6 +- dev/tasks/tasks.yml | 26 ++++++ 20 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva-dev.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib-dev.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib-doc.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib-doc.links create mode 100644 dev/tasks/linux-packages/debian/libgandiva-glib12.install create mode 100644 dev/tasks/linux-packages/debian/libgandiva12.install diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 35d1998496fe0..141882e22566a 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -136,7 +136,7 @@ if [ "$#" -eq 2 ]; then ${target} \ $(echo $target | sed -e "s/${deb_lib_suffix}/${next_deb_lib_suffix}/") done - deb_lib_suffix_substitute_pattern="s/(lib(arrow|parquet)[-a-z]*)${deb_lib_suffix}/\\1${next_deb_lib_suffix}/g" + deb_lib_suffix_substitute_pattern="s/(lib(arrow|gandiva|parquet|plasma)[-a-z]*)${deb_lib_suffix}/\\1${next_deb_lib_suffix}/g" sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" debian*/control rm -f debian*/control.bak git add debian*/control diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index bcb474b79b060..7674e2fee0f29 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -72,6 +72,7 @@ dev/tasks/linux-packages/debian/compat dev/tasks/linux-packages/debian/control dev/tasks/linux-packages/debian/gir1.2-arrow-1.0.install dev/tasks/linux-packages/debian/gir1.2-arrow-cuda-1.0.install +dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install dev/tasks/linux-packages/debian/gir1.2-parquet-1.0.install dev/tasks/linux-packages/debian/gir1.2-plasma-1.0.install dev/tasks/linux-packages/debian/libarrow-dev.install @@ -87,6 +88,13 @@ dev/tasks/linux-packages/debian/libarrow-cuda12.install dev/tasks/linux-packages/debian/libarrow-python-dev.install dev/tasks/linux-packages/debian/libarrow-python12.install dev/tasks/linux-packages/debian/libarrow12.install 
+dev/tasks/linux-packages/debian/libgandiva-dev.install +dev/tasks/linux-packages/debian/libgandiva-glib-dev.install +dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base +dev/tasks/linux-packages/debian/libgandiva-glib-doc.install +dev/tasks/linux-packages/debian/libgandiva-glib-doc.links +dev/tasks/linux-packages/debian/libgandiva-glib12.install +dev/tasks/linux-packages/debian/libgandiva12.install dev/tasks/linux-packages/debian/libparquet-dev.install dev/tasks/linux-packages/debian/libparquet-glib-dev.install dev/tasks/linux-packages/debian/libparquet-glib-doc.doc-base diff --git a/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile b/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile index 4dde574cbf95d..70cefaabf262e 100644 --- a/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile +++ b/dev/tasks/linux-packages/apt/debian-stretch/Dockerfile @@ -22,6 +22,9 @@ ENV DEBIAN_FRONTEND noninteractive ARG DEBUG RUN sed -i'' -e 's/main$/main contrib non-free/g' /etc/apt/sources.list +RUN \ + echo "deb http://deb.debian.org/debian stretch-backports main" > \ + /etc/apt/sources.list.d/backports.list RUN \ quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ @@ -30,6 +33,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile index 5d3c9ba2932ed..68de4d569a663 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile +++ b/dev/tasks/linux-packages/apt/ubuntu-bionic/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile index 519d058d4b2e3..0d871eaa2635d 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile +++ b/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile index 17cb27713f08c..c7c5b1e09ece1 100644 --- a/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile +++ b/dev/tasks/linux-packages/apt/ubuntu-xenial/Dockerfile @@ -28,6 +28,7 @@ RUN \ autoconf-archive \ bison \ build-essential \ + clang-6.0 \ cmake \ debhelper\ devscripts \ diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/rules b/dev/tasks/linux-packages/debian.ubuntu-trusty/rules index 6f2ffdc416906..4eb26772df00c 100755 --- a/dev/tasks/linux-packages/debian.ubuntu-trusty/rules +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/rules @@ -22,7 +22,6 @@ override_dh_auto_configure: --builddirectory=cpp_build \ -- \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DARROW_BUILD_TESTS=OFF \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_PLASMA=ON diff --git a/dev/tasks/linux-packages/debian/control b/dev/tasks/linux-packages/debian/control index b5c696363798f..579c2e47bb520 100644 --- a/dev/tasks/linux-packages/debian/control +++ b/dev/tasks/linux-packages/debian/control @@ -5,6 +5,7 @@ Maintainer: Kouhei Sutou Build-Depends: autoconf-archive, bison, + clang-6.0, cmake, debhelper (>= 9.20160115), dh-autoreconf, @@ -102,6 +103,33 @@ Description: Apache Arrow is a data processing library for analysis . This package provides C++ header files for CUDA support. 
+Package: libgandiva12 +Section: libs +Architecture: any +Multi-Arch: same +Pre-Depends: ${misc:Pre-Depends} +Depends: + ${misc:Depends}, + ${shlibs:Depends}, + libarrow12 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides C++ library files. + +Package: libgandiva-dev +Section: libdevel +Architecture: any +Multi-Arch: same +Depends: + ${misc:Depends}, + libarrow-dev (= ${binary:Version}), + libgandiva12 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides C++ header files. + Package: libplasma12 Section: libs Architecture: any @@ -252,6 +280,60 @@ Description: Apache Arrow is a data processing library for analysis . This package provides GLib based header files for CUDA support. +Package: libgandiva-glib12 +Section: libs +Architecture: any +Multi-Arch: same +Pre-Depends: ${misc:Pre-Depends} +Depends: + ${misc:Depends}, + ${shlibs:Depends}, + libarrow-glib12 (= ${binary:Version}), + libgandiva12 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GLib based library files. + +Package: gir1.2-gandiva-1.0 +Section: introspection +Architecture: any +Multi-Arch: same +Depends: + ${gir:Depends}, + ${misc:Depends} +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GObject Introspection typelib files. + +Package: libgandiva-glib-dev +Section: libdevel +Architecture: any +Multi-Arch: same +Depends: + ${misc:Depends}, + libgandiva-dev (= ${binary:Version}), + libarrow-glib-dev (= ${binary:Version}), + libgandiva-glib12 (= ${binary:Version}), + gir1.2-gandiva-1.0 (= ${binary:Version}) +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides GLib based header files. + +Package: libgandiva-glib-doc +Section: doc +Architecture: all +Multi-Arch: foreign +Depends: + ${misc:Depends} +Recommends: libglib2.0-doc +Description: Gandiva is a toolset for compiling and evaluating expressions + on Arrow Data. + . + This package provides documentations. 
+ Package: libplasma-glib12 Section: libs Architecture: any diff --git a/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install b/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install new file mode 100644 index 0000000000000..0433b367a24c8 --- /dev/null +++ b/dev/tasks/linux-packages/debian/gir1.2-gandiva-1.0.install @@ -0,0 +1 @@ +usr/lib/*/girepository-1.0/Gandiva-1.0.typelib diff --git a/dev/tasks/linux-packages/debian/libgandiva-dev.install b/dev/tasks/linux-packages/debian/libgandiva-dev.install new file mode 100644 index 0000000000000..1e5d264378e69 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-dev.install @@ -0,0 +1,3 @@ +usr/lib/*/libgandiva.a +usr/lib/*/libgandiva.so +usr/lib/*/pkgconfig/gandiva.pc diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install b/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install new file mode 100644 index 0000000000000..4189dac66ed90 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-dev.install @@ -0,0 +1,5 @@ +usr/include/gandiva-glib/ +usr/lib/*/libgandiva-glib.a +usr/lib/*/libgandiva-glib.so +usr/lib/*/pkgconfig/gandiva-glib.pc +usr/share/gir-1.0/Gandiva-1.0.gir diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base new file mode 100644 index 0000000000000..bed6a124c5e08 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.doc-base @@ -0,0 +1,9 @@ +Document: gandiva-glib +Title: Gandiva GLib Reference Manual +Author: The Apache Software Foundation +Abstract: Gandiva GLib is a toolset for compiling and evaluating expressions on Arrow Data that uses GLib. +Section: Programming + +Format: HTML +Index: /usr/share/doc/libarrow-glib-doc/gandiva-glib/index.html +Files: /usr/share/doc/libarrow-glib-doc/gandiva-glib/*.html diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install new file mode 100644 index 0000000000000..54d2d066c275a --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.install @@ -0,0 +1 @@ +usr/share/doc/libarrow-glib-doc/gandiva-glib/ diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links new file mode 100644 index 0000000000000..291b004ed717a --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib-doc.links @@ -0,0 +1,3 @@ +usr/share/doc/libglib2.0-doc/glib usr/share/doc/libgandiva-glib-doc/glib +usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libgandiva-glib-doc/gobject +usr/share/doc/libarrow-glib-doc/gandiva-glib usr/share/gtk-doc/html/gandiva-glib diff --git a/dev/tasks/linux-packages/debian/libgandiva-glib12.install b/dev/tasks/linux-packages/debian/libgandiva-glib12.install new file mode 100644 index 0000000000000..6257fd43823c0 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva-glib12.install @@ -0,0 +1 @@ +usr/lib/*/libgandiva-glib.so.* diff --git a/dev/tasks/linux-packages/debian/libgandiva12.install b/dev/tasks/linux-packages/debian/libgandiva12.install new file mode 100644 index 0000000000000..38a05876db6e6 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libgandiva12.install @@ -0,0 +1,2 @@ +usr/lib/*/libgandiva.so.* +usr/lib/*/gandiva/ diff --git a/dev/tasks/linux-packages/debian/rules b/dev/tasks/linux-packages/debian/rules index f3cc2a045c1ee..d82f306cd2656 100755 --- a/dev/tasks/linux-packages/debian/rules +++ b/dev/tasks/linux-packages/debian/rules @@ 
-24,12 +24,13 @@ override_dh_auto_configure: --builddirectory=cpp_build \ -- \ -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DARROW_BUILD_TESTS=OFF \ -DARROW_PYTHON=ON \ -DARROW_BOOST_USE_SHARED=ON \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ -DARROW_PLASMA=ON \ + -DARROW_GANDIVA=ON \ + -DARROW_GANDIVA_JAVA=OFF \ -DPROTOBUF_HOME=/usr \ -DARROW_PROTOBUF_USE_SHARED=ON \ -DPythonInterp_FIND_VERSION=ON \ diff --git a/dev/tasks/linux-packages/yum/arrow.spec.in b/dev/tasks/linux-packages/yum/arrow.spec.in index ad60dfbdde18e..568477e90d6d3 100644 --- a/dev/tasks/linux-packages/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/yum/arrow.spec.in @@ -75,8 +75,7 @@ cd cpp/build %if %{use_parquet} -DARROW_PARQUET=ON \ %endif - -DARROW_PLASMA=ON \ - -DARROW_BUILD_TESTS=OFF + -DARROW_PLASMA=ON make %{?_smp_mflags} cd - diff --git a/dev/tasks/linux-packages/yum/centos-6/Dockerfile b/dev/tasks/linux-packages/yum/centos-6/Dockerfile index 8143b99efd180..c7de92296767a 100644 --- a/dev/tasks/linux-packages/yum/centos-6/Dockerfile +++ b/dev/tasks/linux-packages/yum/centos-6/Dockerfile @@ -20,14 +20,13 @@ FROM centos:6 ARG DEBUG ENV \ - SRPM_DOWNLOAD_URL=http://vault.centos.org/7.4.1708/os/Source/SPackages \ + SRPM_DOWNLOAD_URL=http://vault.centos.org/7.6.1810/os/Source/SPackages \ LIBARCHIVE_SRPM_BASE=libarchive-3.1.2-10.el7_2.src.rpm RUN \ quiet=$([ "${DEBUG}" = "yes" ] || echo "--quiet") && \ yum update -y ${quiet} && \ yum install -y ${quiet} \ - centos-release-scl \ epel-release && \ yum install -y \ autoconf268 \ @@ -43,9 +42,10 @@ RUN \ ~/rpmbuild/SPECS/libarchive.spec && \ yum install -y ${quiet} ~/rpmbuild/RPMS/*/libarchive-3.*.rpm && \ rm -rf ${LIBARCHIVE_SRPM_BASE} ~/rpmbuild/ && \ + yum install -y ${quiet} \ + centos-release-scl && \ yum install -y ${quiet} \ boost-devel \ - centos-release-scl \ cmake3 \ devtoolset-6 \ git \ diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index ea104d507eec1..52bbc577e6f1b 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -274,6 +274,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb @@ -292,6 +293,13 @@ tasks: - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12-dbgsym_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib12-dbgsym_{no_rc_version}-1_amd64.deb + - libgandiva-glib12_{no_rc_version}-1_amd64.deb + - libgandiva12-dbgsym_{no_rc_version}-1_amd64.deb + - libgandiva12_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb @@ -356,6 +364,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb @@ -369,6 +378,11 @@ tasks: - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb + - 
libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib12_{no_rc_version}-1_amd64.deb + - libgandiva12_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb @@ -396,6 +410,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb @@ -409,6 +424,11 @@ tasks: - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib12_{no_rc_version}-1_amd64.deb + - libgandiva12_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb @@ -436,6 +456,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-gandiva-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb @@ -449,6 +470,11 @@ tasks: - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb + - libgandiva-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-dev_{no_rc_version}-1_amd64.deb + - libgandiva-glib-doc_{no_rc_version}-1_all.deb + - libgandiva-glib12_{no_rc_version}-1_amd64.deb + - libgandiva12_{no_rc_version}-1_amd64.deb - libparquet-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-dev_{no_rc_version}-1_amd64.deb - libparquet-glib-doc_{no_rc_version}-1_all.deb From abde663b215295c051ae46f8a4e2bcceec081a2f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 27 Dec 2018 10:24:00 +0900 Subject: [PATCH 70/80] ARROW-4078: [CI] Detect changes in docs/ directory and build the Linux Python entry if so Author: Wes McKinney Closes #3266 from wesm/ARROW-4078 and squashes the following commits: 395c4969 Detect changes in docs/ directory and build the Linux Python entry if so --- .travis.yml | 2 +- ci/detect-changes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 10300c9b6e287..99ff24aaacc97 100644 --- a/.travis.yml +++ b/.travis.yml @@ -106,7 +106,7 @@ matrix: # TODO(wesm): Run the benchmarks outside of Travis # - ARROW_TRAVIS_PYTHON_BENCHMARKS=1 before_script: - - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi + - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ] && [ $ARROW_CI_DOCS_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh diff --git a/ci/detect-changes.py b/ci/detect-changes.py index e9a647c5e6d9c..102dc56396c45 100644 --- a/ci/detect-changes.py +++ b/ci/detect-changes.py @@ -26,7 +26,7 @@ perr = functools.partial(print, file=sys.stderr) -LANGUAGE_TOPICS = ['c_glib', 'cpp', 'go', 'java', 
'js', 'python', +LANGUAGE_TOPICS = ['c_glib', 'cpp', 'docs', 'go', 'java', 'js', 'python', 'r', 'ruby', 'rust'] ALL_TOPICS = LANGUAGE_TOPICS + ['integration', 'site', 'dev'] From 5904eea4cc2f422c14c8ef9d1ac323718ff765ea Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 26 Dec 2018 22:38:00 -0600 Subject: [PATCH 71/80] ARROW-3324: [Python] Destroy temporary metadata builder classes more eagerly when building files to reduce memory usage Destroy RowGroupMetadataBuilder after each row group is completed Author: Wes McKinney Closes #3261 from tanyaschlusser/ARROW-3324 and squashes the following commits: 5f3876706 Refine case a bit 4f2bdcdce Destroy RowGroupMetadataBuilder object after completing a row group to reduce memory usage --- cpp/src/parquet/metadata-test.cc | 2 +- cpp/src/parquet/metadata.cc | 67 +++++++++++++------------------- cpp/src/parquet/metadata.h | 25 ++++++------ python/scripts/test_leak.py | 66 ++++++++++++++++++++++++------- 4 files changed, 93 insertions(+), 67 deletions(-) diff --git a/cpp/src/parquet/metadata-test.cc b/cpp/src/parquet/metadata-test.cc index bcf911eab8b26..826ac4d6a504f 100644 --- a/cpp/src/parquet/metadata-test.cc +++ b/cpp/src/parquet/metadata-test.cc @@ -59,7 +59,6 @@ TEST(Metadata, TestBuildAccess) { auto f_builder = FileMetaDataBuilder::Make(&schema, props); auto rg1_builder = f_builder->AppendRowGroup(); - auto rg2_builder = f_builder->AppendRowGroup(); // Write the metadata // rowgroup1 metadata @@ -75,6 +74,7 @@ TEST(Metadata, TestBuildAccess) { rg1_builder->Finish(1024); // rowgroup2 metadata + auto rg2_builder = f_builder->AppendRowGroup(); col1_builder = rg2_builder->NextColumnChunk(); col2_builder = rg2_builder->NextColumnChunk(); // column metadata diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 22cfbdb91aa73..6ac53c58afed4 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -115,7 +115,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } possible_stats_ = nullptr; } - ~ColumnChunkMetaDataImpl() {} // column chunk inline int64_t file_offset() const { return column_->file_offset; } @@ -197,13 +196,13 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { }; std::unique_ptr ColumnChunkMetaData::Make( - const uint8_t* metadata, const ColumnDescriptor* descr, + const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) { return std::unique_ptr( new ColumnChunkMetaData(metadata, descr, writer_version)); } -ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata, +ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version) : impl_{std::unique_ptr(new ColumnChunkMetaDataImpl( @@ -272,7 +271,6 @@ class RowGroupMetaData::RowGroupMetaDataImpl { const SchemaDescriptor* schema, const ApplicationVersion* writer_version) : row_group_(row_group), schema_(schema), writer_version_(writer_version) {} - ~RowGroupMetaDataImpl() {} inline int num_columns() const { return static_cast(row_group_->columns.size()); } @@ -289,9 +287,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl { << " columns, requested metadata for column: " << i; throw ParquetException(ss.str()); } - return ColumnChunkMetaData::Make( - reinterpret_cast(&row_group_->columns[i]), schema_->Column(i), - writer_version_); + return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), + writer_version_); } private: @@ -301,14 +298,13 @@ class RowGroupMetaData::RowGroupMetaDataImpl { }; 
std::unique_ptr RowGroupMetaData::Make( - const uint8_t* metadata, const SchemaDescriptor* schema, + const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) { return std::unique_ptr( new RowGroupMetaData(metadata, schema, writer_version)); } -RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata, - const SchemaDescriptor* schema, +RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version) : impl_{std::unique_ptr(new RowGroupMetaDataImpl( reinterpret_cast(metadata), schema, writer_version))} { @@ -332,10 +328,11 @@ class FileMetaData::FileMetaDataImpl { public: FileMetaDataImpl() : metadata_len_(0) {} - explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) + explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len) : metadata_len_(0) { metadata_.reset(new format::FileMetaData); - DeserializeThriftMsg(metadata, metadata_len, metadata_.get()); + DeserializeThriftMsg(reinterpret_cast(metadata), metadata_len, + metadata_.get()); metadata_len_ = *metadata_len; if (metadata_->__isset.created_by) { @@ -348,7 +345,6 @@ class FileMetaData::FileMetaDataImpl { InitColumnOrders(); InitKeyValueMetadata(); } - ~FileMetaDataImpl() {} inline uint32_t size() const { return metadata_len_; } inline int num_columns() const { return schema_.num_columns(); } @@ -375,9 +371,7 @@ class FileMetaData::FileMetaDataImpl { << " row groups, requested metadata for row group: " << i; throw ParquetException(ss.str()); } - return RowGroupMetaData::Make( - reinterpret_cast(&metadata_->row_groups[i]), &schema_, - &writer_version_); + return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_); } const SchemaDescriptor* schema() const { return &schema_; } @@ -429,13 +423,13 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr key_value_metadata_; }; -std::shared_ptr FileMetaData::Make(const uint8_t* metadata, +std::shared_ptr FileMetaData::Make(const void* metadata, uint32_t* metadata_len) { // This FileMetaData ctor is private, not compatible with std::make_shared return std::shared_ptr(new FileMetaData(metadata, metadata_len)); } -FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len) +FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len) : impl_{std::unique_ptr( new FileMetaDataImpl(metadata, metadata_len))} {} @@ -606,11 +600,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { Init(column_chunk); } - ~ColumnChunkMetaDataBuilderImpl() {} - - const uint8_t* contents() const { - return reinterpret_cast(column_chunk_); - } + const void* contents() const { return column_chunk_; } // column chunk void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); } @@ -699,7 +689,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::unique_ptr ColumnChunkMetaDataBuilder::Make( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents) { + void* contents) { return std::unique_ptr( new ColumnChunkMetaDataBuilder(props, column, contents)); } @@ -717,14 +707,14 @@ ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents) + void* contents) : impl_{std::unique_ptr( new ColumnChunkMetaDataBuilderImpl( props, column, reinterpret_cast(contents)))} {} 
ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() {} -const uint8_t* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); } +const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); } void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) { impl_->set_file_path(path); @@ -754,12 +744,11 @@ void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed, class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { public: explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr& props, - const SchemaDescriptor* schema, uint8_t* contents) + const SchemaDescriptor* schema, void* contents) : properties_(props), schema_(schema), current_column_(0) { row_group_ = reinterpret_cast(contents); InitializeColumns(schema->num_columns()); } - ~RowGroupMetaDataBuilderImpl() {} ColumnChunkMetaDataBuilder* NextColumnChunk() { if (!(current_column_ < num_columns())) { @@ -770,8 +759,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { } auto column = schema_->Column(current_column_); auto column_builder = ColumnChunkMetaDataBuilder::Make( - properties_, column, - reinterpret_cast(&row_group_->columns[current_column_++])); + properties_, column, &row_group_->columns[current_column_++]); auto column_builder_ptr = column_builder.get(); column_builders_.push_back(std::move(column_builder)); return column_builder_ptr; @@ -820,14 +808,14 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { std::unique_ptr RowGroupMetaDataBuilder::Make( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents) { + void* contents) { return std::unique_ptr( new RowGroupMetaDataBuilder(props, schema_, contents)); } RowGroupMetaDataBuilder::RowGroupMetaDataBuilder( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents) + void* contents) : impl_{std::unique_ptr( new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {} @@ -861,16 +849,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { : properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) { metadata_.reset(new format::FileMetaData()); } - ~FileMetaDataBuilderImpl() {} RowGroupMetaDataBuilder* AppendRowGroup() { - auto row_group = std::unique_ptr(new format::RowGroup()); - auto row_group_builder = RowGroupMetaDataBuilder::Make( - properties_, schema_, reinterpret_cast(row_group.get())); - RowGroupMetaDataBuilder* row_group_ptr = row_group_builder.get(); - row_group_builders_.push_back(std::move(row_group_builder)); - row_groups_.push_back(std::move(row_group)); - return row_group_ptr; + row_groups_.emplace_back(new format::RowGroup); + current_row_group_builder_ = + RowGroupMetaDataBuilder::Make(properties_, schema_, row_groups_.back().get()); + return current_row_group_builder_.get(); } std::unique_ptr Finish() { @@ -939,7 +923,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { private: const std::shared_ptr properties_; std::vector> row_groups_; - std::vector> row_group_builders_; + + std::unique_ptr current_row_group_builder_; const SchemaDescriptor* schema_; std::shared_ptr key_value_metadata_; }; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 25f4d4cd8cbdf..209c75a6ffbce 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -93,7 +93,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const uint8_t* metadata, const ColumnDescriptor* descr, + const void* 
metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR); ~ColumnChunkMetaData(); @@ -119,7 +119,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_uncompressed_size() const; private: - explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, + explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr, const ApplicationVersion* writer_version = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; @@ -130,7 +130,7 @@ class PARQUET_EXPORT RowGroupMetaData { public: // API convenience to get a MetaData accessor static std::unique_ptr Make( - const uint8_t* metadata, const SchemaDescriptor* schema, + const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version = NULLPTR); ~RowGroupMetaData(); @@ -144,7 +144,7 @@ class PARQUET_EXPORT RowGroupMetaData { std::unique_ptr ColumnChunk(int i) const; private: - explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema, + explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, const ApplicationVersion* writer_version = NULLPTR); // PIMPL Idiom class RowGroupMetaDataImpl; @@ -156,7 +156,7 @@ class FileMetaDataBuilder; class PARQUET_EXPORT FileMetaData { public: // API convenience to get a MetaData accessor - static std::shared_ptr Make(const uint8_t* serialized_metadata, + static std::shared_ptr Make(const void* serialized_metadata, uint32_t* metadata_len); ~FileMetaData(); @@ -182,7 +182,7 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; - explicit FileMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len); + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); // PIMPL Idiom FileMetaData(); @@ -199,7 +199,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { static std::unique_ptr Make( const std::shared_ptr& props, const ColumnDescriptor* column, - uint8_t* contents); + void* contents); ~ColumnChunkMetaDataBuilder(); @@ -217,7 +217,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { bool dictionary_fallback); // The metadata contents, suitable for passing to ColumnChunkMetaData::Make - const uint8_t* contents() const; + const void* contents() const; // For writing metadata at end of column chunk void WriteTo(OutputStream* sink); @@ -226,7 +226,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, const ColumnDescriptor* column); explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, - const ColumnDescriptor* column, uint8_t* contents); + const ColumnDescriptor* column, void* contents); // PIMPL Idiom class ColumnChunkMetaDataBuilderImpl; std::unique_ptr impl_; @@ -237,7 +237,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { // API convenience to get a MetaData reader static std::unique_ptr Make( const std::shared_ptr& props, const SchemaDescriptor* schema_, - uint8_t* contents); + void* contents); ~RowGroupMetaDataBuilder(); @@ -253,7 +253,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { private: explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, - const SchemaDescriptor* schema_, uint8_t* contents); + const SchemaDescriptor* schema_, void* contents); // PIMPL Idiom class RowGroupMetaDataBuilderImpl; std::unique_ptr impl_; @@ -268,9 +268,10 @@ class PARQUET_EXPORT FileMetaDataBuilder { ~FileMetaDataBuilder(); + // The prior RowGroupMetaDataBuilder (if any) is destroyed RowGroupMetaDataBuilder* 
AppendRowGroup(); - // commit the metadata + // Complete the Thrift structure std::unique_ptr Finish(); private: diff --git a/python/scripts/test_leak.py b/python/scripts/test_leak.py index e3de56b28a168..d3984a89ef754 100644 --- a/python/scripts/test_leak.py +++ b/python/scripts/test_leak.py @@ -19,29 +19,49 @@ import pyarrow as pa import numpy as np +import pandas as pd +import pandas.util.testing as tm import memory_profiler import gc import io +MEGABYTE = 1 << 20 -def leak(): + +def assert_does_not_leak(f, iterations=10, check_interval=1, tolerance=5): + gc.collect() + baseline = memory_profiler.memory_usage()[0] + for i in range(iterations): + f() + if i % check_interval == 0: + gc.collect() + usage = memory_profiler.memory_usage()[0] + diff = usage - baseline + print("{0}: {1}\r".format(i, diff), end="") + if diff > tolerance: + raise Exception("Memory increased by {0} megabytes after {1} " + "iterations".format(diff, i + 1)) + gc.collect() + usage = memory_profiler.memory_usage()[0] + diff = usage - baseline + print("\nMemory increased by {0} megabytes after {1} " + "iterations".format(diff, iterations)) + + +def test_leak1(): data = [pa.array(np.concatenate([np.random.randn(100000)] * 1000))] table = pa.Table.from_arrays(data, ['foo']) - while True: - print('calling to_pandas') - print('memory_usage: {0}'.format(memory_profiler.memory_usage())) - table.to_pandas() - gc.collect() -# leak() + def func(): + table.to_pandas() + assert_does_not_leak(func) -def leak2(): +def test_leak2(): data = [pa.array(np.concatenate([np.random.randn(100000)] * 10))] table = pa.Table.from_arrays(data, ['foo']) - while True: - print('calling to_pandas') - print('memory_usage: {0}'.format(memory_profiler.memory_usage())) + + def func(): df = table.to_pandas() batch = pa.RecordBatch.from_pandas(df) @@ -55,7 +75,27 @@ def leak2(): reader = pa.open_file(buf_reader) reader.read_all() - gc.collect() + assert_does_not_leak(func, iterations=50, tolerance=50) + + +def test_leak3(): + import pyarrow.parquet as pq + + df = pd.DataFrame({'a{0}'.format(i): [1, 2, 3, 4] + for i in range(50)}) + table = pa.Table.from_pandas(df, preserve_index=False) + + writer = pq.ParquetWriter('leak_test_' + tm.rands(5) + '.parquet', + table.schema) + + def func(): + writer.write_table(table, row_group_size=len(table)) + + # This does not "leak" per se but we do want to have this use as little + # memory as possible + assert_does_not_leak(func, iterations=500, + check_interval=50, tolerance=20) -leak2() +if __name__ == '__main__': + test_leak3() From a536529a624b793ffa18c3c39581fdf777e85f8f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 27 Dec 2018 17:11:58 +0100 Subject: [PATCH 72/80] ARROW-4102: [C++] Return common IdentityCast when casting to equal type I also added some code to make it easier to write cast tests in JSON. As one issue with the JSON parser -- we have a number of tests in cast-test.cc that check that values that are in null positions are ignored. 
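To make the limitation concrete, here is a minimal standalone sketch (a hypothetical helper, not the actual Arrow JSON test parser) of turning a flat JSON-style integer list into a values vector plus a validity vector. Because both are derived from the same string, a null slot always receives a parser-chosen default, so a JSON-driven test cannot deliberately plant an arbitrary value behind a null bit the way the hand-built arrays in cast-test.cc can.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Parse tokens such as {"1", "null", "3"} into (values, validity). A "null"
// token clears the validity flag and stores a default of 0; the JSON alone
// offers no way to request a different value for that slot.
void ParseInt32List(const std::vector<std::string>& tokens,
                    std::vector<int32_t>* values, std::vector<bool>* validity) {
  for (const auto& token : tokens) {
    if (token == "null") {
      values->push_back(0);  // parser-chosen default, not test-controlled
      validity->push_back(false);
    } else {
      values->push_back(std::stoi(token));
      validity->push_back(true);
    }
  }
}

int main() {
  std::vector<int32_t> values;
  std::vector<bool> validity;
  ParseInt32List({"1", "null", "3"}, &values, &validity);
  for (std::size_t i = 0; i < values.size(); ++i) {
    std::cout << values[i] << (validity[i] ? "" : " (null)") << std::endl;
  }
  return 0;
}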
We might augment the parser to be able to pass both values and validity bitmap as separate JSON strings Author: Wes McKinney Closes #3265 from wesm/ARROW-4102 and squashes the following commits: 8c27ba2a Fix bad memory access 9c52297f Add various identity cast tests, verify that fixed_size_binary identity casts work now --- cpp/src/arrow/compute/kernels/cast-test.cc | 118 ++++++++++++--------- cpp/src/arrow/compute/kernels/cast.cc | 27 +++-- cpp/src/arrow/ipc/json-simple.cc | 5 + python/pyarrow/tests/test_array.py | 3 +- 4 files changed, 95 insertions(+), 58 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 4c3992868ef6d..781e0af87a825 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -51,6 +51,10 @@ using std::vector; namespace arrow { namespace compute { +static std::vector> kNumericTypes = { + uint8(), int8(), uint16(), int16(), uint32(), + int32(), uint64(), int64(), float32(), float64()}; + static void AssertBufferSame(const Array& left, const Array& right, int buffer_index) { ASSERT_EQ(left.data()->buffers[buffer_index].get(), right.data()->buffers[buffer_index].get()); @@ -81,8 +85,10 @@ class TestCast : public ComputeFixture, public TestBase { void CheckZeroCopy(const Array& input, const shared_ptr& out_type) { shared_ptr result; ASSERT_OK(Cast(&ctx_, input, out_type, {}, &result)); - AssertBufferSame(input, *result, 0); - AssertBufferSame(input, *result, 1); + ASSERT_EQ(input.data()->buffers.size(), result->data()->buffers.size()); + for (size_t i = 0; i < input.data()->buffers.size(); ++i) { + AssertBufferSame(input, *result, static_cast(i)); + } } template @@ -106,15 +112,25 @@ class TestCast : public ComputeFixture, public TestBase { CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); } } -}; -TEST_F(TestCast, SameTypeZeroCopy) { - vector is_valid = {true, false, true, true, true}; - vector v1 = {0, 1, 2, 3, 4}; + void CheckCaseJSON(const shared_ptr& in_type, + const shared_ptr& out_type, const std::string& in_json, + const std::string& expected_json, + const CastOptions& options = CastOptions()) { + shared_ptr input = ArrayFromJSON(in_type, in_json); + shared_ptr expected = ArrayFromJSON(out_type, expected_json); + DCHECK_EQ(input->length(), expected->length()); + CheckPass(*input, *expected, out_type, options); - shared_ptr arr; - ArrayFromVector(int32(), is_valid, v1, &arr); + // Check a sliced variant + if (input->length() > 1) { + CheckPass(*input->Slice(1), *expected->Slice(1), out_type, options); + } + } +}; +TEST_F(TestCast, SameTypeZeroCopy) { + shared_ptr arr = ArrayFromJSON(int32(), "[0, null, 2, 3, 4]"); shared_ptr result; ASSERT_OK(Cast(&this->ctx_, *arr, int32(), {}, &result)); @@ -124,20 +140,16 @@ TEST_F(TestCast, SameTypeZeroCopy) { TEST_F(TestCast, ToBoolean) { CastOptions options; + for (auto type : kNumericTypes) { + CheckCaseJSON(type, boolean(), "[0, null, 127, 1, 0]", + "[false, null, true, true, false]"); + } - vector is_valid = {true, false, true, true, true}; - - // int8, should suffice for other integers - vector v1 = {0, 1, 127, -1, 0}; - vector e1 = {false, true, true, true, false}; - CheckCase(int8(), v1, is_valid, boolean(), e1, - options); - - // floating point - vector v2 = {1.0, 0, 0, -1.0, 5.0}; - vector e2 = {true, false, false, true, true}; - CheckCase(float64(), v2, is_valid, boolean(), e2, - options); + // Check negative numbers + CheckCaseJSON(int8(), boolean(), "[0, null, 127, -1, 0]", + 
"[false, null, true, true, false]"); + CheckCaseJSON(float64(), boolean(), "[0, null, 127, -1, 0]", + "[false, null, true, true, false]"); } TEST_F(TestCast, ToIntUpcast) { @@ -648,36 +660,6 @@ TEST_F(TestCast, TimeToCompatible) { options); } -TEST_F(TestCast, PrimitiveZeroCopy) { - shared_ptr arr; - - ArrayFromVector(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint8()); - ArrayFromVector(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int8()); - - ArrayFromVector(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint16()); - ArrayFromVector(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int16()); - - ArrayFromVector(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint32()); - ArrayFromVector(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int32()); - - ArrayFromVector(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, uint64()); - ArrayFromVector(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, int64()); - - ArrayFromVector(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float32()); - - ArrayFromVector(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr); - CheckZeroCopy(*arr, float64()); -} - TEST_F(TestCast, DateToCompatible) { CastOptions options; @@ -1193,5 +1175,39 @@ TEST_F(TestCast, ListToList) { CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options); } +TEST_F(TestCast, IdentityCasts) { + // ARROW-4102 + auto CheckIdentityCast = [this](std::shared_ptr type, + const std::string& json) { + auto arr = ArrayFromJSON(type, json); + CheckZeroCopy(*arr, type); + }; + + CheckIdentityCast(null(), "[null, null, null]"); + CheckIdentityCast(boolean(), "[false, true, null, false]"); + + for (auto type : kNumericTypes) { + CheckIdentityCast(type, "[1, 2, null, 4]"); + } + CheckIdentityCast(binary(), "[\"foo\", \"bar\"]"); + CheckIdentityCast(utf8(), "[\"foo\", \"bar\"]"); + CheckIdentityCast(fixed_size_binary(3), "[\"foo\", \"bar\"]"); + + CheckIdentityCast(list(int8()), "[[1, 2], [null], [], [3]]"); + + CheckIdentityCast(time32(TimeUnit::MILLI), "[1, 2, 3, 4]"); + CheckIdentityCast(time64(TimeUnit::MICRO), "[1, 2, 3, 4]"); + CheckIdentityCast(date32(), "[1, 2, 3, 4]"); + CheckIdentityCast(date64(), "[86400000, 0]"); + CheckIdentityCast(timestamp(TimeUnit::SECOND), "[1, 2, 3, 4]"); + + { + auto dict_type = dictionary(int8(), ArrayFromJSON(int8(), "[1, 2, 3]")); + auto dict_indices = ArrayFromJSON(int8(), "[0, 1, 2, 0, null, 2]"); + auto dict_array = std::make_shared(dict_type, dict_indices); + CheckZeroCopy(*dict_array, dict_type); + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 7976ef0beffc6..15746d4c9965e 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -99,6 +99,8 @@ struct is_zero_copy_cast { static constexpr bool value = false; }; +// TODO(wesm): ARROW-4110; this is no longer needed, but may be useful if we +// ever _do_ want to generate identity cast kernels at compile time template struct is_zero_copy_cast< O, I, @@ -1143,6 +1145,17 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& i return Status::OK(); } +class IdentityCast : public UnaryKernel { + public: + IdentityCast() {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { + DCHECK_EQ(input.kind(), Datum::ARRAY); + out->value = 
input.array()->Copy(); + return Status::OK(); + } +}; + class CastKernel : public UnaryKernel { public: CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy, @@ -1188,6 +1201,8 @@ class CastKernel : public UnaryKernel { std::shared_ptr out_type_; }; +// TODO(wesm): ARROW-4110 Do not generate cases that could return IdentityCast + #define CAST_CASE(InType, OutType) \ case OutType::type_id: \ is_zero_copy = is_zero_copy_cast::value; \ @@ -1233,12 +1248,10 @@ class CastKernel : public UnaryKernel { FN(Int64Type, Date64Type); #define DATE32_CASES(FN, IN_TYPE) \ - FN(Date32Type, Date32Type); \ FN(Date32Type, Date64Type); \ FN(Date32Type, Int32Type); #define DATE64_CASES(FN, IN_TYPE) \ - FN(Date64Type, Date64Type); \ FN(Date64Type, Date32Type); \ FN(Date64Type, Int64Type); @@ -1258,12 +1271,9 @@ class CastKernel : public UnaryKernel { FN(TimestampType, Date64Type); \ FN(TimestampType, Int64Type); -#define BINARY_CASES(FN, IN_TYPE) \ - FN(BinaryType, BinaryType); \ - FN(BinaryType, StringType); +#define BINARY_CASES(FN, IN_TYPE) FN(BinaryType, StringType); #define STRING_CASES(FN, IN_TYPE) \ - FN(StringType, StringType); \ FN(StringType, BooleanType); \ FN(StringType, UInt8Type); \ FN(StringType, Int8Type); \ @@ -1365,6 +1375,11 @@ Status GetListCastFunc(const DataType& in_type, const std::shared_ptr& Status GetCastFunction(const DataType& in_type, const std::shared_ptr& out_type, const CastOptions& options, std::unique_ptr* kernel) { + if (in_type.Equals(out_type)) { + *kernel = std::unique_ptr(new IdentityCast); + return Status::OK(); + } + switch (in_type.id()) { CAST_FUNCTION_CASE(NullType); CAST_FUNCTION_CASE(BooleanType); diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc index 7a78fe4986cd5..047788ce0f5de 100644 --- a/cpp/src/arrow/ipc/json-simple.cc +++ b/cpp/src/arrow/ipc/json-simple.cc @@ -474,7 +474,12 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 3d3402139cb43..17ff9c625871a 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -768,7 +768,8 @@ def test_cast_date64_to_int(): ('float', [0.0, 0.1, 0.2]), ('double', [0.0, 0.1, 0.2]), ('string', ['a', 'b', 'c']), - ('binary', [b'a', b'b', b'c']) + ('binary', [b'a', b'b', b'c']), + (pa.binary(3), [b'abc', b'bcd', b'cde']) ]) def test_cast_identities(ty, values): arr = pa.array(values, type=ty) From 6781c2da8915f99eaa8438cce25329152a0defc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 27 Dec 2018 17:26:55 +0100 Subject: [PATCH 73/80] ARROW-4088: [Python] Table.from_batches() fails when passed a schema with metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Krisztián Szűcs Closes #3256 from kszucs/ARROW-4088 and 
squashes the following commits: b2698995 turn off check_metadata cf5c0829 propagate check_metadata to Schema's fields --- cpp/src/arrow/type-test.cc | 20 ++++++++++++-------- cpp/src/arrow/type.cc | 12 +++++++----- cpp/src/arrow/type.h | 4 ++-- python/pyarrow/tests/test_schema.py | 14 ++++++++++++++ 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 5b758d7a129fd..ec82e0a5dbbf9 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -58,6 +58,7 @@ TEST(TestField, Equals) { ASSERT_TRUE(f0.Equals(f0_other)); ASSERT_FALSE(f0.Equals(f0_nn)); ASSERT_FALSE(f0.Equals(f0_with_meta)); + ASSERT_TRUE(f0.Equals(f0_with_meta, false)); } TEST(TestField, TestMetadataConstruction) { @@ -200,28 +201,31 @@ TEST_F(TestSchema, GetFieldIndex) { } TEST_F(TestSchema, TestMetadataConstruction) { - auto f0 = field("f0", int32()); - auto f1 = field("f1", uint8(), false); - auto f2 = field("f2", utf8()); auto metadata0 = key_value_metadata({{"foo", "bar"}, {"bizz", "buzz"}}); auto metadata1 = key_value_metadata({{"foo", "baz"}}); - auto schema0 = ::arrow::schema({f0, f1, f2}, metadata0); - ASSERT_TRUE(metadata0->Equals(*schema0->metadata())); + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8(), true); + auto f3 = field("f2", utf8(), true, metadata1->Copy()); + auto schema0 = ::arrow::schema({f0, f1, f2}, metadata0); auto schema1 = ::arrow::schema({f0, f1, f2}, metadata1); - ASSERT_TRUE(metadata1->Equals(*schema1->metadata())); - auto schema2 = ::arrow::schema({f0, f1, f2}, metadata0->Copy()); - ASSERT_TRUE(metadata0->Equals(*schema2->metadata())); + auto schema3 = ::arrow::schema({f0, f1, f3}, metadata0->Copy()); + ASSERT_TRUE(metadata0->Equals(*schema0->metadata())); + ASSERT_TRUE(metadata1->Equals(*schema1->metadata())); + ASSERT_TRUE(metadata0->Equals(*schema2->metadata())); ASSERT_TRUE(schema0->Equals(*schema2)); ASSERT_FALSE(schema0->Equals(*schema1)); ASSERT_FALSE(schema2->Equals(*schema1)); + ASSERT_FALSE(schema2->Equals(*schema3)); // don't check metadata ASSERT_TRUE(schema0->Equals(*schema1, false)); ASSERT_TRUE(schema2->Equals(*schema1, false)); + ASSERT_TRUE(schema2->Equals(*schema3, false)); } TEST_F(TestSchema, TestAddMetadata) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index ee7fda7c8c8f4..a8372b96132bd 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -65,13 +65,15 @@ std::vector> Field::Flatten() const { return flattened; } -bool Field::Equals(const Field& other) const { +bool Field::Equals(const Field& other, bool check_metadata) const { if (this == &other) { return true; } if (this->name_ == other.name_ && this->nullable_ == other.nullable_ && this->type_->Equals(*other.type_.get())) { - if (this->HasMetadata() && other.HasMetadata()) { + if (!check_metadata) { + return true; + } else if (this->HasMetadata() && other.HasMetadata()) { return metadata_->Equals(*other.metadata_); } else if (!this->HasMetadata() && !other.HasMetadata()) { return true; @@ -82,8 +84,8 @@ bool Field::Equals(const Field& other) const { return false; } -bool Field::Equals(const std::shared_ptr& other) const { - return Equals(*other.get()); +bool Field::Equals(const std::shared_ptr& other, bool check_metadata) const { + return Equals(*other.get(), check_metadata); } std::string Field::ToString() const { @@ -333,7 +335,7 @@ bool Schema::Equals(const Schema& other, bool check_metadata) const { return false; } for (int i = 0; i < num_fields(); ++i) 
{ - if (!field(i)->Equals(*other.field(i).get())) { + if (!field(i)->Equals(*other.field(i).get(), check_metadata)) { return false; } } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index eb00f43caa172..0758ced80ad0c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -265,8 +265,8 @@ class ARROW_EXPORT Field { std::vector> Flatten() const; - bool Equals(const Field& other) const; - bool Equals(const std::shared_ptr& other) const; + bool Equals(const Field& other, bool check_metadata = true) const; + bool Equals(const std::shared_ptr& other, bool check_metadata = true) const; /// \brief Return a string representation ot the field std::string ToString() const; diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 5385c3c8c41d9..8549d61c3456f 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -334,6 +334,20 @@ def test_schema_equals(): assert not sch1.equals(sch3) +def test_schema_equals_propagates_check_metadata(): + # ARROW-4088 + schema1 = pa.schema([ + pa.field('foo', pa.int32()), + pa.field('bar', pa.string()) + ]) + schema2 = pa.schema([ + pa.field('foo', pa.int32()), + pa.field('bar', pa.string(), metadata={'a': 'alpha'}), + ]) + assert not schema1.equals(schema2) + assert schema1.equals(schema2, check_metadata=False) + + def test_schema_equality_operators(): fields = [ pa.field('foo', pa.int32()), From 0696eb591f4707377067b53ecdc9be1dbc4c6a34 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 27 Dec 2018 18:11:30 +0100 Subject: [PATCH 74/80] ARROW-3932: [Python] Include Benchmarks.md in Sphinx docs Author: Uwe L. Korn Closes #3249 from xhochy/ARROW-3932 and squashes the following commits: 8e969c1b Link to Sphinx documentation for benchmarks 06c3b8d0 ARROW-3932: Include Benchmarks.md in Sphinx docs --- docs/Benchmarks.md | 29 ----------------- docs/source/python/benchmarks.rst | 53 +++++++++++++++++++++++++++++++ docs/source/python/index.rst | 1 + python/README-benchmarks.md | 47 --------------------------- python/README.md | 3 ++ 5 files changed, 57 insertions(+), 76 deletions(-) delete mode 100644 docs/Benchmarks.md create mode 100644 docs/source/python/benchmarks.rst delete mode 100644 python/README-benchmarks.md diff --git a/docs/Benchmarks.md b/docs/Benchmarks.md deleted file mode 100644 index c84bf0dc1eb62..0000000000000 --- a/docs/Benchmarks.md +++ /dev/null @@ -1,29 +0,0 @@ - -## Benchmark Requirements - -The benchmarks are run using [asv][1] which is also their only requirement. - -## Running the benchmarks - -To run the benchmarks, call `asv run --python=same`. You cannot use the -plain `asv run` command at the moment as asv cannot handle python packages -in subdirectories of a repository. - -[1]: https://asv.readthedocs.org/ diff --git a/docs/source/python/benchmarks.rst b/docs/source/python/benchmarks.rst new file mode 100644 index 0000000000000..6c3144ae58637 --- /dev/null +++ b/docs/source/python/benchmarks.rst @@ -0,0 +1,53 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. 
software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Benchmarks +========== + +The ``pyarrow`` package comes with a suite of benchmarks meant to +run with `asv`_. You'll need to install the ``asv`` package first +(``pip install asv`` or ``conda install -c conda-forge asv``). + +The benchmarks are run using `asv`_ which is also their only requirement. + +Running the benchmarks +---------------------- + +To run the benchmarks, call ``asv run --python=same``. You cannot use the +plain ``asv run`` command at the moment as asv cannot handle python packages +in subdirectories of a repository. + +Running with arbitrary revisions +-------------------------------- + +ASV allows to store results and generate graphs of the benchmarks over +the project's evolution. For this you have the latest development version of ASV: + +.. code:: + + pip install git+https://github.com/airspeed-velocity/asv + +Now you should be ready to run ``asv run`` or whatever other command +suits your needs. + +Compatibility +------------- + +We only expect the benchmarking setup to work with Python 3.6 or later, +on a Unix-like system. + +.. asv:: https://asv.readthedocs.org/ diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index cf691e37eaa25..fe04a73f32ef2 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -47,3 +47,4 @@ files into Arrow structures. api development getting_involved + benchmarks diff --git a/python/README-benchmarks.md b/python/README-benchmarks.md deleted file mode 100644 index 77901f3f020bb..0000000000000 --- a/python/README-benchmarks.md +++ /dev/null @@ -1,47 +0,0 @@ - - -# Benchmarks - -The `pyarrow` package comes with a suite of benchmarks meant to -run with [ASV](https://asv.readthedocs.io). You'll need to install -the `asv` package first (`pip install asv`). - -## Running with your local tree - -When developing, the simplest and fastest way to run the benchmark suite -against your local changes is to use the `asv dev` command. This will -use your current Python interpreter and environment. - -## Running with arbitrary revisions - -ASV allows to store results and generate graphs of the benchmarks over -the project's evolution. For this you have the latest development version of ASV: - -```shell -pip install git+https://github.com/airspeed-velocity/asv -``` - -Now you should be ready to run `asv run` or whatever other command -suits your needs. - -## Compatibility - -We only expect the benchmarking setup to work with Python 3.6 or later, -on a Unix-like system. diff --git a/python/README.md b/python/README.md index ce696939929f9..ce7bdde999eed 100644 --- a/python/README.md +++ b/python/README.md @@ -76,6 +76,8 @@ pytest pyarrow --help and look for the "custom options" section. +For running the benchmarks, see the [Sphinx documentation][5]. 
+ ### Building the documentation ```bash @@ -86,3 +88,4 @@ python setup.py build_sphinx -s ../docs/source [2]: https://github.com/apache/arrow/blob/master/docs/source/python/development.rst [3]: https://github.com/pandas-dev/pandas [4]: https://docs.pytest.org/en/latest/ +[5]: https://arrow.apache.org/docs/latest/python/benchmarks.html From 9b03947c4369cb1b4d82022df00629baf2b6eb00 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 27 Dec 2018 12:17:50 -0600 Subject: [PATCH 75/80] ARROW-3928: [Python] Deduplicate Python objects when converting binary, string, date, time types to object arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a `deduplicate_objects` option to all of the `to_pandas` methods. It works with string types, date types (when `date_as_object=True`), and time types. I also made it so that `ScalarMemoTable` can be used with `string_view`, for more efficient memoization in this case. I made the default for `deduplicate_objects` is True. When the ratio of unique strings to the length of the array is low, not only does this use drastically less memory, it is also faster. I will write some benchmarks to show where the "crossover point" is when the overhead of hashing makes things slower. Let's consider a simple case where we have 10,000,000 strings of length 10, but only 1000 unique values: ``` In [50]: import pandas.util.testing as tm In [51]: unique_values = [tm.rands(10) for i in range(1000)] In [52]: values = unique_values * 10000 In [53]: arr = pa.array(values) In [54]: timeit arr.to_pandas() 236 ms ± 1.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) In [55]: timeit arr.to_pandas(deduplicate_objects=False) 730 ms ± 12.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) ``` Almost 3 times faster in this case. The different in memory use is even more drastic ``` In [44]: unique_values = [tm.rands(10) for i in range(1000)] In [45]: values = unique_values * 10000 In [46]: arr = pa.array(values) In [49]: %memit result11 = arr.to_pandas() peak memory: 1505.89 MiB, increment: 76.27 MiB In [50]: %memit result12 = arr.to_pandas(deduplicate_objects=False) peak memory: 2202.29 MiB, increment: 696.11 MiB ``` As you can see, this is a huge problem. If our bug reports about Parquet memory use problems are any indication, users have been suffering from this issue for a long time. When the strings are mostly unique, then things are slower as expected, the peak memory use is higher because of the hash table ``` In [17]: unique_values = [tm.rands(10) for i in range(500000)] In [18]: values = unique_values * 2 In [19]: arr = pa.array(values) In [20]: timeit result = arr.to_pandas() 177 ms ± 574 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [21]: timeit result = arr.to_pandas(deduplicate_objects=False) 70.1 ms ± 783 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [42]: %memit result8 = arr.to_pandas() peak memory: 644.39 MiB, increment: 92.23 MiB In [43]: %memit result9 = arr.to_pandas(deduplicate_objects=False) peak memory: 610.85 MiB, increment: 58.41 MiB ``` In real world work, many duplicated strings is the most common use case. Given the massive memory use and moderate performance improvements, it makes sense to have this enabled by default. 
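As a quick illustration of how the new option is exercised from Python (the values below are made up for the example; only `to_pandas` and the `deduplicate_objects` keyword come from this patch):

```python
import pyarrow as pa

# A string column with many repeated values
arr = pa.array(["spam", "eggs", "spam"] * 100000, type=pa.string())

# Default behavior: equal strings are converted once and the resulting
# Python objects are shared across the output array
deduped = arr.to_pandas()

# Opt out to get one Python object per element (the previous behavior)
plain = arr.to_pandas(deduplicate_objects=False)

# The values are identical either way; only memory use and object
# identity differ
assert deduped.tolist() == plain.tolist()
```
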
Author: Wes McKinney Closes #3257 from wesm/ARROW-3928 and squashes the following commits: d9a88700 Prettier output a00b51c7 Add benchmarks for object deduplication ca88b963 Add Python unit tests, deduplicate for date and time types also when converting to Python objects 7a7873b8 First working iteration of string deduplication when calling to_pandas --- cpp/src/arrow/python/arrow_to_pandas.cc | 286 +++++++++++--------- cpp/src/arrow/python/arrow_to_pandas.h | 41 +-- cpp/src/arrow/type.cc | 7 +- cpp/src/arrow/type.h | 9 +- cpp/src/arrow/type_traits.h | 5 + cpp/src/arrow/util/hashing.h | 21 +- python/benchmarks/convert_pandas.py | 22 ++ python/pyarrow/array.pxi | 91 ++++--- python/pyarrow/compat.py | 6 +- python/pyarrow/includes/libarrow.pxd | 9 +- python/pyarrow/lib.pxd | 14 +- python/pyarrow/pandas_compat.py | 5 +- python/pyarrow/table.pxi | 160 +---------- python/pyarrow/tests/test_convert_pandas.py | 85 ++++++ 14 files changed, 409 insertions(+), 352 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 29d64355bdaed..b532bfb705acd 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -36,9 +36,11 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/hashing.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/parallel.h" +#include "arrow/util/string_view.h" #include "arrow/visitor_inline.h" #include "arrow/compute/api.h" @@ -75,21 +77,21 @@ template struct WrapBytes {}; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyUnicode_FromStringAndSize(data, length); } }; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyBytes_FromStringAndSize(data, length); } }; template <> -struct WrapBytes { +struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { return PyBytes_FromStringAndSize(data, length); } @@ -216,7 +218,7 @@ class PandasBlock { CATEGORICAL }; - PandasBlock(PandasOptions options, int64_t num_rows, int num_columns) + PandasBlock(const PandasOptions& options, int64_t num_rows, int num_columns) : num_rows_(num_rows), num_columns_(num_columns), options_(options) {} virtual ~PandasBlock() {} @@ -301,8 +303,8 @@ inline const T* GetPrimitiveValues(const Array& arr) { } template -inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& data, - double* out_values) { +inline void ConvertIntegerWithNulls(const PandasOptions& options, + const ChunkedArray& data, double* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); const T* in_values = GetPrimitiveValues(arr); @@ -315,8 +317,8 @@ inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& d } template -inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedArray& data, - T* out_values) { +inline void ConvertIntegerNoNullsSameType(const PandasOptions& options, + const ChunkedArray& data, T* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); if (arr.length() > 0) { @@ -328,8 +330,8 @@ inline void ConvertIntegerNoNullsSameType(PandasOptions options, const ChunkedAr } template -inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& data, - OutType* out_values) { +inline void 
ConvertIntegerNoNullsCast(const PandasOptions& options, + const ChunkedArray& data, OutType* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); const InType* in_values = GetPrimitiveValues(arr); @@ -339,8 +341,8 @@ inline void ConvertIntegerNoNullsCast(PandasOptions options, const ChunkedArray& } } -static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { +static Status ConvertBooleanWithNulls(const PandasOptions& options, + const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); @@ -363,7 +365,7 @@ static Status ConvertBooleanWithNulls(PandasOptions options, const ChunkedArray& return Status::OK(); } -static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& data, +static void ConvertBooleanNoNulls(const PandasOptions& options, const ChunkedArray& data, uint8_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); @@ -373,57 +375,106 @@ static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& dat } } -template -static Status ConvertIntegerObjects(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { - PyAcquireGIL lock; - constexpr bool is_signed = std::is_signed::value; - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = *data.chunk(c); - const auto* in_values = GetPrimitiveValues(arr); - - for (int i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - *out_values++ = is_signed ? PyLong_FromLongLong(in_values[i]) - : PyLong_FromUnsignedLongLong(in_values[i]); - RETURN_IF_PYERROR(); - } +// Generic Array -> PyObject** converter that handles object deduplication, if +// requested +template +inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func, + PyObject** out_values) { + const bool has_nulls = arr.null_count() > 0; + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + RETURN_NOT_OK(write_func(arr.GetView(i), out_values)); } + ++out_values; } return Status::OK(); } -template -inline Status ConvertBinaryLike(PandasOptions options, const ChunkedArray& data, - PyObject** out_values) { +template +struct MemoizationTraits { + using Scalar = typename T::c_type; +}; + +template +struct MemoizationTraits> { + // For binary, we memoize string_view as a scalar value to avoid having to + // unnecessarily copy the memory into the memo table data structure + using Scalar = util::string_view; +}; + +template +inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, + WrapFunction&& wrap_func, PyObject** out_values) { using ArrayType = typename TypeTraits::ArrayType; + using Scalar = typename MemoizationTraits::Scalar; + PyAcquireGIL lock; + ::arrow::internal::ScalarMemoTable memo_table; + std::vector unique_values; + int32_t memo_size = 0; + + auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { + int32_t memo_index = memo_table.GetOrInsert(value); + if (memo_index == memo_size) { + // New entry + RETURN_NOT_OK(wrap_func(value, out_values)); + unique_values.push_back(*out_values); + ++memo_size; + } else { + // Duplicate entry + Py_INCREF(unique_values[memo_index]); + *out_values = unique_values[memo_index]; + } + return Status::OK(); + }; + + auto 
WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { + return wrap_func(value, out_values); + }; + for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); - - const bool has_nulls = data.null_count() > 0; - for (int64_t i = 0; i < arr.length(); ++i) { - if (has_nulls && arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values = Py_None; - } else { - auto view = arr.GetView(i); - *out_values = WrapBytes::Wrap(view.data(), view.length()); - if (*out_values == nullptr) { - PyErr_Clear(); - return Status::UnknownError("Wrapping ", view, " failed"); - } - } - ++out_values; + if (options.deduplicate_objects) { + RETURN_NOT_OK(WriteArrayObjects(arr, WrapMemoized, out_values)); + } else { + RETURN_NOT_OK(WriteArrayObjects(arr, WrapUnmemoized, out_values)); } + out_values += arr.length(); } return Status::OK(); } -inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertIntegerObjects(const PandasOptions& options, + const ChunkedArray& data, PyObject** out_values) { + using T = typename Type::c_type; + auto WrapValue = [](T value, PyObject** out) { + *out = std::is_signed::value ? PyLong_FromLongLong(value) + : PyLong_FromUnsignedLongLong(value); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); +} + +template +inline Status ConvertBinaryLike(const PandasOptions& options, const ChunkedArray& data, + PyObject** out_values) { + auto WrapValue = [](const util::string_view& view, PyObject** out) { + *out = WrapBytes::Wrap(view.data(), view.length()); + if (*out == nullptr) { + PyErr_Clear(); + return Status::UnknownError("Wrapping ", view, " failed"); + } + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); +} + +inline Status ConvertNulls(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; for (int c = 0; c < data.num_chunks(); c++) { @@ -439,7 +490,7 @@ inline Status ConvertNulls(PandasOptions options, const ChunkedArray& data, return Status::OK(); } -inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, +inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; if (data.num_chunks() <= 0) { @@ -503,7 +554,8 @@ inline Status ConvertStruct(PandasOptions options, const ChunkedArray& data, } template -inline Status ConvertListsLike(PandasOptions options, const std::shared_ptr& col, +inline Status ConvertListsLike(const PandasOptions& options, + const std::shared_ptr& col, PyObject** out_values) { const ChunkedArray& data = *col->data().get(); const auto& list_type = checked_cast(*col->type()); @@ -604,69 +656,40 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) } } -template -static Status ConvertDates(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertDates(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - - PyAcquireGIL lock; - OwnedRef date_ref; - - PyDateTime_IMPORT; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - auto type = std::dynamic_pointer_cast(arr.type()); - DCHECK(type); - - const DateUnit unit = type->unit(); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - 
RETURN_NOT_OK(PyDate_from_int(arr.Value(i), unit, out_values++)); - RETURN_IF_PYERROR(); - } - } + { + PyAcquireGIL lock; + PyDateTime_IMPORT; } - - return Status::OK(); + auto WrapValue = [](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(PyDate_from_int(value, Type::UNIT, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); } -template -static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, +template +static Status ConvertTimes(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - using ArrayType = typename TypeTraits::ArrayType; - - PyAcquireGIL lock; - OwnedRef time_ref; - - PyDateTime_IMPORT; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - auto type = std::dynamic_pointer_cast(arr.type()); - DCHECK(type); - - const TimeUnit::type unit = type->unit(); - - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - RETURN_NOT_OK(PyTime_from_int(arr.Value(i), unit, out_values++)); - RETURN_IF_PYERROR(); - } - } + { + PyAcquireGIL lock; + PyDateTime_IMPORT; } - return Status::OK(); + const TimeUnit::type unit = checked_cast(*data.type()).unit(); + + auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(PyTime_from_int(value, unit, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); } -static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, +static Status ConvertDecimals(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; OwnedRef decimal; @@ -715,21 +738,21 @@ class ObjectBlock : public PandasBlock { if (type == Type::BOOL) { RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer)); } else if (type == Type::UINT8) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT8) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT16) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT16) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT32) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT32) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::UINT64) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::INT64) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); } else if (type == Type::BINARY) { RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); } else if (type == Type::STRING) { @@ -1009,7 +1032,8 @@ class DatetimeBlock : public PandasBlock { class DatetimeTZBlock : public 
DatetimeBlock { public: - DatetimeTZBlock(PandasOptions options, const std::string& timezone, int64_t num_rows) + DatetimeTZBlock(const PandasOptions& options, const std::string& timezone, + int64_t num_rows) : DatetimeBlock(options, num_rows, 1), timezone_(timezone) {} // Like Categorical, the internal ndarray is 1-dimensional @@ -1038,7 +1062,8 @@ class DatetimeTZBlock : public DatetimeBlock { class CategoricalBlock : public PandasBlock { public: - explicit CategoricalBlock(PandasOptions options, MemoryPool* pool, int64_t num_rows) + explicit CategoricalBlock(const PandasOptions& options, MemoryPool* pool, + int64_t num_rows) : PandasBlock(options, num_rows, 1), pool_(pool), needs_copy_(false) {} Status Allocate() override { @@ -1235,7 +1260,7 @@ class CategoricalBlock : public PandasBlock { bool needs_copy_; }; -Status MakeBlock(PandasOptions options, PandasBlock::type type, int64_t num_rows, +Status MakeBlock(const PandasOptions& options, PandasBlock::type type, int64_t num_rows, int num_columns, std::shared_ptr* block) { #define BLOCK_CASE(NAME, TYPE) \ case PandasBlock::NAME: \ @@ -1518,7 +1543,7 @@ class DataFrameBlockCreator { class ArrowDeserializer { public: - ArrowDeserializer(PandasOptions options, const std::shared_ptr& col, + ArrowDeserializer(const PandasOptions& options, const std::shared_ptr& col, PyObject* py_ref) : col_(col), data_(*col->data().get()), options_(options), py_ref_(py_ref) {} @@ -1532,7 +1557,7 @@ class ArrowDeserializer { } template - Status ConvertValuesZeroCopy(PandasOptions options, int npy_type, + Status ConvertValuesZeroCopy(const PandasOptions& options, int npy_type, const std::shared_ptr& arr) { typedef typename internal::arrow_traits::T T; @@ -1738,9 +1763,7 @@ class ArrowDeserializer { if (data_.null_count() > 0) { if (options_.integer_object_nulls) { - using c_type = typename Type::c_type; - - return VisitObjects(ConvertIntegerObjects); + return VisitObjects(ConvertIntegerObjects); } else { RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); @@ -1878,15 +1901,16 @@ class ArrowDeserializer { PyObject* result_; }; -Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, - PyObject* py_ref, PyObject** out) { +Status ConvertArrayToPandas(const PandasOptions& options, + const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out) { static std::string dummy_name = "dummy"; auto field = std::make_shared(dummy_name, arr->type()); auto col = std::make_shared(field, arr); return ConvertColumnToPandas(options, col, py_ref, out); } -Status ConvertChunkedArrayToPandas(PandasOptions options, +Status ConvertChunkedArrayToPandas(const PandasOptions& options, const std::shared_ptr& ca, PyObject* py_ref, PyObject** out) { static std::string dummy_name = "dummy"; @@ -1895,19 +1919,21 @@ Status ConvertChunkedArrayToPandas(PandasOptions options, return ConvertColumnToPandas(options, col, py_ref, out); } -Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, - PyObject* py_ref, PyObject** out) { +Status ConvertColumnToPandas(const PandasOptions& options, + const std::shared_ptr& col, PyObject* py_ref, + PyObject** out) { ArrowDeserializer converter(options, col, py_ref); return converter.Convert(out); } -Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr
& table, - MemoryPool* pool, PyObject** out) { +Status ConvertTableToPandas(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool, + PyObject** out) { return ConvertTableToPandas(options, std::unordered_set(), table, pool, out); } -Status ConvertTableToPandas(PandasOptions options, +Status ConvertTableToPandas(const PandasOptions& options, const std::unordered_set& categorical_columns, const std::shared_ptr
& table, MemoryPool* pool, PyObject** out) { diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h index 753bf4823566b..20bad40971020 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.h +++ b/cpp/src/arrow/python/arrow_to_pandas.h @@ -43,32 +43,32 @@ namespace py { struct PandasOptions { /// If true, we will convert all string columns to categoricals - bool strings_to_categorical; - bool zero_copy_only; - bool integer_object_nulls; - bool date_as_object; - bool use_threads; - - PandasOptions() - : strings_to_categorical(false), - zero_copy_only(false), - integer_object_nulls(false), - date_as_object(false), - use_threads(false) {} + bool strings_to_categorical = false; + bool zero_copy_only = false; + bool integer_object_nulls = false; + bool date_as_object = false; + bool use_threads = false; + + /// \brief If true, do not create duplicate PyObject versions of equal + /// objects. This only applies to immutable objects like strings or datetime + /// objects + bool deduplicate_objects = false; }; ARROW_PYTHON_EXPORT -Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr& arr, - PyObject* py_ref, PyObject** out); +Status ConvertArrayToPandas(const PandasOptions& options, + const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out); ARROW_PYTHON_EXPORT -Status ConvertChunkedArrayToPandas(PandasOptions options, +Status ConvertChunkedArrayToPandas(const PandasOptions& options, const std::shared_ptr& col, PyObject* py_ref, PyObject** out); ARROW_PYTHON_EXPORT -Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& col, - PyObject* py_ref, PyObject** out); +Status ConvertColumnToPandas(const PandasOptions& options, + const std::shared_ptr& col, PyObject* py_ref, + PyObject** out); // Convert a whole table as efficiently as possible to a pandas.DataFrame. // @@ -77,15 +77,16 @@ Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr& table, - MemoryPool* pool, PyObject** out); +Status ConvertTableToPandas(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool, + PyObject** out); /// Convert a whole table as efficiently as possible to a pandas.DataFrame. /// /// Explicitly name columns that should be a categorical /// This option is only used on conversions that are applied to a table. ARROW_PYTHON_EXPORT -Status ConvertTableToPandas(PandasOptions options, +Status ConvertTableToPandas(const PandasOptions& options, const std::unordered_set& categorical_columns, const std::shared_ptr
& table, MemoryPool* pool, PyObject** out); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a8372b96132bd..cd57e2dfb2119 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -137,12 +137,11 @@ std::string FixedSizeBinaryType::ToString() const { // ---------------------------------------------------------------------- // Date types -DateType::DateType(Type::type type_id, DateUnit unit) - : FixedWidthType(type_id), unit_(unit) {} +DateType::DateType(Type::type type_id) : FixedWidthType(type_id) {} -Date32Type::Date32Type() : DateType(Type::DATE32, DateUnit::DAY) {} +Date32Type::Date32Type() : DateType(Type::DATE32) {} -Date64Type::Date64Type() : DateType(Type::DATE64, DateUnit::MILLI) {} +Date64Type::Date64Type() : DateType(Type::DATE64) {} std::string Date64Type::ToString() const { return std::string("date64[ms]"); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 0758ced80ad0c..6c3643c6344c8 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -600,17 +600,17 @@ enum class DateUnit : char { DAY = 0, MILLI = 1 }; /// \brief Base type class for date data class ARROW_EXPORT DateType : public FixedWidthType { public: - DateUnit unit() const { return unit_; } + virtual DateUnit unit() const = 0; protected: - DateType(Type::type type_id, DateUnit unit); - DateUnit unit_; + explicit DateType(Type::type type_id); }; /// Concrete type class for 32-bit date data (as number of days since UNIX epoch) class ARROW_EXPORT Date32Type : public DateType { public: static constexpr Type::type type_id = Type::DATE32; + static constexpr DateUnit UNIT = DateUnit::DAY; using c_type = int32_t; @@ -622,12 +622,14 @@ class ARROW_EXPORT Date32Type : public DateType { std::string ToString() const override; std::string name() const override { return "date32"; } + DateUnit unit() const override { return UNIT; } }; /// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch) class ARROW_EXPORT Date64Type : public DateType { public: static constexpr Type::type type_id = Type::DATE64; + static constexpr DateUnit UNIT = DateUnit::MILLI; using c_type = int64_t; @@ -639,6 +641,7 @@ class ARROW_EXPORT Date64Type : public DateType { std::string ToString() const override; std::string name() const override { return "date64"; } + DateUnit unit() const override { return UNIT; } }; struct TimeUnit { diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index da5cf25f5eed1..b89f52f2da661 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -371,6 +371,11 @@ template using enable_if_boolean = typename std::enable_if::value>::type; +template +using enable_if_binary_like = + typename std::enable_if::value || + std::is_base_of::value>::type; + template using enable_if_fixed_size_binary = typename std::enable_if::value>::type; diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 76724b2a30035..3dde0beeb194e 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -102,6 +102,18 @@ struct ScalarHelper +struct ScalarHelper< + Scalar, AlgNum, + typename std::enable_if::value>::type> + : public ScalarHelperBase { + // ScalarHelper specialization for util::string_view + + static hash_t ComputeHash(const util::string_view& value) { + return ComputeStringHash(value.data(), static_cast(value.size())); + } +}; + template struct ScalarHelper::value>::type> @@ -332,7 +344,7 @@ class ScalarMemoTable { explicit ScalarMemoTable(int64_t entries = 0) : 
hash_table_(static_cast(entries)) {} - int32_t Get(const Scalar value) const { + int32_t Get(const Scalar& value) const { auto cmp_func = [value](const Payload* payload) -> bool { return ScalarHelper::CompareScalars(payload->value, value); }; @@ -346,7 +358,7 @@ class ScalarMemoTable { } template - int32_t GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found) { + int32_t GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found) { auto cmp_func = [value](const Payload* payload) -> bool { return ScalarHelper::CompareScalars(value, payload->value); }; @@ -364,7 +376,7 @@ class ScalarMemoTable { return memo_index; } - int32_t GetOrInsert(const Scalar value) { + int32_t GetOrInsert(const Scalar& value) { return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {}); } @@ -389,6 +401,7 @@ class ScalarMemoTable { Scalar value; int32_t memo_index; }; + using HashTableType = HashTableTemplateType; using HashTableEntry = typename HashTableType::Entry; HashTableType hash_table_; @@ -621,9 +634,11 @@ class BinaryMemoTable { struct Payload { int32_t memo_index; }; + using HashTableType = HashTable; using HashTableEntry = typename HashTable::Entry; HashTableType hash_table_; + std::vector offsets_; std::string values_; diff --git a/python/benchmarks/convert_pandas.py b/python/benchmarks/convert_pandas.py index 244b3dcc84713..bb8d7102ea783 100644 --- a/python/benchmarks/convert_pandas.py +++ b/python/benchmarks/convert_pandas.py @@ -17,6 +17,8 @@ import numpy as np import pandas as pd +import pandas.util.testing as tm + import pyarrow as pa @@ -50,6 +52,26 @@ def time_to_series(self, n, dtype): self.arrow_data.to_pandas() +class ToPandasStrings(object): + + param_names = ('uniqueness', 'total') + params = ((0.001, 0.01, 0.1, 0.5), (1000000,)) + string_length = 25 + + def setup(self, uniqueness, total): + nunique = int(total * uniqueness) + unique_values = [tm.rands(self.string_length) for i in range(nunique)] + values = unique_values * (total // nunique) + self.arr = pa.array(values, type=pa.string()) + self.table = pa.Table.from_arrays([self.arr], ['f0']) + + def time_to_pandas_dedup(self, *args): + self.arr.to_pandas() + + def time_to_pandas_no_dedup(self, *args): + self.arr.to_pandas(deduplicate_objects=False) + + class ZeroCopyPandasRead(object): def setup(self): diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b86872f7ea98d..ef95efe71b33c 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -339,7 +339,61 @@ def _restore_array(data): return pyarrow_wrap_array(MakeArray(ad)) -cdef class Array: +cdef class _PandasConvertible: + + def to_pandas(self, categories=None, bint strings_to_categorical=False, + bint zero_copy_only=False, bint integer_object_nulls=False, + bint date_as_object=False, + bint use_threads=True, + bint deduplicate_objects=True, + bint ignore_metadata=False): + """ + Convert to a pandas-compatible NumPy array or DataFrame, as appropriate + + Parameters + ---------- + strings_to_categorical : boolean, default False + Encode string (UTF8) and binary types to pandas.Categorical + categories: list, default empty + List of fields that should be returned as pandas.Categorical. 
Only + applies to table-like data structures + zero_copy_only : boolean, default False + Raise an ArrowException if this function call would require copying + the underlying data + integer_object_nulls : boolean, default False + Cast integers with nulls to objects + date_as_object : boolean, default False + Cast dates to objects + use_threads: boolean, default True + Whether to parallelize the conversion using multiple threads + deduplicate_objects : boolean, default False + Do not create multiple copies Python objects when created, to save + on memory use. Conversion will be slower + ignore_metadata : boolean, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present + + Returns + ------- + NumPy array or DataFrame depending on type of object + """ + cdef: + PyObject* out + PandasOptions options + + options = PandasOptions( + strings_to_categorical=strings_to_categorical, + zero_copy_only=zero_copy_only, + integer_object_nulls=integer_object_nulls, + date_as_object=date_as_object, + use_threads=use_threads, + deduplicate_objects=deduplicate_objects) + + return self._to_pandas(options, categories=categories, + ignore_metadata=ignore_metadata) + + +cdef class Array(_PandasConvertible): def __init__(self): raise TypeError("Do not call {}'s constructor directly, use one of " @@ -602,42 +656,13 @@ cdef class Array: return pyarrow_wrap_array(result) - def to_pandas(self, bint strings_to_categorical=False, - bint zero_copy_only=False, bint integer_object_nulls=False, - bint date_as_object=False): - """ - Convert to a NumPy array object suitable for use in pandas. - - Parameters - ---------- - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - - See also - -------- - Column.to_pandas - Table.to_pandas - RecordBatch.to_pandas - """ + def _to_pandas(self, options, **kwargs): cdef: PyObject* out - PandasOptions options + PandasOptions c_options = options - options = PandasOptions( - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, - use_threads=False) with nogil: - check_status(ConvertArrayToPandas(options, self.sp_array, + check_status(ConvertArrayToPandas(c_options, self.sp_array, self, &out)) return wrap_array_output(out) diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index 068d5607de813..ee924ed388ff1 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -192,11 +192,15 @@ def _iterate_python_module_paths(package_name): for finder in sys.meta_path: try: spec = finder.find_spec(absolute_name, None) - except AttributeError: + except (AttributeError, TypeError): # On Travis (Python 3.5) the above produced: # AttributeError: 'VendorImporter' object has no # attribute 'find_spec' + # + # ARROW-4117: When running "asv dev", TypeError is raised + # due to the meta-importer spec = None + if spec is not None: break diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 7ce03bf6eb80c..cc77ff432967f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1064,20 +1064,20 @@ cdef extern from 
"arrow/python/api.h" namespace "arrow::py" nogil: CStatus TensorToNdarray(const shared_ptr[CTensor]& tensor, object base, PyObject** out) - CStatus ConvertArrayToPandas(PandasOptions options, + CStatus ConvertArrayToPandas(const PandasOptions& options, const shared_ptr[CArray]& arr, object py_ref, PyObject** out) - CStatus ConvertChunkedArrayToPandas(PandasOptions options, + CStatus ConvertChunkedArrayToPandas(const PandasOptions& options, const shared_ptr[CChunkedArray]& arr, object py_ref, PyObject** out) - CStatus ConvertColumnToPandas(PandasOptions options, + CStatus ConvertColumnToPandas(const PandasOptions& options, const shared_ptr[CColumn]& arr, object py_ref, PyObject** out) CStatus ConvertTableToPandas( - PandasOptions options, + const PandasOptions& options, const unordered_set[c_string]& categorical_columns, const shared_ptr[CTable]& table, CMemoryPool* pool, @@ -1110,6 +1110,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: c_bool integer_object_nulls c_bool date_as_object c_bool use_threads + c_bool deduplicate_objects cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil: diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index d829d6a0c50ad..8cd8f401a2749 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -179,7 +179,11 @@ cdef class FixedSizeBinaryValue(ArrayValue): pass -cdef class Array: +cdef class _PandasConvertible: + pass + + +cdef class Array(_PandasConvertible): cdef: shared_ptr[CArray] sp_array CArray* ap @@ -306,7 +310,7 @@ cdef object box_scalar(DataType type, int64_t index) -cdef class ChunkedArray: +cdef class ChunkedArray(_PandasConvertible): cdef: shared_ptr[CChunkedArray] sp_chunked_array CChunkedArray* chunked_array @@ -315,7 +319,7 @@ cdef class ChunkedArray: cdef getitem(self, int64_t i) -cdef class Column: +cdef class Column(_PandasConvertible): cdef: shared_ptr[CColumn] sp_column CColumn* column @@ -323,7 +327,7 @@ cdef class Column: cdef void init(self, const shared_ptr[CColumn]& column) -cdef class Table: +cdef class Table(_PandasConvertible): cdef: shared_ptr[CTable] sp_table CTable* table @@ -331,7 +335,7 @@ cdef class Table: cdef void init(self, const shared_ptr[CTable]& table) -cdef class RecordBatch: +cdef class RecordBatch(_PandasConvertible): cdef: shared_ptr[CRecordBatch] sp_batch CRecordBatch* batch diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 6acca0c35cf40..a5d8621590f13 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -548,7 +548,7 @@ def _make_datetimetz(tz): # Converting pyarrow.Table efficiently to pandas.DataFrame -def table_to_blockmanager(options, table, memory_pool, categories=None, +def table_to_blockmanager(options, table, categories=None, ignore_metadata=False): from pyarrow.compat import DatetimeTZDtype @@ -624,7 +624,8 @@ def table_to_blockmanager(options, table, memory_pool, categories=None, block_table.schema.get_field_index(raw_name) ) - blocks = _table_to_blocks(options, block_table, memory_pool, categories) + blocks = _table_to_blocks(options, block_table, pa.default_memory_pool(), + categories) # Construct the row index if len(index_arrays) > 1: diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 29a784d60f5a8..59680ed87aa38 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -28,7 +28,7 @@ else: import pyarrow.pandas_compat as pdcompat -cdef class ChunkedArray: +cdef class ChunkedArray(_PandasConvertible): """ Array backed via one or more memory 
chunks. @@ -145,43 +145,14 @@ cdef class ChunkedArray: return result - def to_pandas(self, bint strings_to_categorical=False, - bint zero_copy_only=False, bint integer_object_nulls=False, - bint date_as_object=False): - """ - Convert the arrow::ChunkedArray to an array object suitable for use - in pandas - - Parameters - ---------- - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - - See also - -------- - Column.to_pandas - """ + def _to_pandas(self, options, **kwargs): cdef: PyObject* out - PandasOptions options - - options = PandasOptions( - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, - use_threads=False) + PandasOptions c_options = options with nogil: check_status(libarrow.ConvertChunkedArrayToPandas( - options, + c_options, self.sp_chunked_array, self, &out)) @@ -385,7 +356,7 @@ def column(object field_or_name, arr): return pyarrow_wrap_column(sp_column) -cdef class Column: +cdef class Column(_PandasConvertible): """ Named vector of elements of equal type. @@ -497,33 +468,8 @@ cdef class Column: return [pyarrow_wrap_column(col) for col in flattened] - def to_pandas(self, bint strings_to_categorical=False, - bint zero_copy_only=False, bint integer_object_nulls=False, - bint date_as_object=False): - """ - Convert the arrow::Column to a pandas.Series - - Parameters - ---------- - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - - Returns - ------- - pandas.Series - """ - values = self.data.to_pandas( - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - date_as_object=date_as_object, - integer_object_nulls=integer_object_nulls) + def _to_pandas(self, options, **kwargs): + values = self.data._to_pandas(options) result = pd.Series(values, name=self.name) if isinstance(self.type, TimestampType): @@ -685,7 +631,7 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): schema.reset(new CSchema(c_fields, c_meta)) -cdef class RecordBatch: +cdef class RecordBatch(_PandasConvertible): """ Batch of rows of columns of equal length @@ -887,46 +833,8 @@ cdef class RecordBatch: entries.append((name, column)) return OrderedDict(entries) - def to_pandas(self, MemoryPool memory_pool=None, categories=None, - bint strings_to_categorical=False, bint zero_copy_only=False, - bint integer_object_nulls=False, bint date_as_object=False, - bint use_threads=True, bint ignore_metadata=False): - """ - Convert the arrow::RecordBatch to a pandas DataFrame - - Parameters - ---------- - memory_pool: MemoryPool, optional - Specific memory pool to use to allocate casted columns - categories: list, default empty - List of columns that should be returned as pandas.Categorical - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to 
pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - use_threads: boolean, default True - Whether to parallelize the conversion using multiple threads - ignore_metadata : boolean, default False - If True, do not use the 'pandas' metadata to reconstruct the - DataFrame index, if present - - Returns - ------- - pandas.DataFrame - """ - return Table.from_batches([self]).to_pandas( - memory_pool=memory_pool, categories=categories, - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, use_threads=use_threads, - ignore_metadata=ignore_metadata - ) + def _to_pandas(self, options, **kwargs): + return Table.from_batches([self])._to_pandas(options, **kwargs) @classmethod def from_pandas(cls, df, Schema schema=None, bint preserve_index=True, @@ -1031,7 +939,7 @@ def table_to_blocks(PandasOptions options, Table table, return PyObject_to_object(result_obj) -cdef class Table: +cdef class Table(_PandasConvertible): """ A collection of top-level named, equal length Arrow arrays. @@ -1386,50 +1294,8 @@ cdef class Table: return result - def to_pandas(self, MemoryPool memory_pool=None, categories=None, - bint strings_to_categorical=False, bint zero_copy_only=False, - bint integer_object_nulls=False, bint date_as_object=False, - bint use_threads=True, bint ignore_metadata=False): - """ - Convert the arrow::Table to a pandas DataFrame - - Parameters - ---------- - memory_pool: MemoryPool, optional - Specific memory pool to use to allocate casted columns - categories: list, default empty - List of columns that should be returned as pandas.Categorical - strings_to_categorical : boolean, default False - Encode string (UTF8) and binary types to pandas.Categorical - zero_copy_only : boolean, default False - Raise an ArrowException if this function call would require copying - the underlying data - integer_object_nulls : boolean, default False - Cast integers with nulls to objects - date_as_object : boolean, default False - Cast dates to objects - use_threads: boolean, default True - Whether to parallelize the conversion using multiple threads - ignore_metadata : boolean, default False - If True, do not use the 'pandas' metadata to reconstruct the - DataFrame index, if present - - Returns - ------- - pandas.DataFrame - """ - cdef: - PandasOptions options - - options = PandasOptions( - strings_to_categorical=strings_to_categorical, - zero_copy_only=zero_copy_only, - integer_object_nulls=integer_object_nulls, - date_as_object=date_as_object, - use_threads=use_threads) - - mgr = pdcompat.table_to_blockmanager(options, self, memory_pool, - categories, + def _to_pandas(self, options, categories=None, ignore_metadata=False): + mgr = pdcompat.table_to_blockmanager(options, self, categories, ignore_metadata=ignore_metadata) return pd.DataFrame(mgr) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 12214847f3e53..8d8b65b2240b8 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -2316,6 +2316,91 @@ def test_convert_unsupported_type_error_message(): pa.Table.from_pandas(df) +# ---------------------------------------------------------------------- +# Test 
object deduplication in to_pandas + + +def _generate_dedup_example(nunique, repeats): + unique_values = [tm.rands(10) for i in range(nunique)] + return unique_values * repeats + + +def _assert_nunique(obj, expected): + assert len({id(x) for x in obj}) == expected + + +def test_to_pandas_deduplicate_strings_array_types(): + nunique = 100 + repeats = 10 + values = _generate_dedup_example(nunique, repeats) + + for arr in [pa.array(values, type=pa.binary()), + pa.array(values, type=pa.utf8()), + pa.chunked_array([values, values]), + pa.column('foo', [values, values])]: + _assert_nunique(arr.to_pandas(), nunique) + _assert_nunique(arr.to_pandas(deduplicate_objects=False), len(arr)) + + +def test_to_pandas_deduplicate_strings_table_types(): + nunique = 100 + repeats = 10 + values = _generate_dedup_example(nunique, repeats) + + arr = pa.array(values) + rb = pa.RecordBatch.from_arrays([arr], ['foo']) + tbl = pa.Table.from_batches([rb]) + + for obj in [rb, tbl]: + _assert_nunique(obj.to_pandas()['foo'], nunique) + _assert_nunique(obj.to_pandas(deduplicate_objects=False)['foo'], + len(obj)) + + +def test_to_pandas_deduplicate_integers_as_objects(): + nunique = 100 + repeats = 10 + + # Python automatically interns smaller integers + unique_values = list(np.random.randint(10000000, 1000000000, size=nunique)) + unique_values[nunique // 2] = None + + arr = pa.array(unique_values * repeats) + + _assert_nunique(arr.to_pandas(integer_object_nulls=True), nunique) + _assert_nunique(arr.to_pandas(integer_object_nulls=True, + deduplicate_objects=False), + # Account for None + (nunique - 1) * repeats + 1) + + +def test_to_pandas_deduplicate_date_time(): + nunique = 100 + repeats = 10 + + unique_values = list(range(nunique)) + + cases = [ + # raw type, array type, to_pandas options + ('int32', 'date32', {'date_as_object': True}), + ('int64', 'date64', {'date_as_object': True}), + ('int32', 'time32[ms]', {}), + ('int64', 'time64[us]', {}) + ] + + for raw_type, array_type, pandas_options in cases: + raw_arr = pa.array(unique_values * repeats, type=raw_type) + casted_arr = raw_arr.cast(array_type) + + _assert_nunique(casted_arr.to_pandas(**pandas_options), + nunique) + _assert_nunique(casted_arr.to_pandas(deduplicate_objects=False, + **pandas_options), + len(casted_arr)) + + +# --------------------------------------------------------------------- + def test_table_from_pandas_keeps_column_order_of_dataframe(): df1 = pd.DataFrame(OrderedDict([ ('partition', [0, 0, 1, 1]), From 83a4e979271535b74de9870289cf99d02f6eb16b Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Thu, 27 Dec 2018 12:36:54 -0600 Subject: [PATCH 76/80] ARROW-4080: [Rust] Improving lengthy build times in Appveyor This tries to cut the build times by skipping: 1. build for stable (it doesn't seem too useful). 1. benchmarks in travis 2. build for dev profiles in windows CI - now we only build with release profiles. 
Author: Chao Sun Closes #3231 from sunchao/ARROW-4080 and squashes the following commits: f5956404 Disable some flaky doctests 60f8b7d2 ARROW-4080: Improving lengthy build times in Appveyor --- .travis.yml | 1 - ci/rust-build-main.bat | 20 -------------------- ci/travis_script_rust.sh | 1 - rust/src/parquet/column/mod.rs | 4 ++-- rust/src/parquet/file/mod.rs | 8 ++++---- 5 files changed, 6 insertions(+), 28 deletions(-) diff --git a/.travis.yml b/.travis.yml index 99ff24aaacc97..b37194f8f2414 100644 --- a/.travis.yml +++ b/.travis.yml @@ -282,7 +282,6 @@ matrix: - if [ $ARROW_CI_RUST_AFFECTED != "1" ]; then exit; fi - $TRAVIS_BUILD_DIR/ci/travis_install_cargo.sh script: - - RUSTUP_TOOLCHAIN=stable $TRAVIS_BUILD_DIR/ci/travis_script_rust.sh || true - RUSTUP_TOOLCHAIN=nightly $TRAVIS_BUILD_DIR/ci/travis_script_rust.sh after_success: - pushd ${TRAVIS_BUILD_DIR}/rust diff --git a/ci/rust-build-main.bat b/ci/rust-build-main.bat index e338f7e172e6e..6ef451204d45a 100644 --- a/ci/rust-build-main.bat +++ b/ci/rust-build-main.bat @@ -22,33 +22,13 @@ git submodule update --init || exit /B set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data pushd rust -@echo =================================== -@echo Build with stable toolchain -@echo =================================== - -rustup default stable -rustup show -cargo build --target %TARGET% -cargo build --target %TARGET% --release -@echo Test (debug) -@echo ------------ -cargo test --target %TARGET% -@echo -@echo Test (release) -@echo -------------- -cargo test --target %TARGET% --release - @echo =================================== @echo Build with nightly toolchain @echo =================================== rustup default nightly rustup show -cargo build --target %TARGET% || exit /B cargo build --target %TARGET% --release || exit /B -@echo Test (debug) -@echo ------------ -cargo test --target %TARGET% || exit /B @echo @echo Test (release) @echo -------------- diff --git a/ci/travis_script_rust.sh b/ci/travis_script_rust.sh index 4b09bc22e4c20..af61dd39446ff 100755 --- a/ci/travis_script_rust.sh +++ b/ci/travis_script_rust.sh @@ -36,7 +36,6 @@ cargo rustc -- -D warnings cargo build cargo test -cargo bench cargo run --example builders cargo run --example dynamic_types cargo run --example read_csv diff --git a/rust/src/parquet/column/mod.rs b/rust/src/parquet/column/mod.rs index 09c4bde51f771..4ced32e28cbb9 100644 --- a/rust/src/parquet/column/mod.rs +++ b/rust/src/parquet/column/mod.rs @@ -35,7 +35,7 @@ //! The example uses column writer and reader APIs to write raw values, definition and //! repetition levels and read them to verify write/read correctness. //! -//! ```rust +//! ```rust,no_run //! use std::{fs, path::Path, rc::Rc}; //! //! use arrow::parquet::{ @@ -48,7 +48,7 @@ //! schema::parser::parse_message_type, //! }; //! -//! let path = Path::new("target/debug/examples/column_sample.parquet"); +//! let path = Path::new("/path/to/column_sample.parquet"); //! //! // Writing data using column writer API. //! diff --git a/rust/src/parquet/file/mod.rs b/rust/src/parquet/file/mod.rs index ebaebbad0bb6f..38fe8fa9b15b1 100644 --- a/rust/src/parquet/file/mod.rs +++ b/rust/src/parquet/file/mod.rs @@ -26,7 +26,7 @@ //! //! # Example of writing a new file //! -//! ```rust +//! ```rust,no_run //! use std::{fs, path::Path, rc::Rc}; //! //! use arrow::parquet::{ @@ -37,7 +37,7 @@ //! schema::parser::parse_message_type, //! }; //! -//! let path = Path::new("target/debug/examples/sample.parquet"); +//! 
let path = Path::new("/path/to/sample.parquet"); //! //! let message_type = " //! message schema { @@ -61,11 +61,11 @@ //! ``` //! # Example of reading an existing file //! -//! ```rust +//! ```rust,no_run //! use arrow::parquet::file::reader::{FileReader, SerializedFileReader}; //! use std::{fs::File, path::Path}; //! -//! let path = Path::new("target/debug/examples/sample.parquet"); +//! let path = Path::new("/path/to/sample.parquet"); //! if let Ok(file) = File::open(&path) { //! let file = File::open(&path).unwrap(); //! let reader = SerializedFileReader::new(file).unwrap(); From 0a631dbadb81a95c599ab68a2fd0801144d59f52 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Fri, 28 Dec 2018 00:18:31 -0600 Subject: [PATCH 77/80] ARROW-4113: [R] Fix version number Author: Kouhei Sutou Closes #3278 from kou/r-fix-package-version and squashes the following commits: 17fe7da6 Remove R from allow_failures 50377004 Fix version number --- .travis.yml | 1 - dev/release/00-prepare.sh | 41 ++++++++++++++++++++++++++------------- r/DESCRIPTION | 2 +- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/.travis.yml b/.travis.yml index b37194f8f2414..059daeef8fd14 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,6 @@ matrix: fast_finish: true allow_failures: - jdk: oraclejdk9 - - language: r include: - name: "Lint C++, Python, R" os: linux diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 141882e22566a..47ef760b86b9e 100755 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -22,7 +22,20 @@ set -e SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" update_versions() { - local version=$1 + local base_version=$1 + local next_version=$2 + local type=$3 + + case ${type} in + release) + version=${base_version} + r_version=${base_version} + ;; + snapshot) + version=${next_version}-SNAPSHOT + r_version=${base_version}.9000 + ;; + esac cd "${SOURCE_DIR}/../../cpp" sed -i.bak -r -e \ @@ -70,7 +83,7 @@ update_versions() { cd "${SOURCE_DIR}/../../r" sed -i.bak -r -e \ - "s/^Version: .+/Version: ${version}/" \ + "s/^Version: .+/Version: ${r_version}/" \ DESCRIPTION rm -f DESCRIPTION.bak git add DESCRIPTION @@ -95,8 +108,8 @@ update_versions() { if [ "$#" -eq 2 ]; then version=$1 - nextVersion=$2 - nextVersionSNAPSHOT=${nextVersion}-SNAPSHOT + next_version=$2 + next_version_snapshot=${next_version}-SNAPSHOT tag=apache-arrow-${version} echo "Updating changelog for $version" @@ -113,23 +126,23 @@ if [ "$#" -eq 2 ]; then git commit -m "[Release] Update .deb/.rpm changelogs for $version" cd - - echo "prepare release ${version} on tag ${tag} then reset to version ${nextVersionSNAPSHOT}" + echo "prepare release ${version} on tag ${tag} then reset to version ${next_version_snapshot}" - update_versions "${version}" + update_versions "${version}" "${next_version}" "release" git commit -m "[Release] Update versions for ${version}" cd "${SOURCE_DIR}/../../java" mvn release:clean - mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${nextVersionSNAPSHOT} + mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${next_version_snapshot} cd - - echo "Updating versions for ${nextVersionSNAPSHOT}" - update_versions "${nextVersionSNAPSHOT}" - git commit -m "[Release] Update versions for ${nextVersionSNAPSHOT}" + echo "Updating versions for ${next_version_snapshot}" + update_versions "${version}" "${next_version}" "snapshot" + git commit -m "[Release] Update versions 
for ${next_version_snapshot}" - echo "Updating .deb package names for ${nextVersion}" + echo "Updating .deb package names for ${next_version}" deb_lib_suffix=$(echo $version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') - next_deb_lib_suffix=$(echo $nextVersion | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') + next_deb_lib_suffix=$(echo $next_version | sed -r -e 's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') cd $SOURCE_DIR/../tasks/linux-packages/ for target in debian*/lib*${deb_lib_suffix}.install; do git mv \ @@ -150,12 +163,12 @@ if [ "$#" -eq 2 ]; then sed -i.bak -r -e "${deb_lib_suffix_substitute_pattern}" rat_exclude_files.txt rm -f rat_exclude_files.txt.bak git add rat_exclude_files.txt - git commit -m "[Release] Update .deb package names for $nextVersion" + git commit -m "[Release] Update .deb package names for $next_version" cd - echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh" else - echo "Usage: $0 " + echo "Usage: $0 " exit fi diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 10c28c3e7c42e..45e0f83dcbd0a 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: R Integration to 'Apache' 'Arrow' -Version: 0.12.0-SNAPSHOT +Version: 0.11.0.9000 Authors@R: c( person("Romain", "François", email = "romain@rstudio.com", role = c("aut", "cre")), person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")), From 68daba2ba7390d0afee072aa00271a60d8ad4b07 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 28 Dec 2018 15:56:55 +0100 Subject: [PATCH 78/80] ARROW-3020: [C++/Python] Allow empty arrow::Table objects to be written as empty Parquet row groups While it's unclear how useful this is, it at least preserves the intent of the user if they decide to call `write_table` with an empty table Author: Wes McKinney Closes #3269 from wesm/ARROW-3020 and squashes the following commits: b8c0cc2d Revert changes to CMakeLists.txt 12b92cf6 Allow empty arrow::Table objects to be written as empty Parquet row groups, and read back --- cpp/src/parquet/arrow/writer.cc | 30 ++++++++++++++++++++-------- python/pyarrow/_parquet.pyx | 13 ++++++------ python/pyarrow/tests/test_parquet.py | 18 +++++++++++++++++ 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index a8153cac1ebea..a5c0a62994b1b 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -312,6 +312,10 @@ class ArrowColumnWriter { Status Write(const Array& data); Status Write(const ChunkedArray& data, int64_t offset, const int64_t size) { + if (data.length() == 0) { + return Status::OK(); + } + int64_t absolute_position = 0; int chunk_index = 0; int64_t chunk_offset = 0; @@ -1134,22 +1138,32 @@ Status WriteFileMetaData(const FileMetaData& file_metadata, namespace {} // namespace Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) { - if (chunk_size <= 0) { + if (chunk_size <= 0 && table.num_rows() > 0) { return Status::Invalid("chunk size per row_group must be greater than 0"); } else if (chunk_size > impl_->properties().max_row_group_length()) { chunk_size = impl_->properties().max_row_group_length(); } - for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) { - int64_t offset = chunk * chunk_size; - int64_t size = std::min(chunk_size, table.num_rows() - offset); - - RETURN_NOT_OK_ELSE(NewRowGroup(size), PARQUET_IGNORE_NOT_OK(Close())); + auto WriteRowGroup = [&](int64_t offset, int64_t size) { + RETURN_NOT_OK(NewRowGroup(size)); for (int i = 
0; i < table.num_columns(); i++) { auto chunked_data = table.column(i)->data(); - RETURN_NOT_OK_ELSE(WriteColumnChunk(chunked_data, offset, size), - PARQUET_IGNORE_NOT_OK(Close())); + RETURN_NOT_OK(WriteColumnChunk(chunked_data, offset, size)); } + return Status::OK(); + }; + + if (table.num_rows() == 0) { + // Append a row group with 0 rows + RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close())); + return Status::OK(); + } + + for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) { + int64_t offset = chunk * chunk_size; + RETURN_NOT_OK_ELSE( + WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)), + PARQUET_IGNORE_NOT_OK(Close())); } return Status::OK(); } diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 2e92bac9a74d8..fcecaf5680e42 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -909,17 +909,16 @@ cdef class ParquetWriter: check_status(self.sink.get().Close()) def write_table(self, Table table, row_group_size=None): - cdef CTable* ctable = table.table + cdef: + CTable* ctable = table.table + int64_t c_row_group_size if row_group_size is None or row_group_size == -1: - if ctable.num_rows() > 0: - row_group_size = ctable.num_rows() - else: - row_group_size = 1 + c_row_group_size = ctable.num_rows() elif row_group_size == 0: raise ValueError('Row group size cannot be 0') - - cdef int64_t c_row_group_size = row_group_size + else: + c_row_group_size = row_group_size with nogil: check_status(self.writer.get() diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 82c80e9e09d13..9f05170bdbeba 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -2251,6 +2251,24 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir): writer.write_table(table2) +def test_empty_row_groups(tempdir): + # ARROW-3020 + table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0']) + + path = tempdir / 'empty_row_groups.parquet' + + num_groups = 3 + with pq.ParquetWriter(path, table.schema) as writer: + for i in range(num_groups): + writer.write_table(table) + + reader = pq.ParquetFile(path) + assert reader.metadata.num_row_groups == num_groups + + for i in range(num_groups): + assert reader.read_row_group(i).equals(table) + + def test_writing_empty_lists(): # ARROW-2591: [Python] Segmentation fault issue in pq.write_table arr1 = pa.array([[], []], pa.list_(pa.int32())) From 8ed97cc15a2eff95dad28d3f5dce5af944f02ea3 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 28 Dec 2018 16:07:08 +0100 Subject: [PATCH 79/80] ARROW-4129: [Python] Fix syntax problem in benchmark docs Author: Uwe L. Korn Closes #3282 from xhochy/ARROW-4129 and squashes the following commits: 2430f156 ARROW-4129: Fix syntax problem in benchmark docs --- docs/source/python/benchmarks.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/benchmarks.rst b/docs/source/python/benchmarks.rst index 6c3144ae58637..7672294a4eddf 100644 --- a/docs/source/python/benchmarks.rst +++ b/docs/source/python/benchmarks.rst @@ -50,4 +50,4 @@ Compatibility We only expect the benchmarking setup to work with Python 3.6 or later, on a Unix-like system. -.. asv:: https://asv.readthedocs.org/ +.. 
_asv: https://asv.readthedocs.org/ From 7074889602a2279cfa2440697040a946628f5b56 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Sun, 30 Dec 2018 09:56:05 +0900 Subject: [PATCH 80/80] ARROW-4132: [GLib] Add more GArrowTable constructors Author: Kouhei Sutou Closes #3285 from kou/glib-table-new and squashes the following commits: 8bab8046 Add more GArrowTable constructors --- c_glib/arrow-glib/composite-array.h | 2 + c_glib/arrow-glib/orc-file-reader.h | 4 +- c_glib/arrow-glib/table.cpp | 204 +++++++++++++++++++++++++++- c_glib/arrow-glib/table.h | 33 ++++- c_glib/arrow-glib/version.h.in | 23 ++++ c_glib/test/test-table.rb | 61 +++++++-- 6 files changed, 310 insertions(+), 17 deletions(-) diff --git a/c_glib/arrow-glib/composite-array.h b/c_glib/arrow-glib/composite-array.h index c634dbfc3b006..10432e2e56ba3 100644 --- a/c_glib/arrow-glib/composite-array.h +++ b/c_glib/arrow-glib/composite-array.h @@ -130,8 +130,10 @@ GArrowStructArray *garrow_struct_array_new(GArrowDataType *data_type, GArrowArray *garrow_struct_array_get_field(GArrowStructArray *array, gint i); +#ifndef GARROW_DISABLE_DEPRECATED GARROW_DEPRECATED_IN_0_10_FOR(garrow_struct_array_flatten) GList *garrow_struct_array_get_fields(GArrowStructArray *array); +#endif GARROW_AVAILABLE_IN_0_10 GList *garrow_struct_array_flatten(GArrowStructArray *array, GError **error); diff --git a/c_glib/arrow-glib/orc-file-reader.h b/c_glib/arrow-glib/orc-file-reader.h index 9b2dbadefe43a..97cf1efa92ff7 100644 --- a/c_glib/arrow-glib/orc-file-reader.h +++ b/c_glib/arrow-glib/orc-file-reader.h @@ -39,7 +39,7 @@ garrow_orc_file_reader_new(GArrowSeekableInputStream *file, GError **error); #ifndef GARROW_DISABLE_DEPRECATED -G_GNUC_DEPRECATED_FOR(garrow_orc_file_reader_set_field_indices) +GARROW_DEPRECATED_IN_0_12_FOR(garrow_orc_file_reader_set_field_indices) void garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader, const gint *field_indexes, @@ -50,7 +50,7 @@ garrow_orc_file_reader_set_field_indices(GArrowORCFileReader *reader, const gint *field_indices, guint n_field_indices); #ifndef GARROW_DISABLE_DEPRECATED -G_GNUC_DEPRECATED_FOR(garrow_orc_file_reader_get_field_indices) +GARROW_DEPRECATED_IN_0_12_FOR(garrow_orc_file_reader_get_field_indices) const gint * garrow_orc_file_reader_get_field_indexes(GArrowORCFileReader *reader, guint *n_field_indexes); diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index f9e1b951a3658..b889eb2c9da23 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -21,8 +21,10 @@ # include #endif +#include #include #include +#include #include #include @@ -133,22 +135,218 @@ garrow_table_class_init(GArrowTableClass *klass) * @columns: (element-type GArrowColumn): The columns of the table. * * Returns: A newly created #GArrowTable. + * + * Deprecated: 0.12.0: Use garrow_table_new_values() instead. */ GArrowTable * garrow_table_new(GArrowSchema *schema, GList *columns) { + auto arrow_schema = garrow_schema_get_raw(schema); std::vector> arrow_columns; for (GList *node = columns; node; node = node->next) { - GArrowColumn *column = GARROW_COLUMN(node->data); + auto column = GARROW_COLUMN(node->data); arrow_columns.push_back(garrow_column_get_raw(column)); } - auto arrow_table = - arrow::Table::Make(garrow_schema_get_raw(schema), arrow_columns); + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); return garrow_table_new_raw(&arrow_table); } +/** + * garrow_table_new_values: (skip) + * @schema: The schema of the table. 
+ * @values: The values of the table. All values must be instance of the + * same class. Available classes are #GArrowColumn, #GArrowArray and + * #GArrowRecordBatch. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_values(GArrowSchema *schema, + GList *values, + GError **error) +{ + const auto context = "[table][new][values]"; + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_columns; + std::vector> arrow_arrays; + std::vector> arrow_record_batches; + for (GList *node = values; node; node = node->next) { + if (GARROW_IS_COLUMN(node->data)) { + auto column = GARROW_COLUMN(node->data); + arrow_columns.push_back(garrow_column_get_raw(column)); + } else if (GARROW_IS_ARRAY(node->data)) { + auto array = GARROW_ARRAY(node->data); + arrow_arrays.push_back(garrow_array_get_raw(array)); + } else if (GARROW_IS_RECORD_BATCH(node->data)) { + auto record_batch = GARROW_RECORD_BATCH(node->data); + arrow_record_batches.push_back(garrow_record_batch_get_raw(record_batch)); + } else { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s: %s", + context, + "value must be one of " + "GArrowColumn, GArrowArray and GArrowRecordBatch"); + return NULL; + } + } + + size_t n_types = 0; + if (!arrow_columns.empty()) { + ++n_types; + } + if (!arrow_arrays.empty()) { + ++n_types; + } + if (!arrow_record_batches.empty()) { + ++n_types; + } + if (n_types > 1) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s: %s", + context, + "all values must be the same objects of " + "GArrowColumn, GArrowArray or GArrowRecordBatch"); + return NULL; + } + + if (!arrow_columns.empty()) { + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } else if (!arrow_arrays.empty()) { + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } else { + std::shared_ptr arrow_table; + auto status = arrow::Table::FromRecordBatches(arrow_schema, + arrow_record_batches, + &arrow_table); + if (garrow_error_check(error, status, context)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } + } +} + +/** + * garrow_table_new_columns: + * @schema: The schema of the table. + * @columns: (array length=n_columns): The columns of the table. + * @n_columns: The number of columns. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. 
+ * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_columns(GArrowSchema *schema, + GArrowColumn **columns, + gsize n_columns, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_columns; + for (gsize i = 0; i < n_columns; ++i) { + arrow_columns.push_back(garrow_column_get_raw(columns[i])); + } + + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, "[table][new][columns]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + +/** + * garrow_table_new_arrays: + * @schema: The schema of the table. + * @arrays: (array length=n_arrays): The arrays of the table. + * @n_arrays: The number of arrays. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_arrays(GArrowSchema *schema, + GArrowArray **arrays, + gsize n_arrays, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_arrays; + for (gsize i = 0; i < n_arrays; ++i) { + arrow_arrays.push_back(garrow_array_get_raw(arrays[i])); + } + + auto arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays); + auto status = arrow_table->Validate(); + if (garrow_error_check(error, status, "[table][new][arrays]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + +/** + * garrow_table_new_record_batches: + * @schema: The schema of the table. + * @record_batches: (array length=n_record_batches): The record batches + * that have data for the table. + * @n_record_batches: The number of record batches. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GArrowTable or %NULL on error. + * + * Since: 0.12.0 + */ +GArrowTable * +garrow_table_new_record_batches(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_record_batches; + for (gsize i = 0; i < n_record_batches; ++i) { + auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); + arrow_record_batches.push_back(arrow_record_batch); + } + + std::shared_ptr arrow_table; + auto status = arrow::Table::FromRecordBatches(arrow_schema, + arrow_record_batches, + &arrow_table); + if (garrow_error_check(error, status, "[table][new][record-batches]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + /** * garrow_table_equal: * @table: A #GArrowTable. 
diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h index ef7b0f5c289ce..bde2535033c7d 100644 --- a/c_glib/arrow-glib/table.h +++ b/c_glib/arrow-glib/table.h @@ -20,7 +20,9 @@ #pragma once #include +#include #include +#include G_BEGIN_DECLS @@ -35,8 +37,35 @@ struct _GArrowTableClass GObjectClass parent_class; }; -GArrowTable *garrow_table_new (GArrowSchema *schema, - GList *columns); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_12_FOR(garrow_table_new_values) +GArrowTable * +garrow_table_new(GArrowSchema *schema, + GList *columns); +#endif +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_values(GArrowSchema *schema, + GList *values, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_columns(GArrowSchema *schema, + GArrowColumn **columns, + gsize n_columns, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_arrays(GArrowSchema *schema, + GArrowArray **arrays, + gsize n_arrays, + GError **error); +GARROW_AVAILABLE_IN_0_12 +GArrowTable * +garrow_table_new_record_batches(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches, + GError **error); gboolean garrow_table_equal (GArrowTable *table, GArrowTable *other_table); diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index eb734250e2352..501827d06e054 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,15 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_0_12: + * + * You can use this macro value for compile time API version check. + * + * Since: 0.12.0 + */ +#define GARROW_VERSION_0_12 G_ENCODE_VERSION(0, 12) + /** * GARROW_VERSION_0_10: * @@ -166,6 +175,20 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_12 +# define GARROW_DEPRECATED_IN_0_12 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_0_12_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_0_12 +# define GARROW_DEPRECATED_IN_0_12_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_12 +# define GARROW_AVAILABLE_IN_0_12 GARROW_UNAVAILABLE(0, 12) +#else +# define GARROW_AVAILABLE_IN_0_12 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_10 # define GARROW_DEPRECATED_IN_0_10 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_0_10_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb index 4394ad1353e7d..871e0d7c5ffd4 100644 --- a/c_glib/test/test-table.rb +++ b/c_glib/test/test-table.rb @@ -17,21 +17,19 @@ class TestTable < Test::Unit::TestCase include Helper::Buildable + include Helper::Omittable sub_test_case(".new") do - def test_columns - fields = [ + def setup + @fields = [ Arrow::Field.new("visible", Arrow::BooleanDataType.new), Arrow::Field.new("valid", Arrow::BooleanDataType.new), ] - schema = Arrow::Schema.new(fields) - columns = [ - Arrow::Column.new(fields[0], build_boolean_array([true])), - Arrow::Column.new(fields[1], build_boolean_array([false])), - ] - table = Arrow::Table.new(schema, columns) + @schema = Arrow::Schema.new(@fields) + end - data = table.n_columns.times.collect do |i| + def dump_table(table) + table.n_columns.times.collect do |i| column = table.get_column(i) values = [] column.data.chunks.each do |chunk| @@ -44,11 +42,54 @@ def test_columns values, ] end + end + + def test_columns + columns = [ + Arrow::Column.new(@fields[0], build_boolean_array([true])), 
+ Arrow::Column.new(@fields[1], build_boolean_array([false])), + ] + table = Arrow::Table.new(@schema, columns) assert_equal([ ["visible", [true]], ["valid", [false]], ], - data) + dump_table(table)) + end + + def test_arrays + require_gi_bindings(3, 3, 1) + arrays = [ + build_boolean_array([true]), + build_boolean_array([false]), + ] + table = Arrow::Table.new(@schema, arrays) + assert_equal([ + ["visible", [true]], + ["valid", [false]], + ], + dump_table(table)) + end + + def test_record_batches + require_gi_bindings(3, 3, 1) + record_batches = [ + build_record_batch({ + "visible" => build_boolean_array([true]), + "valid" => build_boolean_array([false]) + }), + build_record_batch({ + "visible" => build_boolean_array([false]), + "valid" => build_boolean_array([true]) + }), + ] + table = Arrow::Table.new(@schema, record_batches) + + assert_equal([ + ["visible", [true, false]], + ["valid", [false, true]], + ], + dump_table(table)) end end
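
A minimal C sketch of how the garrow_table_new_arrays() constructor introduced in this last patch might be called directly from arrow-glib C code (the Ruby tests above exercise the same path through GObject Introspection). This is an illustration, not part of the patch series: the schema-construction calls (garrow_boolean_data_type_new(), garrow_field_new(), garrow_schema_new()) are pre-existing arrow-glib APIs, the "visible" and "valid" arrays are assumed to have been built elsewhere (for example with GArrowBooleanArrayBuilder), and unref/cleanup of intermediate objects is omitted for brevity.

#include <arrow-glib/arrow-glib.h>

/* Sketch only: build a two-column boolean table with the new
 * garrow_table_new_arrays() constructor. "visible" and "valid" are
 * assumed to be GArrowArray values created elsewhere; unref calls on
 * the intermediate objects are omitted for brevity. */
static GArrowTable *
build_table(GArrowArray *visible, GArrowArray *valid, GError **error)
{
  GArrowDataType *boolean_type =
    GARROW_DATA_TYPE(garrow_boolean_data_type_new());
  GList *fields = NULL;
  fields = g_list_append(fields, garrow_field_new("visible", boolean_type));
  fields = g_list_append(fields, garrow_field_new("valid", boolean_type));
  GArrowSchema *schema = garrow_schema_new(fields);

  GArrowArray *arrays[] = {visible, valid};
  /* Unlike the deprecated garrow_table_new(), this constructor calls
   * arrow::Table::Validate() and reports problems through "error". */
  return garrow_table_new_arrays(schema, arrays, 2, error);
}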