Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 26 additions & 6 deletions ci/msvc-build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,19 @@

@rem Create the conda environment with the Python build/test dependencies,
@rem then add the C++ toolchain packages from conda-forge in a single install.
conda create -n arrow -q -y python=%PYTHON% ^
      six pytest setuptools numpy pandas cython
conda install -n arrow -q -y -c conda-forge ^
      flatbuffers rapidjson ^
      cmake git boost-cpp thrift-cpp snappy zlib brotli

call activate arrow

@rem All dependency roots point at the Library directory of the active conda
@rem environment.
set ARROW_HOME=%CONDA_PREFIX%\Library
set FLATBUFFERS_HOME=%CONDA_PREFIX%\Library
set RAPIDJSON_HOME=%CONDA_PREFIX%\Library
set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library

@rem Build and test Arrow C++ libraries

mkdir cpp\build
@rem pushd (not cd) so the later popd returns to the repository root.
pushd cpp\build

cmake -G "%GENERATOR%" ^
-DCMAKE_INSTALL_PREFIX=%CONDA_PREFIX%\Library ^
Expand All @@ -44,10 +46,28 @@ cmake --build . --target INSTALL --config Release || exit /B
set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%

ctest -VV || exit /B
popd

@rem Build parquet-cpp

git clone https://github.com/apache/parquet-cpp.git || exit /B
mkdir parquet-cpp\build
@rem Abort if the build directory cannot be entered; otherwise cmake would
@rem configure against the wrong source tree.
pushd parquet-cpp\build || exit /B

@rem Resolve parquet-cpp dependencies from, and install into, the active conda
@rem environment's Library directory.
set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
set PARQUET_HOME=%CONDA_PREFIX%\Library
cmake -G "%GENERATOR%" ^
      -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^
      -DCMAKE_BUILD_TYPE=Release ^
      -DPARQUET_ZLIB_VENDORED=off ^
      -DPARQUET_BUILD_TESTS=off .. || exit /B
cmake --build . --target INSTALL --config Release || exit /B
popd

@rem Build and import pyarrow
@rem Clear PYTHONPATH before building and testing pyarrow.
set PYTHONPATH=

pushd python || exit /B
python setup.py build_ext --inplace --with-parquet || exit /B
py.test pyarrow -v -s || exit /B
popd
16 changes: 9 additions & 7 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -219,12 +219,6 @@ include_directories(SYSTEM
# Dependencies
############################################################

## Parquet
find_package(Parquet)
if(PARQUET_FOUND)
include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
endif()

## Arrow
find_package(Arrow REQUIRED)
include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
Expand Down Expand Up @@ -286,9 +280,14 @@ set(LINK_LIBS
)

if (PYARROW_BUILD_PARQUET)
## Parquet
find_package(Parquet)

if(NOT (PARQUET_FOUND AND PARQUET_ARROW_FOUND))
message(FATAL_ERROR "Unable to locate Parquet libraries")
endif()
include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})

if (PYARROW_BUNDLE_ARROW_CPP)
get_filename_component(PARQUET_LIBRARY_DIR ${PARQUET_SHARED_LIB} DIRECTORY)
get_filename_component(PARQUET_LIBRARY_NAME ${PARQUET_SHARED_LIB} NAME_WE)
Expand Down Expand Up @@ -333,11 +332,14 @@ if (PYARROW_BUILD_PARQUET)
#SET(PARQUET_ARROW_SHARED_LIB
# ${BUILD_OUTPUT_ROOT_DIRECTORY}/libparquet_arrow${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
ADD_THIRDPARTY_LIB(parquet
SHARED_LIB ${PARQUET_SHARED_LIB})
ADD_THIRDPARTY_LIB(parquet_arrow
SHARED_LIB ${PARQUET_ARROW_SHARED_LIB})
set(LINK_LIBS
${LINK_LIBS}
parquet_arrow)
parquet_shared
parquet_arrow_shared)
set(CYTHON_EXTENSIONS
${CYTHON_EXTENSIONS}
_parquet)
Expand Down
2 changes: 0 additions & 2 deletions python/cmake_modules/FindArrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,8 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIBS)
if (MSVC)
set(ARROW_STATIC_LIB ${ARROW_LIB_PATH})
set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH})
set(ARROW_JEMALLOC_STATIC_LIB ${ARROW_JEMALLOC_LIB_PATH})
set(ARROW_SHARED_LIB ${ARROW_STATIC_LIB})
set(ARROW_PYTHON_SHARED_LIB ${ARROW_PYTHON_STATIC_LIB})
set(ARROW_JEMALLOC_SHARED_LIB ${ARROW_JEMALLOC_STATIC_LIB})
else()
set(ARROW_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/libarrow.a)
set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/libarrow_python.a)
Expand Down
51 changes: 32 additions & 19 deletions python/cmake_modules/FindParquet.cmake
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
# Copyright 2012 Cloudera Inc.
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so)
# This module defines
Expand Down Expand Up @@ -71,21 +74,31 @@ endif()

# Report whether the core Parquet library was located and, when it was,
# derive the static and shared library paths.
#
# Sets: PARQUET_FOUND, PARQUET_LIB_NAME, PARQUET_STATIC_LIB, PARQUET_SHARED_LIB
if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES)
  set(PARQUET_FOUND TRUE)
  set(PARQUET_LIB_NAME libparquet)
  if (MSVC)
    # NOTE(review): assumes PARQUET_LIBRARIES is the full path produced by
    # find_library (e.g. <prefix>/lib/parquet.lib) -- confirm against the
    # find_library call earlier in this module.
    # The "_static" marker has to be inserted before the file extension;
    # appending it to the whole path would yield "parquet.lib_static".
    get_filename_component(parquet_lib_dir "${PARQUET_LIBRARIES}" PATH)
    get_filename_component(parquet_lib_stem "${PARQUET_LIBRARIES}" NAME_WE)
    set(PARQUET_STATIC_LIB
      "${parquet_lib_dir}/${parquet_lib_stem}_static${CMAKE_STATIC_LIBRARY_SUFFIX}")
    set(PARQUET_SHARED_LIB "${PARQUET_LIBRARIES}")
  else()
    set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a)
    set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
else ()
  set(PARQUET_FOUND FALSE)
endif ()

# Report whether the parquet_arrow adapter library was located and, when it
# was, derive its directory plus the static and shared library paths.
#
# Sets: PARQUET_ARROW_FOUND, PARQUET_ARROW_LIBS, PARQUET_ARROW_LIB_NAME,
#       PARQUET_ARROW_STATIC_LIB, PARQUET_ARROW_SHARED_LIB
if (PARQUET_INCLUDE_DIR AND PARQUET_ARROW_LIBRARIES)
  set(PARQUET_ARROW_FOUND TRUE)
  # Directory that contains the located parquet_arrow library.
  get_filename_component(PARQUET_ARROW_LIBS ${PARQUET_ARROW_LIBRARIES} PATH)
  set(PARQUET_ARROW_LIB_NAME libparquet_arrow)
  if (MSVC)
    # NOTE(review): assumes PARQUET_ARROW_LIBRARIES is the full path produced
    # by find_library (e.g. <prefix>/lib/parquet_arrow.lib) -- confirm against
    # the find_library call earlier in this module.
    # The "_static" marker has to be inserted before the file extension;
    # appending it to the whole path would yield "parquet_arrow.lib_static".
    get_filename_component(parquet_arrow_lib_stem "${PARQUET_ARROW_LIBRARIES}" NAME_WE)
    set(PARQUET_ARROW_STATIC_LIB
      "${PARQUET_ARROW_LIBS}/${parquet_arrow_lib_stem}_static${CMAKE_STATIC_LIBRARY_SUFFIX}")
    set(PARQUET_ARROW_SHARED_LIB "${PARQUET_ARROW_LIBRARIES}")
  else()
    set(PARQUET_ARROW_STATIC_LIB
      ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}.a)
    set(PARQUET_ARROW_SHARED_LIB
      ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
  endif()
else ()
  set(PARQUET_ARROW_FOUND FALSE)
endif ()
Expand Down
51 changes: 48 additions & 3 deletions python/doc/source/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -174,14 +174,37 @@ You should be able to run the unit tests with:
Windows
=======

First, make sure you can `build the C++ library <https://github.com/apache/arrow/blob/master/cpp/doc/Windows.md>`_.
First, we bootstrap a conda environment similar to the `C++ build instructions
<https://github.com/apache/arrow/blob/master/cpp/doc/Windows.md>`_. This
includes all the dependencies for Arrow and the Apache Parquet C++ libraries.

Now, we need to build and install the C++ libraries someplace.
Before creating the environment, start from fresh clones of Apache Arrow and
parquet-cpp:

.. code-block:: shell

git clone https://github.com/apache/arrow.git
git clone https://github.com/apache/parquet-cpp.git

Then create and activate a conda environment that contains the build
dependencies:

.. code-block:: shell

conda create -n arrow-dev cmake git boost-cpp ^
flatbuffers snappy zlib brotli thrift-cpp rapidjson
activate arrow-dev

As one git housekeeping item, we must run this command in our Arrow clone:

.. code-block:: shell

cd arrow
git config core.symlinks true

Now, we build and install the Arrow C++ libraries:

.. code-block:: shell

mkdir cpp\build
cd cpp\build
set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
set ARROW_HOME=C:\thirdparty
cmake -G "Visual Studio 14 2015 Win64" ^
-DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
Expand All @@ -191,6 +214,22 @@ Now, we need to build and install the C++ libraries someplace.
cmake --build . --target INSTALL --config Release
cd ..\..

Now, we build parquet-cpp and install the result in the same place:

.. code-block:: shell

mkdir ..\parquet-cpp\build
pushd ..\parquet-cpp\build
set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
set PARQUET_HOME=C:\thirdparty
cmake -G "Visual Studio 14 2015 Win64" ^
-DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^
-DCMAKE_BUILD_TYPE=Release ^
-DPARQUET_ZLIB_VENDORED=off ^
-DPARQUET_BUILD_TESTS=off ..
cmake --build . --target INSTALL --config Release
popd

After that, we must put the install directory's bin path in our ``%PATH%``:

.. code-block:: shell
Expand All @@ -202,7 +241,13 @@ Now, we can build pyarrow:
.. code-block:: shell

cd python
python setup.py build_ext --inplace
python setup.py build_ext --inplace --with-parquet

Then run the unit tests with:

.. code-block:: shell

py.test pyarrow -v

Running C++ unit tests with Python
----------------------------------
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ def read_parquet(self, path, columns=None, metadata=None, schema=None,
filesystem=self)
return dataset.read(columns=columns, nthreads=nthreads)

# Read-only property giving the separator placed between path components.
# This implementation always reports a forward slash.
@property
def pathsep(self):
return '/'


class LocalFilesystem(Filesystem):

Expand Down Expand Up @@ -132,6 +136,10 @@ def open(self, path, mode='rb'):
"""
return open(path, mode=mode)

# Read-only property giving the separator placed between path components.
# This implementation reports the host operating system's separator
# (os.path.sep).
@property
def pathsep(self):
return os.path.sep


class HdfsClient(lib._HdfsClient, Filesystem):
"""
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/'):

if is_string(path_or_paths) and fs.isdir(path_or_paths):
manifest = ParquetManifest(path_or_paths, filesystem=fs,
pathsep=pathsep)
pathsep=fs.pathsep)
metadata_path = manifest.metadata_path
pieces = manifest.pieces
partitions = manifest.partitions
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,10 @@ def test_pandas_column_selection(tmpdir):

def _random_integers(size, dtype):
# We do not generate integers outside the int64 range
i64_info = np.iinfo('int64')
platform_int_info = np.iinfo('int_')
iinfo = np.iinfo(dtype)
return np.random.randint(max(iinfo.min, i64_info.min),
min(iinfo.max, i64_info.max),
return np.random.randint(max(iinfo.min, platform_int_info.min),
min(iinfo.max, platform_int_info.max),
size=size).astype(dtype)


Expand Down