Skip to content

Commit 0f43848

Browse files
committed
Fixes for building pyarrow with Parquet support on MSVC. Add to appveyor build
1 parent a4f3259 commit 0f43848

File tree

8 files changed

+127
-41
lines changed

8 files changed

+127
-41
lines changed

ci/msvc-build.bat

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,19 @@
1919

2020
conda create -n arrow -q -y python=%PYTHON% ^
2121
six pytest setuptools numpy pandas cython
22-
conda install -n arrow -q -y -c conda-forge flatbuffers rapidjson
22+
conda install -n arrow -q -y -c conda-forge ^
23+
flatbuffers rapidjson ^
24+
cmake git boost-cpp thrift-cpp snappy zlib brotli
25+
2326
call activate arrow
2427

2528
set ARROW_HOME=%CONDA_PREFIX%\Library
26-
set FLATBUFFERS_HOME=%CONDA_PREFIX%\Library
27-
set RAPIDJSON_HOME=%CONDA_PREFIX%\Library
29+
set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
2830

2931
@rem Build and test Arrow C++ libraries
3032

3133
mkdir cpp\build
32-
cd cpp\build
34+
pushd cpp\build
3335

3436
cmake -G "%GENERATOR%" ^
3537
-DCMAKE_INSTALL_PREFIX=%CONDA_PREFIX%\Library ^
@@ -44,10 +46,28 @@ cmake --build . --target INSTALL --config Release || exit /B
4446
set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%
4547

4648
ctest -VV || exit /B
49+
popd
50+
51+
@rem Build parquet-cpp
52+
53+
git clone https://github.com/apache/parquet-cpp.git || exit /B
54+
mkdir parquet-cpp\build
55+
pushd parquet-cpp\build
56+
57+
set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
58+
set PARQUET_HOME=%CONDA_PREFIX%\Library
59+
cmake -G "%GENERATOR%" ^
60+
-DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^
61+
-DCMAKE_BUILD_TYPE=Release ^
62+
-DPARQUET_ZLIB_VENDORED=off ^
63+
-DPARQUET_BUILD_TESTS=off .. || exit /B
64+
cmake --build . --target INSTALL --config Release || exit /B
65+
popd
4766

4867
@rem Build and import pyarrow
4968
set PYTHONPATH=
5069

51-
cd ..\..\python
52-
python setup.py build_ext --inplace || exit /B
70+
pushd python
71+
python setup.py build_ext --inplace --with-parquet || exit /B
5372
py.test pyarrow -v -s || exit /B
73+
popd

python/CMakeLists.txt

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -219,12 +219,6 @@ include_directories(SYSTEM
219219
# Dependencies
220220
############################################################
221221

222-
## Parquet
223-
find_package(Parquet)
224-
if(PARQUET_FOUND)
225-
include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
226-
endif()
227-
228222
## Arrow
229223
find_package(Arrow REQUIRED)
230224
include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
@@ -286,9 +280,14 @@ set(LINK_LIBS
286280
)
287281

288282
if (PYARROW_BUILD_PARQUET)
283+
## Parquet
284+
find_package(Parquet)
285+
289286
if(NOT (PARQUET_FOUND AND PARQUET_ARROW_FOUND))
290287
message(FATAL_ERROR "Unable to locate Parquet libraries")
291288
endif()
289+
include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
290+
292291
if (PYARROW_BUNDLE_ARROW_CPP)
293292
get_filename_component(PARQUET_LIBRARY_DIR ${PARQUET_SHARED_LIB} DIRECTORY)
294293
get_filename_component(PARQUET_LIBRARY_NAME ${PARQUET_SHARED_LIB} NAME_WE)
@@ -333,11 +332,14 @@ if (PYARROW_BUILD_PARQUET)
333332
#SET(PARQUET_ARROW_SHARED_LIB
334333
# ${BUILD_OUTPUT_ROOT_DIRECTORY}/libparquet_arrow${CMAKE_SHARED_LIBRARY_SUFFIX})
335334
endif()
335+
ADD_THIRDPARTY_LIB(parquet
336+
SHARED_LIB ${PARQUET_SHARED_LIB})
336337
ADD_THIRDPARTY_LIB(parquet_arrow
337338
SHARED_LIB ${PARQUET_ARROW_SHARED_LIB})
338339
set(LINK_LIBS
339340
${LINK_LIBS}
340-
parquet_arrow)
341+
parquet_shared
342+
parquet_arrow_shared)
341343
set(CYTHON_EXTENSIONS
342344
${CYTHON_EXTENSIONS}
343345
_parquet)

python/cmake_modules/FindArrow.cmake

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,8 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIBS)
7676
if (MSVC)
7777
set(ARROW_STATIC_LIB ${ARROW_LIB_PATH})
7878
set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH})
79-
set(ARROW_JEMALLOC_STATIC_LIB ${ARROW_JEMALLOC_LIB_PATH})
8079
set(ARROW_SHARED_LIB ${ARROW_STATIC_LIB})
8180
set(ARROW_PYTHON_SHARED_LIB ${ARROW_PYTHON_STATIC_LIB})
82-
set(ARROW_JEMALLOC_SHARED_LIB ${ARROW_JEMALLOC_STATIC_LIB})
8381
else()
8482
set(ARROW_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/libarrow.a)
8583
set(ARROW_PYTHON_STATIC_LIB ${ARROW_PYTHON_LIB_PATH}/libarrow_python.a)

python/cmake_modules/FindParquet.cmake

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,19 @@
1-
# Copyright 2012 Cloudera Inc.
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
28
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
9+
# http://www.apache.org/licenses/LICENSE-2.0
610
#
7-
# http://www.apache.org/licenses/LICENSE-2.0
8-
#
9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
1417

1518
# - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so)
1619
# This module defines
@@ -71,21 +74,31 @@ endif()
7174

7275
if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES)
7376
set(PARQUET_FOUND TRUE)
74-
set(PARQUET_LIB_NAME libparquet)
75-
set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a)
76-
set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
77+
if (MSVC)
78+
set(PARQUET_STATIC_LIB "${PARQUET_LIBRARIES}_static")
79+
set(PARQUET_SHARED_LIB "${PARQUET_LIBRARIES}")
80+
else()
81+
set(PARQUET_LIB_NAME libparquet)
82+
set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a)
83+
set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
84+
endif()
7785
else ()
7886
set(PARQUET_FOUND FALSE)
7987
endif ()
8088

8189
if (PARQUET_INCLUDE_DIR AND PARQUET_ARROW_LIBRARIES)
8290
set(PARQUET_ARROW_FOUND TRUE)
8391
get_filename_component(PARQUET_ARROW_LIBS ${PARQUET_ARROW_LIBRARIES} PATH)
84-
set(PARQUET_ARROW_LIB_NAME libparquet_arrow)
85-
set(PARQUET_ARROW_STATIC_LIB
86-
${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}.a)
87-
set(PARQUET_ARROW_SHARED_LIB
88-
${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
92+
if (MSVC)
93+
set(PARQUET_ARROW_STATIC_LIB "${PARQUET_ARROW_LIBRARIES}_static")
94+
set(PARQUET_ARROW_SHARED_LIB "${PARQUET_ARROW_LIBRARIES}")
95+
else()
96+
set(PARQUET_ARROW_LIB_NAME libparquet_arrow)
97+
set(PARQUET_ARROW_STATIC_LIB
98+
${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}.a)
99+
set(PARQUET_ARROW_SHARED_LIB
100+
${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
101+
endif()
89102
else ()
90103
set(PARQUET_ARROW_FOUND FALSE)
91104
endif ()

python/doc/source/development.rst

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,14 +174,37 @@ You should be able to run the unit tests with:
174174
Windows
175175
=======
176176

177-
First, make sure you can `build the C++ library <https://github.com/apache/arrow/blob/master/cpp/doc/Windows.md>`_.
177+
First, we bootstrap a conda environment similar to the `C++ build instructions
178+
<https://github.com/apache/arrow/blob/master/cpp/doc/Windows.md>`_. This
179+
includes all the dependencies for Arrow and the Apache Parquet C++ libraries.
178180

179-
Now, we need to build and install the C++ libraries someplace.
181+
We start from fresh clones of Apache Arrow and parquet-cpp:
182+
183+
.. code-block:: shell
184+
185+
git clone https://github.com/apache/arrow.git
186+
git clone https://github.com/apache/parquet-cpp.git
187+
188+
.. code-block:: shell
189+
190+
conda create -n arrow-dev cmake git boost-cpp ^
191+
flatbuffers snappy zlib brotli thrift-cpp rapidjson
192+
activate arrow-dev
193+
194+
As one git housekeeping item, we must run this command in our Arrow clone:
195+
196+
.. code-block:: shell
197+
198+
cd arrow
199+
git config core.symlinks true
200+
201+
Now, we build and install the Arrow C++ libraries:
180202

181203
.. code-block:: shell
182204
183205
mkdir cpp\build
184206
cd cpp\build
207+
set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
185208
set ARROW_HOME=C:\thirdparty
186209
cmake -G "Visual Studio 14 2015 Win64" ^
187210
-DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
@@ -191,6 +214,22 @@ Now, we need to build and install the C++ libraries someplace.
191214
cmake --build . --target INSTALL --config Release
192215
cd ..\..
193216
217+
Now, we build parquet-cpp and install the result in the same place:
218+
219+
.. code-block:: shell
220+
221+
mkdir ..\parquet-cpp\build
222+
pushd ..\parquet-cpp\build
223+
set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
224+
set PARQUET_HOME=C:\thirdparty
225+
cmake -G "Visual Studio 14 2015 Win64" ^
226+
-DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^
227+
-DCMAKE_BUILD_TYPE=Release ^
228+
-DPARQUET_ZLIB_VENDORED=off ^
229+
-DPARQUET_BUILD_TESTS=off ..
230+
cmake --build . --target INSTALL --config Release
231+
popd
232+
194233
After that, we must put the install directory's bin path in our ``%PATH%``:
195234

196235
.. code-block:: shell
@@ -202,7 +241,13 @@ Now, we can build pyarrow:
202241
.. code-block:: shell
203242
204243
cd python
205-
python setup.py build_ext --inplace
244+
python setup.py build_ext --inplace --with-parquet
245+
246+
Then run the unit tests with:
247+
248+
.. code-block:: shell
249+
250+
py.test pyarrow -v
206251
207252
Running C++ unit tests with Python
208253
----------------------------------

python/pyarrow/filesystem.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ def read_parquet(self, path, columns=None, metadata=None, schema=None,
9292
filesystem=self)
9393
return dataset.read(columns=columns, nthreads=nthreads)
9494

95+
@property
96+
def pathsep(self):
97+
return '/'
98+
9599

96100
class LocalFilesystem(Filesystem):
97101

@@ -132,6 +136,10 @@ def open(self, path, mode='rb'):
132136
"""
133137
return open(path, mode=mode)
134138

139+
@property
140+
def pathsep(self):
141+
return os.path.sep
142+
135143

136144
class HdfsClient(lib._HdfsClient, Filesystem):
137145
"""

python/pyarrow/parquet.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/'):
582582

583583
if is_string(path_or_paths) and fs.isdir(path_or_paths):
584584
manifest = ParquetManifest(path_or_paths, filesystem=fs,
585-
pathsep=pathsep)
585+
pathsep=fs.pathsep)
586586
metadata_path = manifest.metadata_path
587587
pieces = manifest.pieces
588588
partitions = manifest.partitions

python/pyarrow/tests/test_parquet.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,10 @@ def test_pandas_column_selection(tmpdir):
192192

193193
def _random_integers(size, dtype):
194194
# We do not generate integers outside the int64 range
195-
i64_info = np.iinfo('int64')
195+
platform_int_info = np.iinfo('int_')
196196
iinfo = np.iinfo(dtype)
197-
return np.random.randint(max(iinfo.min, i64_info.min),
198-
min(iinfo.max, i64_info.max),
197+
return np.random.randint(max(iinfo.min, platform_int_info.min),
198+
min(iinfo.max, platform_int_info.max),
199199
size=size).astype(dtype)
200200

201201

0 commit comments

Comments
 (0)