Skip to content

Commit

Permalink
Integrate Arrow dependency into build scripts
Browse files Browse the repository at this point in the history
- Add PyArrow as an extra dependency in python-package setup.py.
  • Loading branch information
Zhang Zhang committed Jul 20, 2020
1 parent 7a6ca39 commit f7260a4
Show file tree
Hide file tree
Showing 11 changed files with 61 additions and 59 deletions.
25 changes: 23 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,15 @@ if (USE_OPENMP)
find_package(OpenMP REQUIRED)
endif (USE_OPENMP)

# Detect a PyArrow installation.
#
# Looks for Python 3 (Interpreter + Development) and, when it is found, points
# ARROW_HOME at the `pyarrow` directory inside the interpreter's site-packages
# before searching for the Arrow and ArrowPython packages. Both lookups are
# optional (no REQUIRED); the ARROW_FOUND / ARROW_PYTHON_FOUND results are what
# the Arrow-specific sections of the build test afterwards.
find_package(Python3 COMPONENTS Interpreter Development)
if (Python3_FOUND)
  # Respect an ARROW_HOME the user already chose (e.g. -DARROW_HOME=<prefix>);
  # only fall back to the pyarrow package directory when none was given.
  if (NOT ARROW_HOME)
    set(ARROW_HOME "${Python3_SITELIB}/pyarrow")
  endif ()
  message(STATUS "ARROW_HOME: ${ARROW_HOME}")
  # NOTE(review): ARROW_HOME is presumably read by the Arrow find modules as
  # their search root -- confirm against the FindArrow module in use.
  find_package(Arrow)
  find_package(ArrowPython)
endif ()

# core xgboost
add_subdirectory(${xgboost_SOURCE_DIR}/src)

Expand All @@ -122,6 +131,14 @@ set_target_properties(dmlc PROPERTIES
CXX_STANDARD 14
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
# With Arrow support enabled, compile dmlc (and its unit tests, when that
# target exists) with _GLIBCXX_USE_CXX11_ABI=0, i.e. the pre-C++11 libstdc++ ABI.
# NOTE(review): presumably this matches how the pyarrow binaries were built --
# confirm for the pyarrow distribution in use.
if (ARROW_FOUND AND ARROW_PYTHON_FOUND)
  target_compile_definitions(dmlc PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
  if (TARGET dmlc_unit_tests)
    target_compile_definitions(dmlc_unit_tests PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
  endif ()
endif ()
if (MSVC)
target_compile_options(dmlc PRIVATE
-D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE)
Expand Down Expand Up @@ -202,12 +219,16 @@ endif (JVM_BINDINGS)
#-- CLI for xgboost
add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc)
target_link_libraries(runxgboost PRIVATE objxgboost)

# With Arrow support enabled, build the CLI with _GLIBCXX_USE_CXX11_ABI=0
# (the pre-C++11 libstdc++ ABI).
if (ARROW_FOUND AND ARROW_PYTHON_FOUND)
  target_compile_definitions(runxgboost PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
endif ()

if (USE_NVTX)
enable_nvtx(runxgboost)
endif (USE_NVTX)

enable_arrow_if_available(runxgboost)

target_include_directories(runxgboost
PRIVATE
${xgboost_SOURCE_DIR}/include
Expand Down
19 changes: 0 additions & 19 deletions cmake/Utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -116,22 +116,3 @@ macro(enable_nvtx target)
target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}")
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
endmacro()

# enable_arrow_if_available(<target>)
#
# Searches for the Arrow and ArrowPython packages. When both are found, the
# Python3 Development component becomes a hard requirement and <target>
# receives, all as PRIVATE:
#   - the XGBOOST_BUILD_ARROW_SUPPORT=1 and _GLIBCXX_USE_CXX11_ABI=0 definitions
#   - the Arrow, ArrowPython and Python3 include directories
#   - the Arrow, ArrowPython and Python3 libraries
# When either package is missing, <target> is left untouched.
#
# This is a macro, so the find_package() results are set in the caller's scope.
macro(enable_arrow_if_available target)
  find_package(Arrow)
  find_package(ArrowPython)
  if (ARROW_FOUND AND ARROW_PYTHON_FOUND)
    find_package(Python3 COMPONENTS Development REQUIRED)
    target_compile_definitions(${target} PRIVATE
      XGBOOST_BUILD_ARROW_SUPPORT=1
      _GLIBCXX_USE_CXX11_ABI=0)
    target_include_directories(${target} PRIVATE
      ${ARROW_INCLUDE_DIR}
      ${ARROW_PYTHON_INCLUDE_DIR}
      ${Python3_INCLUDE_DIRS})
    target_link_libraries(${target} PRIVATE
      ${ARROW_SHARED_LIB}
      ${ARROW_PYTHON_SHARED_LIB}
      ${Python3_LIBRARIES})
  endif ()
endmacro()
5 changes: 5 additions & 0 deletions jvm-packages/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ target_include_directories(xgboost4j
${PROJECT_SOURCE_DIR}/dmlc-core/include
${PROJECT_SOURCE_DIR}/rabit/include)

# With Arrow support enabled, build the JVM binding with
# _GLIBCXX_USE_CXX11_ABI=0 (the pre-C++11 libstdc++ ABI).
if (ARROW_FOUND AND ARROW_PYTHON_FOUND)
  target_compile_definitions(xgboost4j PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
endif ()

set_output_directory(xgboost4j ${PROJECT_SOURCE_DIR}/lib)
set_target_properties(
xgboost4j PROPERTIES
Expand Down
1 change: 1 addition & 0 deletions python-package/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ def run(self):
'scikit-learn': ['scikit-learn'],
'dask': ['dask', 'pandas', 'distributed'],
'datatable': ['datatable'],
'pyarrow': ['pyarrow'],
'plotting': ['graphviz', 'matplotlib']
},
maintainer='Hyunsu Cho',
Expand Down
2 changes: 0 additions & 2 deletions python-package/xgboost/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,10 +202,8 @@ def from_json(self, doc):
# pyarrow (optional dependency)
# Expose the Arrow types under stable aliases. When pyarrow cannot be
# imported, both aliases fall back to `object` and PYARROW_INSTALLED is False.
try:
    from pyarrow import Table as ArrowTable, ChunkedArray as ArrowChunkedArray
except ImportError:
    ArrowTable = object
    ArrowChunkedArray = object
    PYARROW_INSTALLED = False
else:
    PYARROW_INSTALLED = True
2 changes: 1 addition & 1 deletion python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def set_info(self,
feature_names=None,
feature_types=None):
'''Set meta info for DMatrix.'''
if label is not None and not isinstance(label, str):
if label is not None:
self.set_label(label)
if weight is not None:
self.set_weight(weight)
Expand Down
24 changes: 0 additions & 24 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,27 +316,7 @@ def _is_arrow_table(data):
return lazy_isinstance(data, 'pyarrow.lib', 'Table')


def _transform_arrow_table(data, feature_names=None, feature_types=None,
meta=None, meta_type=None):
if meta:
data = data.column(meta).to_pandas()
if meta_type:
data = data.astype(meta_type)
return data, None, None

if feature_names is None:
feature_names = data.column_names

if feature_types is not None:
raise ValueError(
'ArrowTable has own feature types, cannot pass them in.')

return data, feature_names, feature_types


def _from_arrow_table(data, missing, nthread, feature_names, feature_types):
data, feature_names, feature_types = _transform_arrow_table(
data, feature_names, feature_types, None, None)
nthread = -1
handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixCreateFromArrowTable(
Expand Down Expand Up @@ -672,10 +652,6 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
if _is_dt_df(data):
_meta_from_dt(data, name, dtype, handle)
return
if _is_arrow_table(data):
data, _, _ = _transform_arrow_table(data, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _has_array_protocol(data):
pass
raise TypeError('Unsupported type for ' + name, str(type(data)))
Expand Down
16 changes: 14 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ if (USE_CUDA)
CUDA_SEPARABLE_COMPILATION OFF)
endif (USE_CUDA)

enable_arrow_if_available(objxgboost)

target_include_directories(objxgboost
PRIVATE
${xgboost_SOURCE_DIR}/include
Expand Down Expand Up @@ -97,6 +95,20 @@ if (XGBOOST_BUILTIN_PREFETCH_PRESENT)
-DXGBOOST_BUILTIN_PREFETCH_PRESENT=1)
endif (XGBOOST_BUILTIN_PREFETCH_PRESENT)

# Optional Arrow support for the core object library. When both Arrow and
# ArrowPython were found:
#   - define XGBOOST_BUILD_ARROW_SUPPORT=1 and _GLIBCXX_USE_CXX11_ABI=0 (PRIVATE)
#   - add the Arrow, ArrowPython and Python3 include directories (PRIVATE)
#   - link the Arrow, ArrowPython and Python3 libraries (PUBLIC, so they
#     propagate to targets linking objxgboost)
if (ARROW_FOUND AND ARROW_PYTHON_FOUND)
  target_compile_definitions(objxgboost PRIVATE
    XGBOOST_BUILD_ARROW_SUPPORT=1
    _GLIBCXX_USE_CXX11_ABI=0)
  target_include_directories(objxgboost PRIVATE
    ${ARROW_INCLUDE_DIR}
    ${ARROW_PYTHON_INCLUDE_DIR}
    ${Python3_INCLUDE_DIRS})
  target_link_libraries(objxgboost PUBLIC
    ${ARROW_SHARED_LIB}
    ${ARROW_PYTHON_SHARED_LIB}
    ${Python3_LIBRARIES})
endif ()

find_package(Threads REQUIRED)
target_link_libraries(objxgboost PUBLIC Threads::Threads ${CMAKE_THREAD_LIBS_INIT})

Expand Down
5 changes: 2 additions & 3 deletions tests/ci_build/Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@ ENV GOSU_VERSION 1.10
RUN \
pip install pyyaml cpplint pylint astroid sphinx numpy scipy pandas matplotlib sh \
recommonmark guzzle_sphinx_theme mock breathe graphviz hypothesis\
pytest scikit-learn wheel kubernetes urllib3 jsonschema boto3 && \
pytest scikit-learn wheel kubernetes urllib3 jsonschema boto3 pyarrow==0.17.* && \
pip install https://h2o-release.s3.amazonaws.com/datatable/stable/datatable-0.7.0/datatable-0.7.0-cp37-cp37m-linux_x86_64.whl && \
pip install "dask[complete]" && \
pip install pyarrow
pip install "dask[complete]"

# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
Expand Down
13 changes: 11 additions & 2 deletions tests/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ if (USE_DMLC_GTEST)
else (USE_DMLC_GTEST)
find_package(GTest REQUIRED)
endif (USE_DMLC_GTEST)

# With Arrow support enabled, compile an in-tree googletest with
# _GLIBCXX_USE_CXX11_ABI=0 (the pre-C++11 libstdc++ ABI).
#
# The TARGET guard is required: when googletest comes from
# find_package(GTest), no `gtest` target is built by this project, and
# target_compile_definitions() on a missing target is a configure-time error.
# NOTE(review): a pre-built GTest may then use a different libstdc++ ABI than
# testxgboost -- confirm the system GTest links cleanly in that configuration.
if (ARROW_FOUND AND ARROW_PYTHON_FOUND)
  if (TARGET gtest)
    target_compile_definitions(gtest PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
  endif ()
endif ()

file(GLOB_RECURSE TEST_SOURCES "*.cc")

if (USE_CUDA)
Expand All @@ -14,6 +20,11 @@ if (USE_CUDA)
endif (USE_CUDA)
add_executable(testxgboost ${TEST_SOURCES}
${xgboost_SOURCE_DIR}/plugin/example/custom_obj.cc)

# With Arrow support enabled, build the C++ test binary with
# _GLIBCXX_USE_CXX11_ABI=0 (the pre-C++11 libstdc++ ABI).
if (ARROW_FOUND AND ARROW_PYTHON_FOUND)
  target_compile_definitions(testxgboost PRIVATE _GLIBCXX_USE_CXX11_ABI=0)
endif ()
target_link_libraries(testxgboost PRIVATE objxgboost)

if (USE_CUDA)
Expand Down Expand Up @@ -56,8 +67,6 @@ if (USE_CUDA)
CUDA_STANDARD_REQUIRED ON)
endif (USE_CUDA)

enable_arrow_if_available(testxgboost)

if (MSVC)
target_compile_options(testxgboost PRIVATE
$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>
Expand Down
8 changes: 4 additions & 4 deletions tests/python/test_with_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ def test_arrow_table(self):
assert dm.num_col() == 4

def test_arrow_table_with_label(self):
df = pd.DataFrame([[0, 1, 2., 3.], [1, 2, 3., 4.]],
columns=['label', 'a', 'b', 'c'])
table = pa.Table.from_pandas(df)
dm = xgb.DMatrix(table, label='label')
df = pd.DataFrame([[1, 2., 3.], [2, 3., 4.]],
columns=['a', 'b', 'c'])
label = np.array([0, 1])
dm = xgb.DMatrix(df, label=label)
assert dm.num_row() == 2
assert dm.num_col() == 3
np.testing.assert_array_equal(dm.get_label(), np.array([0, 1]))
Expand Down

0 comments on commit f7260a4

Please sign in to comment.