Skip to content

Commit

Permalink
ARROW-222: Prototyping an IO interface for Arrow, with initial HDFS target

Browse files Browse the repository at this point in the history

- Switch Travis CI back to Ubuntu trusty (old Boost in precise has issues with
  C++11)
- Adapt SFrame libhdfs shim for arrow
- Create C++ public API within arrow::io to libhdfs
- Implement and test many functions in libhdfs
- Start Cython wrapper interface to arrow_io. Begin Python file-like interface,
  unit tests
- Add thirdparty hdfs.h so builds are possible without a local Hadoop distro
  (e.g. in Travis CI).

Change-Id: I4a46e50f6c1c22787baa3749d8a542216341e630
  • Loading branch information
wesm committed Jun 24, 2016
1 parent f7ade7b commit ef90830
Show file tree
Hide file tree
Showing 26 changed files with 3,656 additions and 61 deletions.
5 changes: 4 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sudo: required
dist: precise
dist: trusty
addons:
apt:
sources:
Expand All @@ -12,6 +12,9 @@ addons:
- ccache
- cmake
- valgrind
- libboost-dev
- libboost-filesystem-dev
- libboost-system-dev

matrix:
fast_finish: true
Expand Down
9 changes: 9 additions & 0 deletions NOTICE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Apache Arrow
Copyright 2016 The Apache Software Foundation

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

This product includes software from the SFrame project (BSD, 3-clause).
* Copyright (C) 2015 Dato, Inc.
* Copyright (c) 2009 Carnegie Mellon University.
15 changes: 12 additions & 3 deletions ci/travis_before_script_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,21 @@ echo $GTEST_HOME

: ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install}

CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DARROW_PARQUET=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL"
CMAKE_COMMON_FLAGS="\
-DARROW_BUILD_BENCHMARKS=ON \
-DARROW_PARQUET=ON \
-DARROW_HDFS=on \
-DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL"

if [ $TRAVIS_OS_NAME == "linux" ]; then
cmake -DARROW_TEST_MEMCHECK=on $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR
cmake -DARROW_TEST_MEMCHECK=on \
$CMAKE_COMMON_FLAGS \
-DCMAKE_CXX_FLAGS="-Werror" \
$CPP_DIR
else
cmake $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR
cmake $CMAKE_COMMON_FLAGS \
-DCMAKE_CXX_FLAGS="-Werror" \
$CPP_DIR
fi

make -j4
Expand Down
60 changes: 59 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
"Build the Arrow IPC extensions"
ON)

option(ARROW_HDFS
"Build the Arrow IO extensions for the Hadoop file system"
OFF)

option(ARROW_SSE3
"Build Arrow with SSE3"
ON)
Expand Down Expand Up @@ -454,6 +458,47 @@ if ("$ENV{GBENCHMARK_HOME}" STREQUAL "")
set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed)
endif()

# ----------------------------------------------------------------------
# Add Boost dependencies (code adapted from Apache Kudu (incubating))

# find boost headers and libs
# Boost_DEBUG makes the FindBoost module print its search steps; helpful
# for diagnosing CI environments, noisy for local builds.
set(Boost_DEBUG TRUE)
set(Boost_USE_MULTITHREADED ON)
# First pass: locate the *static* Boost libraries.
set(Boost_USE_STATIC_LIBS ON)
find_package(Boost COMPONENTS system filesystem REQUIRED)
include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
set(BOOST_STATIC_LIBS ${Boost_LIBRARIES})
list(LENGTH BOOST_STATIC_LIBS BOOST_STATIC_LIBS_LEN)

# Find Boost shared libraries.
# Second pass: same components, shared variants this time.
set(Boost_USE_STATIC_LIBS OFF)
find_package(Boost COMPONENTS system filesystem REQUIRED)
set(BOOST_SHARED_LIBS ${Boost_LIBRARIES})
list(LENGTH BOOST_SHARED_LIBS BOOST_SHARED_LIBS_LEN)
# NOTE(review): only the shared list is sorted, yet the loop below pairs
# BOOST_STATIC_LIBS[i] with BOOST_SHARED_LIBS[i]. If find_package returns
# components in non-sorted order the static/shared pairing would mismatch —
# confirm both lists line up (or sort both / neither).
list(SORT BOOST_SHARED_LIBS)

message(STATUS "Boost include dir: " ${Boost_INCLUDE_DIRS})
message(STATUS "Boost libraries: " ${Boost_LIBRARIES})

# Register each Boost component as a third-party library, exposing both its
# static and shared variants to the rest of the build.
math(EXPR LAST_IDX "${BOOST_STATIC_LIBS_LEN} - 1")
foreach(IDX RANGE ${LAST_IDX})
list(GET BOOST_STATIC_LIBS ${IDX} BOOST_STATIC_LIB)
list(GET BOOST_SHARED_LIBS ${IDX} BOOST_SHARED_LIB)

# Remove the prefix/suffix from the library name.
#
# e.g. libboost_system-mt --> boost_system
get_filename_component(LIB_NAME ${BOOST_STATIC_LIB} NAME_WE)
string(REGEX REPLACE "lib([^-]*)(-mt)?" "\\1" LIB_NAME_NO_PREFIX_SUFFIX ${LIB_NAME})
ADD_THIRDPARTY_LIB(${LIB_NAME_NO_PREFIX_SUFFIX}
STATIC_LIB "${BOOST_STATIC_LIB}"
SHARED_LIB "${BOOST_SHARED_LIB}")
list(APPEND ARROW_BOOST_LIBS ${LIB_NAME_NO_PREFIX_SUFFIX})
endforeach()
# NOTE(review): Boost_INCLUDE_DIR (singular) here vs Boost_INCLUDE_DIRS
# above — FindBoost defines both; this call looks redundant with the earlier
# include_directories. Confirm before removing either.
include_directories(SYSTEM ${Boost_INCLUDE_DIR})

# ----------------------------------------------------------------------
# Enable / disable tests and benchmarks

if(ARROW_BUILD_TESTS)
add_custom_target(unittest ctest -L unittest)
Expand Down Expand Up @@ -529,12 +574,24 @@ endif (UNIX)
# "make lint" target
############################################################
if (UNIX)

file(GLOB_RECURSE LINT_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/src/*.h"
"${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc"
)

FOREACH(item ${LINT_FILES})
IF(NOT (item MATCHES "_generated.h"))
LIST(APPEND FILTERED_LINT_FILES ${item})
ENDIF()
ENDFOREACH(item ${LINT_FILES})

# Full lint
add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py
--verbose=2
--linelength=90
--filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11,-runtime/references
`find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
${FILTERED_LINT_FILES})
endif (UNIX)


Expand Down Expand Up @@ -624,6 +681,7 @@ set_target_properties(arrow
target_link_libraries(arrow ${LIBARROW_LINK_LIBS})

add_subdirectory(src/arrow)
add_subdirectory(src/arrow/io)
add_subdirectory(src/arrow/util)
add_subdirectory(src/arrow/types)

Expand Down
39 changes: 39 additions & 0 deletions cpp/doc/HDFS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
## Using Arrow's HDFS (Apache Hadoop Distributed File System) interface

### Build requirements

To build the integration, pass the following option to CMake

```shell
-DARROW_HDFS=on
```

For convenience, we have bundled `hdfs.h` for libhdfs from Apache Hadoop in
Arrow's thirdparty. If you wish to build against the `hdfs.h` in your installed
Hadoop distribution, set the `$HADOOP_HOME` environment variable.

### Runtime requirements

By default, the HDFS client C++ class in `libarrow_io` uses the libhdfs JNI
interface to the Java Hadoop client. This library is loaded **at runtime**
(rather than at link / library load time, since the library may not be in your
LD_LIBRARY_PATH), and relies on some environment variables.

* `HADOOP_HOME`: the root of your installed Hadoop distribution. Check in the
`lib/native` directory to look for `libhdfs.so` if you have any questions
about which directory you're after.
* `JAVA_HOME`: the location of your Java SDK installation
* `CLASSPATH`: must contain the Hadoop jars. You can set these using:

```shell
export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
```

#### Setting $JAVA_HOME automatically on OS X

The installed location of Java on OS X can vary, however the following snippet
will set it automatically for you:

```shell
export JAVA_HOME=$(/usr/libexec/java_home)
```
97 changes: 97 additions & 0 deletions cpp/src/arrow/io/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# ----------------------------------------------------------------------
# arrow_io : Arrow IO interfaces

# Libraries in the public link interface of arrow_io: consumers need them too.
set(ARROW_IO_LINK_LIBS
  arrow
)

# Boost is an implementation detail of arrow_io; keep it out of the public
# link interface.
set(ARROW_IO_PRIVATE_LINK_LIBS
  boost_system
  boost_filesystem
)

set(ARROW_IO_TEST_LINK_LIBS
  arrow_io
  ${ARROW_IO_PRIVATE_LINK_LIBS})

# Populated below depending on which IO backends are enabled.
set(ARROW_IO_SRCS
)

if(ARROW_HDFS)
  if(NOT THIRDPARTY_DIR)
    message(FATAL_ERROR "THIRDPARTY_DIR not set")
  endif()

  # Prefer the user's Hadoop distribution for hdfs.h; fall back to the
  # bundled thirdparty copy so builds work without a local Hadoop install.
  if(DEFINED ENV{HADOOP_HOME})
    set(HADOOP_HOME $ENV{HADOOP_HOME})
  else()
    set(HADOOP_HOME "${THIRDPARTY_DIR}/hadoop")
  endif()

  set(HDFS_H_PATH "${HADOOP_HOME}/include/hdfs.h")
  # Quoted: HADOOP_HOME may contain spaces.
  if(NOT EXISTS "${HDFS_H_PATH}")
    message(FATAL_ERROR "Did not find hdfs.h at ${HDFS_H_PATH}")
  endif()
  message(STATUS "Found hdfs.h at: ${HDFS_H_PATH}")
  message(STATUS "Building libhdfs shim component")

  include_directories(SYSTEM "${HADOOP_HOME}/include")

  set(ARROW_HDFS_SRCS
    hdfs.cc
    libhdfs_shim.cc)

  # HAS_HADOOP gates the HDFS code paths in these sources only. The leading
  # space matters: APPEND_STRING concatenates without a separator, so without
  # it the define would fuse onto any pre-existing COMPILE_FLAGS.
  set_property(SOURCE ${ARROW_HDFS_SRCS}
    APPEND_STRING PROPERTY
    COMPILE_FLAGS " -DHAS_HADOOP")

  # ARROW_IO_SRCS is empty before this point, so appending preserves the
  # original ordering.
  list(APPEND ARROW_IO_SRCS ${ARROW_HDFS_SRCS})

  ADD_ARROW_TEST(hdfs-io-test)
  ARROW_TEST_LINK_LIBRARIES(hdfs-io-test
    ${ARROW_IO_TEST_LINK_LIBS})
endif()

add_library(arrow_io SHARED
  ${ARROW_IO_SRCS}
)
target_link_libraries(arrow_io LINK_PUBLIC ${ARROW_IO_LINK_LIBS})
target_link_libraries(arrow_io LINK_PRIVATE ${ARROW_IO_PRIVATE_LINK_LIBS})

# ARROW_IO_SRCS can be empty (no backend enabled), in which case CMake cannot
# infer the linker language from the source list — state it explicitly.
set_target_properties(arrow_io PROPERTIES LINKER_LANGUAGE CXX)

if(APPLE)
  set_target_properties(arrow_io
    PROPERTIES
    BUILD_WITH_INSTALL_RPATH ON
    INSTALL_NAME_DIR "@rpath")
endif()

# Headers: top level
# NOTE(review): hdfs.h is installed even when ARROW_HDFS is off — confirm
# whether installation should be conditional on the option.
install(FILES
  hdfs.h
  interfaces.h
  DESTINATION include/arrow/io)

install(TARGETS arrow_io
  LIBRARY DESTINATION lib
  ARCHIVE DESTINATION lib)
Loading

0 comments on commit ef90830

Please sign in to comment.