diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 36121c7bc65..06e8c8fb19d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -615,7 +615,7 @@ endif(UNIX) set(ARROW_LINK_LIBS) # Libraries to link statically with libarrow.so -set(ARROW_STATIC_LINK_LIBS) +set(ARROW_STATIC_LINK_LIBS double-conversion) if (ARROW_WITH_BROTLI) SET(ARROW_STATIC_LINK_LIBS @@ -694,6 +694,7 @@ else () set(ARROW_MIN_TEST_LIBS arrow_shared ${ARROW_LINK_LIBS} + double-conversion ${BOOST_SYSTEM_LIBRARY} ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_REGEX_LIBRARY} diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index e25f954d296..19d14d855da 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -25,13 +25,13 @@ if (NOT "$ENV{ARROW_BUILD_TOOLCHAIN}" STREQUAL "") set(BROTLI_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(FLATBUFFERS_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(GFLAGS_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") + set(GLOG_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(GRPC_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") # Using gtest from the toolchain breaks AppVeyor builds if (NOT MSVC) set(GTEST_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") endif() set(JEMALLOC_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") - set(GRPC_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(LZ4_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") # orc disabled as it's not in conda-forge (but in Anaconda with an incompatible ABI) # set(ORC_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") @@ -41,7 +41,6 @@ if (NOT "$ENV{ARROW_BUILD_TOOLCHAIN}" STREQUAL "") set(THRIFT_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(ZLIB_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(ZSTD_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") - set(GLOG_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") if (NOT DEFINED ENV{BOOST_ROOT}) # Since we have to set this in the environment, we check whether @@ -56,6 +55,10 @@ if (DEFINED ENV{BROTLI_HOME}) set(BROTLI_HOME "$ENV{BROTLI_HOME}") endif() +if (DEFINED ENV{DOUBLE_CONVERSION_HOME}) + set(DOUBLE_CONVERSION_HOME "$ENV{DOUBLE_CONVERSION_HOME}") +endif() + if (DEFINED ENV{FLATBUFFERS_HOME}) set(FLATBUFFERS_HOME "$ENV{FLATBUFFERS_HOME}") endif() @@ -64,6 +67,10 @@ if (DEFINED ENV{GFLAGS_HOME}) set(GFLAGS_HOME "$ENV{GFLAGS_HOME}") endif() +if (DEFINED ENV{GLOG_HOME}) + set(GLOG_HOME "$ENV{GLOG_HOME}") +endif() + if (DEFINED ENV{GRPC_HOME}) set(GRPC_HOME "$ENV{GRPC_HOME}") endif() @@ -108,10 +115,6 @@ if (DEFINED ENV{ZSTD_HOME}) set(ZSTD_HOME "$ENV{ZSTD_HOME}") endif() -if (DEFINED ENV{GLOG_HOME}) - set(GLOG_HOME "$ENV{GLOG_HOME}") -endif() - # ---------------------------------------------------------------------- # Some EP's require other EP's @@ -165,6 +168,12 @@ else() set(BROTLI_SOURCE_URL "https://github.com/google/brotli/archive/${BROTLI_VERSION}.tar.gz") endif() +if (DEFINED ENV{DOUBLE_CONVERSION_SOURCE_URL}) + set(DOUBLE_CONVERSION_SOURCE_URL "$ENV{DOUBLE_CONVERSION_SOURCE_URL}") +else() + set(DOUBLE_CONVERSION_SOURCE_URL "https://github.com/google/double-conversion/archive/${DOUBLE_CONVERSION_VERSION}.tar.gz") +endif() + if (DEFINED ENV{ARROW_FLATBUFFERS_URL}) set(FLATBUFFERS_SOURCE_URL "$ENV{ARROW_FLATBUFFERS_URL}") else() @@ -183,6 +192,12 @@ else() set(GFLAGS_SOURCE_URL "https://github.com/gflags/gflags/archive/${GFLAGS_VERSION}.tar.gz") endif() +if (DEFINED ENV{ARROW_GLOG_URL}) + set(GLOG_SOURCE_URL "$ENV{ARROW_GLOG_URL}") +else() + set(GLOG_SOURCE_URL "https://github.com/google/glog/archive/${GLOG_VERSION}.tar.gz") +endif() + if (DEFINED ENV{ARROW_GRPC_URL}) set(GRPC_SOURCE_URL "$ENV{ARROW_GRPC_URL}") else() @@ -245,12 +260,6 @@ else() set(ZSTD_SOURCE_URL "https://github.com/facebook/zstd/archive/${ZSTD_VERSION}.tar.gz") endif() -if (DEFINED ENV{ARROW_GLOG_URL}) - set(GLOG_SOURCE_URL "$ENV{ARROW_GLOG_URL}") -else() - set(GLOG_SOURCE_URL "https://github.com/google/glog/archive/${GLOG_VERSION}.tar.gz") -endif() - # ---------------------------------------------------------------------- # ExternalProject options @@ -436,6 +445,43 @@ endif() include_directories(SYSTEM ${Boost_INCLUDE_DIR}) +# ---------------------------------------------------------------------- +# Google double-conversion + +if("${DOUBLE_CONVERSION_HOME}" STREQUAL "") + set(DOUBLE_CONVERSION_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/double-conversion_ep/src/double-conversion_ep") + set(DOUBLE_CONVERSION_HOME "${DOUBLE_CONVERSION_PREFIX}") + set(DOUBLE_CONVERSION_INCLUDE_DIR "${DOUBLE_CONVERSION_PREFIX}/include") + set(DOUBLE_CONVERSION_STATIC_LIB "${DOUBLE_CONVERSION_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}double-conversion${CMAKE_STATIC_LIBRARY_SUFFIX}") + + set(DOUBLE_CONVERSION_CMAKE_ARGS + "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" + "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}=${EP_CXX_FLAGS}" + "-DCMAKE_INSTALL_PREFIX=${DOUBLE_CONVERSION_PREFIX}") + ExternalProject_Add(double-conversion_ep + ${EP_LOG_OPTIONS} + INSTALL_DIR ${DOUBLE_CONVERSION_PREFIX} + URL ${DOUBLE_CONVERSION_SOURCE_URL} + CMAKE_ARGS ${DOUBLE_CONVERSION_CMAKE_ARGS} + BUILD_BYPRODUCTS "${DOUBLE_CONVERSION_STATIC_LIB}") + set(DOUBLE_CONVERSION_VENDORED 1) +else() + find_package(double-conversion REQUIRED) + set(DOUBLE_CONVERSION_VENDORED 0) +endif() + +include_directories(SYSTEM ${DOUBLE_CONVERSION_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(double-conversion + STATIC_LIB ${DOUBLE_CONVERSION_STATIC_LIB}) + +if (DOUBLE_CONVERSION_VENDORED) + add_dependencies(arrow_dependencies double-conversion_ep) +endif() + +# ---------------------------------------------------------------------- +# Google gtest & gflags + if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) add_custom_target(unittest ctest -L unittest) diff --git a/cpp/src/arrow/symbols.map b/cpp/src/arrow/symbols.map index 437299934e1..69717f089d8 100644 --- a/cpp/src/arrow/symbols.map +++ b/cpp/src/arrow/symbols.map @@ -72,6 +72,7 @@ # Statically linked C++ dependencies boost::*; + double_conversion::*; google::*; # glog *MakeCheckOpValueString*; diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 8b7ed7ad43e..d3d60fb9ffd 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -19,11 +19,40 @@ # arrow_util ####################################### -file(GLOB_RECURSE ARROW_UTIL_HEADERS "*.h") - # Headers: top level install(FILES - ${ARROW_UTIL_HEADERS} + bit-stream-utils.h + bit-util.h + bpacking.h + checked_cast.h + compiler-util.h + compression.h + compression_brotli.h + compression_lz4.h + compression_snappy.h + compression_zlib.h + compression_zstd.h + cpu-info.h + decimal.h + hash-util.h + hash.h + io-util.h + key_value_metadata.h + lazy.h + logging.h + macros.h + memory.h + parallel.h + rle-encoding.h + sse-util.h + stl.h + stopwatch.h + string.h + thread-pool.h + type_traits.h + variant.h + visibility.h + windows_compatibility.h DESTINATION include/arrow/util) ####################################### diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h index 4f2dc7894ab..04aff1e68fa 100644 --- a/cpp/src/arrow/util/parsing.h +++ b/cpp/src/arrow/util/parsing.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +// This is a private header for string-to-number parsing utilitiers + #ifndef ARROW_UTIL_PARSING_H #define ARROW_UTIL_PARSING_H @@ -24,6 +26,8 @@ #include #include +#include + #include "arrow/type.h" #include "arrow/type_traits.h" @@ -79,7 +83,7 @@ class StringConverter { // Ideas for faster float parsing: // - http://rapidjson.org/md_doc_internals.html#ParsingDouble -// - https://github.com/google/double-conversion +// - https://github.com/google/double-conversion [used here] // - https://github.com/achan001/dtoa-fast template @@ -87,18 +91,49 @@ class StringToFloatConverterMixin { public: using value_type = typename ARROW_TYPE::c_type; - StringToFloatConverterMixin() { ibuf.imbue(std::locale::classic()); } + StringToFloatConverterMixin() + : main_converter_(flags_, main_junk_value_, main_junk_value_, "inf", "nan"), + fallback_converter_(flags_, fallback_junk_value_, fallback_junk_value_, "inf", + "nan") {} bool operator()(const char* s, size_t length, value_type* out) { - ibuf.clear(); - ibuf.str(std::string(s, length)); - ibuf >> *out; - // XXX Should we reset errno on failure? - return !ibuf.fail() && ibuf.eof(); + value_type v; + // double-conversion doesn't give us an error flag but signals parse + // errors with sentinel values. Since a sentinel value can appear as + // legitimate input, we fallback on a second converter with a different + // sentinel to eliminate false errors. + TryConvert(main_converter_, s, length, &v); + if (ARROW_PREDICT_FALSE(v == static_cast(main_junk_value_))) { + TryConvert(fallback_converter_, s, length, &v); + if (ARROW_PREDICT_FALSE(v == static_cast(fallback_junk_value_))) { + return false; + } + } + *out = v; + return true; } protected: - std::istringstream ibuf; + static const int flags_ = + double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY; + // Two unlikely values to signal a parsing error + static constexpr double main_junk_value_ = 0.7066424364107089; + static constexpr double fallback_junk_value_ = 0.40088499148279166; + + double_conversion::StringToDoubleConverter main_converter_; + double_conversion::StringToDoubleConverter fallback_converter_; + + inline void TryConvert(double_conversion::StringToDoubleConverter& converter, + const char* s, size_t length, float* out) { + int processed_length; + *out = converter.StringToFloat(s, static_cast(length), &processed_length); + } + + inline void TryConvert(double_conversion::StringToDoubleConverter& converter, + const char* s, size_t length, double* out) { + int processed_length; + *out = converter.StringToDouble(s, static_cast(length), &processed_length); + } }; template <> diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 381282bd801..711e553176c 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -18,19 +18,20 @@ # Toolchain library versions BOOST_VERSION=1.67.0 -GTEST_VERSION=1.8.0 -GFLAGS_VERSION=v2.2.0 -GBENCHMARK_VERSION=v1.4.1 +BROTLI_VERSION=v0.6.0 +DOUBLE_CONVERSION_VERSION=v3.1.1 FLATBUFFERS_VERSION=02a7807dd8d26f5668ffbbec0360dc107bbfabd5 -RAPIDJSON_VERSION=v1.1.0 +GBENCHMARK_VERSION=v1.4.1 +GFLAGS_VERSION=v2.2.0 +GLOG_VERSION=v0.3.5 +GRPC_VERSION=1.14.1 +GTEST_VERSION=1.8.0 JEMALLOC_VERSION=17c897976c60b0e6e4f4a365c751027244dada7a -SNAPPY_VERSION=1.1.3 -BROTLI_VERSION=v0.6.0 LZ4_VERSION=v1.7.5 -ZLIB_VERSION=1.2.8 -ZSTD_VERSION=v1.2.0 -PROTOBUF_VERSION=v3.6.1 -GRPC_VERSION=1.14.1 ORC_VERSION=1.5.1 +PROTOBUF_VERSION=v3.6.1 +RAPIDJSON_VERSION=v1.1.0 +SNAPPY_VERSION=1.1.3 THRIFT_VERSION=0.11.0 -GLOG_VERSION=v0.3.5 +ZLIB_VERSION=1.2.8 +ZSTD_VERSION=v1.2.0