From eca15b93925a29cf0c9b0f147812b40546b56c84 Mon Sep 17 00:00:00 2001
From: Thomas Newton
Date: Tue, 20 Jun 2023 17:09:00 +0100
Subject: [PATCH] Shefali pr rebased (#6)

* ARROW-2034: [C++] Filesystem implementation for AzureBlobFileSystem

* ARROW-2034: [C++] Fixed formatting issues

* ARROW-2034: [C++] Fixed formatting issues

* Added -DARROW_AZURE in ci

* Added CXX_STANDARD and CXX_STANDARD_REQUIRED

* Added mocked test file

* Turned -DARROW_AZURE=OFF in appveyor-cpp-build

* Changed default C++ version

* Changed LibXml2 target

* Fixing CMake styling issues

* Enabling ARROW_AZURE flag

* Added OpenSSL dependency

* Disabling ARROW_AZURE in windows-mingw

* Fixing lint issues

* Fixing azurefs_test

* Added Azurite

* Added azurefs_objlib

* Reverting azure object library changes

* Added permissions to install_azurite.sh

* chmod +x ci/scripts/install_azurite.sh

* Don't specify CMAKE_CXX_STANDARD by default

* Fix system detection

* Fix syntax

* Fix style

* Fix style

* Running azurite through boost::process

* Fixed naming in azurefs_test.cc

* Fixed naming in azurefs.cc

* Fixed OpenOutputStream

* Added uri.Parse()

* Updated versions.txt

* Fixed ARROW_AZURE_STORAGE_BLOBS_URL

* Added libxml2-dev

* Fixed build errors

---------

Co-authored-by: shefali singh
Co-authored-by: Sutou Kouhei
---
 ci/appveyor-cpp-build.bat                   |   1 +
 ci/docker/ubuntu-20.04-cpp.dockerfile       |   5 +
 ci/docker/ubuntu-22.04-cpp.dockerfile       |   5 +
 ci/scripts/install_azurite.sh               |  37 ++++
 cpp/Brewfile                                |   2 +
 cpp/CMakeLists.txt                          |   5 +
 cpp/cmake_modules/ThirdpartyToolchain.cmake | 197 ++++++++++++++++++++
 cpp/thirdparty/versions.txt                 |  10 +
 cpp/vcpkg.json                              |   5 +
 9 files changed, 267 insertions(+)
 create mode 100755 ci/scripts/install_azurite.sh

diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat
index caadaab90b9aa..67be2f111c903 100644
--- a/ci/appveyor-cpp-build.bat
+++ b/ci/appveyor-cpp-build.bat
@@ -60,6 +60,7 @@ pushd cpp\build
 
 cmake -G "%GENERATOR%" %CMAKE_ARGS% ^
       -DARROW_ACERO=ON ^
+      -DARROW_AZURE=OFF ^
       -DARROW_BOOST_USE_SHARED=ON ^
       -DARROW_BUILD_EXAMPLES=ON ^
       -DARROW_BUILD_STATIC=OFF ^
diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile
index f94494177e8ee..bb487f6a5f7d8 100644
--- a/ci/docker/ubuntu-20.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-20.04-cpp.dockerfile
@@ -99,10 +99,12 @@ RUN apt-get update -y -q && \
         libssl-dev \
         libthrift-dev \
         libutf8proc-dev \
+        libxml2-dev \
         libzstd-dev \
         make \
         ninja-build \
         nlohmann-json3-dev \
+        npm \
         pkg-config \
         protobuf-compiler \
         python3-dev \
@@ -123,6 +125,9 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local
 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_gcs_testbench.sh default
 
+COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_azurite.sh
+
 COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_ceph.sh
 
diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile
index e773c6f1ee659..35c9561c453f5 100644
--- a/ci/docker/ubuntu-22.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-22.04-cpp.dockerfile
@@ -98,10 +98,12 @@ RUN apt-get update -y -q && \
         libssl-dev \
         libthrift-dev \
         libutf8proc-dev \
+        libxml2-dev \
         libzstd-dev \
         make \
         ninja-build \
         nlohmann-json3-dev \
+        npm \
         pkg-config \
         protobuf-compiler \
         protobuf-compiler-grpc \
@@ -156,6 +158,9 @@ RUN /arrow/ci/scripts/install_gcs_testbench.sh default
 COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin
 
+COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_azurite.sh
+
 # Prioritize system packages and local installation
 # The following dependencies will be downloaded due to missing/invalid packages
 # provided by the distribution:
diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh
new file mode 100755
index 0000000000000..2e7008360fdc3
--- /dev/null
+++ b/ci/scripts/install_azurite.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Install Azurite (the Azure Storage emulator) as a global npm package.
+# With "set -e", a failing "which azurite" aborts the script early.
+
+set -e
+
+case "$(uname)" in
+  Darwin | Linux)
+    npm install -g azurite
+    which azurite
+    ;;
+  MINGW*)
+    # presumably nodejs.install is what provides npm on Windows runners
+    choco install nodejs.install
+    npm install -g azurite
+    ;;
+esac
+echo "node version = $(node --version)"
+echo "azurite version = $(azurite --version)"
diff --git a/cpp/Brewfile b/cpp/Brewfile
index 580e8d3f115d5..8a0d143d476b8 100644
--- a/cpp/Brewfile
+++ b/cpp/Brewfile
@@ -30,6 +30,8 @@ brew "grpc"
 brew "llvm@14"
 brew "lz4"
 brew "ninja"
+brew "node"
+brew "numpy"
 brew "openssl@3"
 brew "protobuf"
 brew "python"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 301c919667db6..c89fa7d70dba3 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -795,6 +795,11 @@ if(ARROW_WITH_OPENTELEMETRY)
   list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS CURL::libcurl)
 endif()
 
+if(ARROW_AZURE)
+  list(APPEND ARROW_SHARED_LINK_LIBS ${AZURESDK_LINK_LIBRARIES})
+  list(APPEND ARROW_STATIC_LINK_LIBS ${AZURESDK_LINK_LIBRARIES})
+endif()
+
 if(ARROW_WITH_UTF8PROC)
   list(APPEND ARROW_SHARED_LINK_LIBS utf8proc::utf8proc)
   list(APPEND ARROW_STATIC_LINK_LIBS utf8proc::utf8proc)
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 635bc1684e6f2..b0d8e109883c6 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -548,6 +548,47 @@ else()
            "${THIRDPARTY_MIRROR_URL}/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz")
 endif()
 
+if(DEFINED ENV{ARROW_AZURE_CORE_URL})
+  set(AZURE_CORE_SOURCE_URL "$ENV{ARROW_AZURE_CORE_URL}")
+else()
+  set_urls(AZURE_CORE_SOURCE_URL
+           "https://github.com/Azure/azure-sdk-for-cpp/archive/azure-core_${ARROW_AZURE_CORE_BUILD_VERSION}.tar.gz"
+  )
+endif()
+
+if(DEFINED ENV{ARROW_AZURE_IDENTITY_URL})
+  set(AZURE_IDENTITY_SOURCE_URL "$ENV{ARROW_AZURE_IDENTITY_URL}")
+else()
+  set_urls(AZURE_IDENTITY_SOURCE_URL
+           "https://github.com/Azure/azure-sdk-for-cpp/archive/azure-identity_${ARROW_AZURE_IDENTITY_BUILD_VERSION}.tar.gz"
+  )
+endif()
+
+if(DEFINED ENV{ARROW_AZURE_STORAGE_BLOBS_URL})
+  set(AZURE_STORAGE_BLOBS_SOURCE_URL "$ENV{ARROW_AZURE_STORAGE_BLOBS_URL}")
+else()
+  set_urls(AZURE_STORAGE_BLOBS_SOURCE_URL
+           "https://github.com/Azure/azure-sdk-for-cpp/archive/azure-storage-blobs_${ARROW_AZURE_STORAGE_BLOBS_BUILD_VERSION}.tar.gz"
+  )
+endif()
+
+if(DEFINED ENV{ARROW_AZURE_STORAGE_COMMON_URL})
+  set(AZURE_STORAGE_COMMON_SOURCE_URL "$ENV{ARROW_AZURE_STORAGE_COMMON_URL}")
+else()
+  set_urls(AZURE_STORAGE_COMMON_SOURCE_URL
+           "https://github.com/Azure/azure-sdk-for-cpp/archive/azure-storage-common_${ARROW_AZURE_STORAGE_COMMON_BUILD_VERSION}.tar.gz"
+  )
+endif()
+
+if(DEFINED ENV{ARROW_AZURE_STORAGE_FILES_DATALAKE_URL})
+  set(AZURE_STORAGE_FILES_DATALAKE_SOURCE_URL
+      "$ENV{ARROW_AZURE_STORAGE_FILES_DATALAKE_URL}")
+else()
+  set_urls(AZURE_STORAGE_FILES_DATALAKE_SOURCE_URL
+           "https://github.com/Azure/azure-sdk-for-cpp/archive/azure-storage-files-datalake_${ARROW_AZURE_STORAGE_FILES_DATALAKE_BUILD_VERSION}.tar.gz"
+  )
+endif()
+
 if(DEFINED ENV{ARROW_BOOST_URL})
   set(BOOST_SOURCE_URL "$ENV{ARROW_BOOST_URL}")
 else()
@@ -5042,6 +5083,162 @@ if(ARROW_S3)
   endif()
 endif()
 
+# Build the Azure SDK for C++ from source as static libraries.
+#
+# Creates one ExternalProject plus one imported Azure::azure-* target for each
+# of: core, identity, storage-blobs, storage-common, storage-files-datalake.
+# Sets AZURESDK_INCLUDE_DIR and AZURESDK_LINK_LIBRARIES in the caller's scope
+# and appends the imported targets to ARROW_BUNDLED_STATIC_LIBS.
+macro(build_azuresdk)
+  message(STATUS "Building Azure C++ SDK from source")
+
+  find_curl()
+  find_package(LibXml2 REQUIRED)
+  find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED)
+
+  set(AZURESDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/azuresdk_ep-install")
+  set(AZURESDK_INCLUDE_DIR "${AZURESDK_PREFIX}/include")
+  set(AZURESDK_LIB_DIR "lib")
+
+  # provide hint for Azure SDK to link with the already located openssl
+  get_filename_component(OPENSSL_ROOT_HINT "${OPENSSL_INCLUDE_DIR}" DIRECTORY)
+
+  set(AZURESDK_COMMON_CMAKE_ARGS
+      ${EP_COMMON_CMAKE_ARGS}
+      "-DCMAKE_INSTALL_PREFIX=${AZURESDK_PREFIX}"
+      "-DCMAKE_PREFIX_PATH=${AZURESDK_PREFIX}"
+      -DBUILD_SHARED_LIBS=OFF
+      -DCMAKE_INSTALL_LIBDIR=${AZURESDK_LIB_DIR}
+      -DDISABLE_AZURE_CORE_OPENTELEMETRY=ON
+      -DENABLE_TESTING=OFF
+      -DENABLE_UNITY_BUILD=ON
+      -DOPENSSL_ROOT_DIR=${OPENSSL_ROOT_HINT}
+      -DWARNINGS_AS_ERRORS=OFF)
+
+  file(MAKE_DIRECTORY ${AZURESDK_INCLUDE_DIR})
+
+  # NOTE(review): the five external projects below share one install prefix
+  # and declare no DEPENDS on each other; confirm no build order is required.
+  set(AZURE_CORE_STATIC_LIBRARY
+      "${AZURESDK_PREFIX}/${AZURESDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}azure-core${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+  externalproject_add(azure_core_ep
+                      ${EP_LOG_OPTIONS}
+                      INSTALL_DIR ${AZURESDK_PREFIX}
+                      URL ${AZURE_CORE_SOURCE_URL}
+                      URL_HASH "SHA256=${ARROW_AZURE_CORE_BUILD_SHA256_CHECKSUM}"
+                      CMAKE_ARGS ${AZURESDK_COMMON_CMAKE_ARGS}
+                      BUILD_BYPRODUCTS ${AZURE_CORE_STATIC_LIBRARY})
+  add_library(Azure::azure-core STATIC IMPORTED)
+  set_target_properties(Azure::azure-core
+                        PROPERTIES IMPORTED_LOCATION "${AZURE_CORE_STATIC_LIBRARY}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${AZURESDK_INCLUDE_DIR}")
+  target_link_libraries(Azure::azure-core INTERFACE CURL::libcurl LibXml2::LibXml2)
+  add_dependencies(Azure::azure-core azure_core_ep)
+
+  set(AZURE_IDENTITY_STATIC_LIBRARY
+      "${AZURESDK_PREFIX}/${AZURESDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}azure-identity${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+  externalproject_add(azure_identity_ep
+                      ${EP_LOG_OPTIONS}
+                      INSTALL_DIR ${AZURESDK_PREFIX}
+                      URL ${AZURE_IDENTITY_SOURCE_URL}
+                      URL_HASH "SHA256=${ARROW_AZURE_IDENTITY_BUILD_SHA256_CHECKSUM}"
+                      CMAKE_ARGS ${AZURESDK_COMMON_CMAKE_ARGS}
+                      BUILD_BYPRODUCTS ${AZURE_IDENTITY_STATIC_LIBRARY})
+  add_library(Azure::azure-identity STATIC IMPORTED)
+  set_target_properties(Azure::azure-identity
+                        PROPERTIES IMPORTED_LOCATION "${AZURE_IDENTITY_STATIC_LIBRARY}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${AZURESDK_INCLUDE_DIR}")
+  target_link_libraries(Azure::azure-identity INTERFACE CURL::libcurl LibXml2::LibXml2)
+  add_dependencies(Azure::azure-identity azure_identity_ep)
+
+  set(AZURE_STORAGE_BLOBS_STATIC_LIBRARY
+      "${AZURESDK_PREFIX}/${AZURESDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}azure-storage-blobs${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+  externalproject_add(azure_storage_blobs_ep
+                      ${EP_LOG_OPTIONS}
+                      INSTALL_DIR ${AZURESDK_PREFIX}
+                      URL ${AZURE_STORAGE_BLOBS_SOURCE_URL}
+                      URL_HASH "SHA256=${ARROW_AZURE_STORAGE_BLOBS_BUILD_SHA256_CHECKSUM}"
+                      CMAKE_ARGS ${AZURESDK_COMMON_CMAKE_ARGS}
+                      BUILD_BYPRODUCTS ${AZURE_STORAGE_BLOBS_STATIC_LIBRARY})
+  add_library(Azure::azure-storage-blobs STATIC IMPORTED)
+  set_target_properties(Azure::azure-storage-blobs
+                        PROPERTIES IMPORTED_LOCATION
+                                   "${AZURE_STORAGE_BLOBS_STATIC_LIBRARY}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${AZURESDK_INCLUDE_DIR}")
+  target_link_libraries(Azure::azure-storage-blobs
+                        INTERFACE Azure::azure-core CURL::libcurl LibXml2::LibXml2)
+  add_dependencies(Azure::azure-storage-blobs azure_storage_blobs_ep)
+
+  set(AZURE_STORAGE_COMMON_STATIC_LIBRARY
+      "${AZURESDK_PREFIX}/${AZURESDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}azure-storage-common${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+  externalproject_add(azure_storage_common_ep
+                      ${EP_LOG_OPTIONS}
+                      INSTALL_DIR ${AZURESDK_PREFIX}
+                      URL ${AZURE_STORAGE_COMMON_SOURCE_URL}
+                      URL_HASH "SHA256=${ARROW_AZURE_STORAGE_COMMON_BUILD_SHA256_CHECKSUM}"
+                      CMAKE_ARGS ${AZURESDK_COMMON_CMAKE_ARGS}
+                      BUILD_BYPRODUCTS ${AZURE_STORAGE_COMMON_STATIC_LIBRARY})
+  add_library(Azure::azure-storage-common STATIC IMPORTED)
+  set_target_properties(Azure::azure-storage-common
+                        PROPERTIES IMPORTED_LOCATION
+                                   "${AZURE_STORAGE_COMMON_STATIC_LIBRARY}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${AZURESDK_INCLUDE_DIR}")
+  # OpenSSL::Crypto goes into the same call as the other interface libraries;
+  # overwriting INTERFACE_LINK_LIBRARIES afterwards would drop curl and libxml2.
+  target_link_libraries(Azure::azure-storage-common
+                        INTERFACE CURL::libcurl LibXml2::LibXml2 OpenSSL::Crypto)
+  add_dependencies(Azure::azure-storage-common azure_storage_common_ep)
+
+  set(AZURE_STORAGE_FILES_DATALAKE_STATIC_LIBRARY
+      "${AZURESDK_PREFIX}/${AZURESDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}azure-storage-files-datalake${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+  externalproject_add(azure_storage_files_datalake_ep
+                      ${EP_LOG_OPTIONS}
+                      INSTALL_DIR ${AZURESDK_PREFIX}
+                      URL ${AZURE_STORAGE_FILES_DATALAKE_SOURCE_URL}
+                      URL_HASH "SHA256=${ARROW_AZURE_STORAGE_FILES_DATALAKE_BUILD_SHA256_CHECKSUM}"
+                      CMAKE_ARGS ${AZURESDK_COMMON_CMAKE_ARGS}
+                      BUILD_BYPRODUCTS ${AZURE_STORAGE_FILES_DATALAKE_STATIC_LIBRARY})
+  add_library(Azure::azure-storage-files-datalake STATIC IMPORTED)
+  set_target_properties(Azure::azure-storage-files-datalake
+                        PROPERTIES IMPORTED_LOCATION
+                                   "${AZURE_STORAGE_FILES_DATALAKE_STATIC_LIBRARY}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${AZURESDK_INCLUDE_DIR}")
+  target_link_libraries(Azure::azure-storage-files-datalake
+                        INTERFACE Azure::azure-core
+                                  Azure::azure-identity
+                                  Azure::azure-storage-blobs
+                                  Azure::azure-storage-common
+                                  CURL::libcurl
+                                  LibXml2::LibXml2)
+  add_dependencies(Azure::azure-storage-files-datalake azure_storage_files_datalake_ep)
+
+  set(AZURESDK_LIBRARIES
+      Azure::azure-core
+      Azure::azure-storage-blobs
+      Azure::azure-identity
+      Azure::azure-storage-common
+      Azure::azure-storage-files-datalake)
+  list(APPEND ARROW_BUNDLED_STATIC_LIBS ${AZURESDK_LIBRARIES})
+
+  set(AZURESDK_LINK_LIBRARIES ${AZURESDK_LIBRARIES})
+endmacro()
+
+if(ARROW_AZURE)
+  build_azuresdk()
+  message(STATUS "Found Azure SDK headers: ${AZURESDK_INCLUDE_DIR}")
+  message(STATUS "Found Azure SDK libraries: ${AZURESDK_LINK_LIBRARIES}")
+endif()
+
 # ----------------------------------------------------------------------
 # ucx - communication framework for modern, high-bandwidth and low-latency networks
 
diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt
index c05ff4228462c..fd88a5010f602 100644
--- a/cpp/thirdparty/versions.txt
+++ b/cpp/thirdparty/versions.txt
@@ -53,6 +53,16 @@ ARROW_AWS_LC_BUILD_VERSION=v1.3.0
 ARROW_AWS_LC_BUILD_SHA256_CHECKSUM=ae96a3567161552744fc0cae8b4d68ed88b1ec0f3d3c98700070115356da5a37
 ARROW_AWSSDK_BUILD_VERSION=1.10.55
 ARROW_AWSSDK_BUILD_SHA256_CHECKSUM=2d552fb1a84bef4a9b65e34aa7031851ed2aef5319e02cc6e4cb735c48aa30de
+ARROW_AZURE_CORE_BUILD_VERSION=1.7.1
+ARROW_AZURE_CORE_BUILD_SHA256_CHECKSUM=ae6f03e65d9773d11cf3b9619d0bc7f567272974cf31b9e1c8ca2fa0ea4fb4c6
+ARROW_AZURE_IDENTITY_BUILD_VERSION=1.3.0
+ARROW_AZURE_IDENTITY_BUILD_SHA256_CHECKSUM=46701acd8000f317d1c4b33263d5d3203924fadcfa5af4860ae9187046a72c45
+ARROW_AZURE_STORAGE_BLOBS_BUILD_VERSION=12.5.0
+ARROW_AZURE_STORAGE_BLOBS_BUILD_SHA256_CHECKSUM=12394d864144ced9fc3562ad48cfe3426604e871b5aa72853ca398e086f0c594
+ARROW_AZURE_STORAGE_COMMON_BUILD_VERSION=12.2.4
+ARROW_AZURE_STORAGE_COMMON_BUILD_SHA256_CHECKSUM=7644b4355b492ba2039236b9fd56c3e7bb80aad983d8bac6a731d74aaf64e03f
+ARROW_AZURE_STORAGE_FILES_DATALAKE_BUILD_VERSION=12.3.1
+ARROW_AZURE_STORAGE_FILES_DATALAKE_BUILD_SHA256_CHECKSUM=a5b74076a751d7cfaf7c56674a40ce2792c4fab9add18758fab1fe091d00baff
 ARROW_BOOST_BUILD_VERSION=1.81.0
 ARROW_BOOST_BUILD_SHA256_CHECKSUM=9e0ffae35528c35f90468997bc8d99500bf179cbae355415a89a600c38e13574
 ARROW_BROTLI_BUILD_VERSION=v1.0.9
diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json
index f6e65b2b95f4a..2a52a4b59be01 100644
--- a/cpp/vcpkg.json
+++ b/cpp/vcpkg.json
@@ -14,6 +14,11 @@
         "transfer"
       ]
     },
+    "azure-core-cpp",
+    "azure-identity-cpp",
+    "azure-storage-blobs-cpp",
+    "azure-storage-common-cpp",
+    "azure-storage-files-datalake-cpp",
     "benchmark",
     "boost-filesystem",
     "boost-multiprecision",