diff --git a/.clang-format b/.clang-format index abd823c1039..0979e8160ce 100644 --- a/.clang-format +++ b/.clang-format @@ -20,3 +20,4 @@ ColumnLimit: 90 DerivePointerAlignment: false IncludeBlocks: Preserve IndentPPDirectives: AfterHash +QualifierAlignment: Left diff --git a/.clang-tidy b/.clang-tidy index ebb75c859e0..9aa404db08c 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -27,7 +27,6 @@ Checks: | # produce HeaderFilterRegex from cpp/build-support/lint_exclusions.txt with: # echo -n '^('; sed -e 's/*/\.*/g' cpp/build-support/lint_exclusions.txt | tr '\n' '|'; echo ')$' HeaderFilterRegex: '^(.*codegen.*|.*_generated.*|.*windows_compatibility.h|.*pyarrow_api.h|.*pyarrow_lib.h|.*python/config.h|.*python/platform.h|.*thirdparty/ae/.*|.*vendored/.*|.*RcppExports.cpp.*|)$' -AnalyzeTemporaryDtors: true CheckOptions: - key: google-readability-braces-around-statements.ShortStatementLines value: '1' diff --git a/.editorconfig b/.editorconfig index 999f94bae00..67b30b62ad4 100644 --- a/.editorconfig +++ b/.editorconfig @@ -50,15 +50,6 @@ indent_style = space indent_size = 2 indent_style = space -[*.go] -indent_size = 8 -indent_style = tab -tab_width = 8 - -[*.{js,ts}] -indent_size = 4 -indent_style = space - [*.{py,pyx,pxd,pxi}] indent_size = 4 indent_style = space diff --git a/.env b/.env index d2badc1e342..5951b3c95f6 100644 --- a/.env +++ b/.env @@ -37,6 +37,10 @@ DOCKER_BUILDKIT=1 ARCH=amd64 ARCH_ALIAS=x86_64 ARCH_SHORT=amd64 +# For aarch64 +# ARCH=arm64v8 +# ARCH_ALIAS=aarch64 +# ARCH_SHORT=arm64 # Default repository to pull and push images from REPO=apache/arrow-dev @@ -47,14 +51,14 @@ ULIMIT_CORE=-1 # Default versions for platforms ALMALINUX=8 -ALPINE_LINUX=3.18 +ALPINE_LINUX=3.22 DEBIAN=12 -FEDORA=39 +FEDORA=42 UBUNTU=22.04 # Default versions for various dependencies CLANG_TOOLS=14 -CMAKE=3.25.0 +CMAKE=3.26.0 CUDA=11.7.1 DASK=latest DOTNET=8.0 @@ -65,14 +69,15 @@ KARTOTHEK=latest # LLVM 12 and GCC 11 reports -Wmismatched-new-delete. 
LLVM=18 MAVEN=3.8.7 -NODE=18 +NODE=20 NUMBA=latest +NUMBA_CUDA=latest NUMPY=latest PANDAS=latest -PYTHON=3.9 -PYTHON_IMAGE_TAG=3.9 -PYTHON_ABI_TAG=cp39 -R=4.4 +PYTHON=3.10 +PYTHON_IMAGE_TAG=3.10 +PYTHON_ABI_TAG=cp310 +R=4.5 SPARK=master TURBODBC=latest @@ -82,6 +87,7 @@ R_ORG=rhub R_TAG=latest # Env vars for R builds +R_UPDATE_CLANG=false R_CUSTOM_CCACHE=false ARROW_R_DEV=TRUE R_PRUNE_DEPS=FALSE @@ -90,14 +96,14 @@ TZ=UTC # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. -VCPKG="f7423ee180c4b7f40d43402c2feb3859161ef625" # 2024.06.15 Release +VCPKG="4334d8b4c8916018600212ab4dd4bbdc343065d1" # 2025.09.17 Release # This must be updated when we update # ci/docker/python-*-windows-*.dockerfile or the vcpkg config. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-02-25 -PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-02-25 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-10-13 +PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-10-13 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker compose run --rm conan". 
# See https://github.com/conan-io/conan-docker-tools#readme and diff --git a/.gitattributes b/.gitattributes index 70007c26c8b..18396af4933 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,5 @@ cpp/src/arrow/util/bpacking_*_generated.h linguist-generated=true +cpp/src/parquet/chunker_*_generated.h linguist-generated=true cpp/src/generated/*.cpp linguist-generated=true cpp/src/generated/*.h linguist-generated=true go/**/*.s linguist-generated=true diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e72d5b4321d..12b78f7f2cf 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -33,16 +33,12 @@ /cpp/src/arrow/engine @westonpace /cpp/src/arrow/flight/ @lidavidm /cpp/src/parquet @wgtmac -/csharp/ @curthagenlocher -/go/ @zeroshade -/java/ @lidavidm -/js/ @domoritz @trxcllnt /matlab/ @kevingurney @kou @sgilmore10 +/python/ @AlenkaF @raulcd @rok /python/pyarrow/_flight.pyx @lidavidm /python/pyarrow/**/*gandiva* @wjones127 /r/ @jonkeane @thisisnic /ruby/ @kou -/swift/ @kou # Docs # /docs/ diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 2b0f859f683..282a8866c40 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -24,41 +24,3 @@ updates: commit-message: prefix: "MINOR: [CI] " open-pull-requests-limit: 10 - - package-ecosystem: "gomod" - directory: "/swift/CDataWGo/" - schedule: - interval: "weekly" - commit-message: - prefix: "MINOR: [Swift] " - open-pull-requests-limit: 10 - - package-ecosystem: "gomod" - directory: "/swift/data-generator/swift-datagen/" - schedule: - interval: "weekly" - commit-message: - prefix: "MINOR: [Swift] " - open-pull-requests-limit: 10 - - package-ecosystem: "npm" - directory: "/js/" - schedule: - interval: "monthly" - commit-message: - prefix: "MINOR: [JS] " - open-pull-requests-limit: 10 - - package-ecosystem: "nuget" - directory: "/csharp/" - schedule: - interval: "weekly" - commit-message: - prefix: "MINOR: [C#] " - open-pull-requests-limit: 10 - ignore: - - dependency-name: 
"Microsoft.Extensions.*" - update-types: - - "version-update:semver-major" - - dependency-name: "Microsoft.Bcl.*" - update-types: - - "version-update:semver-major" - - dependency-name: "System.*" - update-types: - - "version-update:semver-major" diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index 38dd4206f88..7980e4b55e2 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -58,16 +58,16 @@ jobs: timeout-minutes: 15 steps: - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Git Fixup shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5.5.0 + uses: actions/setup-python@v6.0.0 with: - python-version: '3.9' + python-version: '3.12' - name: Install pygit2 binary wheel run: pip install pygit2 --only-binary pygit2 - name: Install Archery, Crossbow- and Test Dependencies diff --git a/.github/workflows/check_labels.yml b/.github/workflows/check_labels.yml new file mode 100644 index 00000000000..ee4a080f838 --- /dev/null +++ b/.github/workflows/check_labels.yml @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Check Labels Reusable + +on: + workflow_call: + inputs: + parent-workflow: + description: "The parent workflow filename (without .yml)" + required: true + type: string + outputs: + ci-extra-labels: + description: "The extra CI labels" + value: ${{ jobs.check-labels.outputs.ci-extra-labels }} + force: + description: "Whether to force running the jobs" + value: ${{ jobs.check-labels.outputs.force }} + +jobs: + check-labels: + name: Check labels + runs-on: ubuntu-latest + timeout-minutes: 5 + outputs: + ci-extra-labels: ${{ steps.check.outputs.ci-extra-labels }} + force: ${{ steps.check.outputs.force }} + steps: + - name: Checkout Arrow + if: github.event_name == 'pull_request' + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - name: Check + id: check + env: + GH_TOKEN: ${{ github.token }} + run: | + set -ex + case "${GITHUB_EVENT_NAME}" in + push|schedule|workflow_dispatch) + echo "force=true" >> "${GITHUB_OUTPUT}" + ;; + pull_request) + { + echo "ci-extra-labels<> "${GITHUB_OUTPUT}" + git fetch origin ${GITHUB_BASE_REF} + git diff --stat origin/${GITHUB_BASE_REF}.. + if git diff --stat origin/${GITHUB_BASE_REF}.. 
| \ + grep \ + --fixed-strings ".github/workflows/${{ inputs.parent-workflow }}.yml" \ + --quiet; then + echo "force=true" >> "${GITHUB_OUTPUT}" + fi + ;; + esac diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 8fe6dfa8ebf..85157760d66 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -36,20 +36,20 @@ jobs: pull-requests: write steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: path: arrow # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies run: pip install -e arrow/dev/archery[bot] - name: Handle GitHub comment event env: - ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }} run: | archery --debug trigger-bot \ @@ -63,7 +63,7 @@ jobs: if: github.event.comment.body == 'take' runs-on: ubuntu-latest steps: - - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 1da536ed2a5..0122f01e757 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -29,6 +29,7 @@ on: - '.github/workflows/cpp.yml' - 'ci/conda_env_*' - 'ci/docker/**' + - 'ci/scripts/ccache_setup.sh' - 'ci/scripts/cpp_*' - 'ci/scripts/install_azurite.sh' - 'ci/scripts/install_gcs_testbench.sh' @@ -45,6 +46,7 @@ on: - '.github/workflows/cpp.yml' - 'ci/conda_env_*' - 'ci/docker/**' + - 
'ci/scripts/ccache_setup.sh' - 'ci/scripts/cpp_*' - 'ci/scripts/install_azurite.sh' - 'ci/scripts/install_gcs_testbench.sh' @@ -91,8 +93,8 @@ jobs: image: ubuntu-cpp-sanitizer llvm: 14 runs-on: ubuntu-latest - title: AMD64 Ubuntu 22.04 C++ ASAN UBSAN - ubuntu: 22.04 + title: AMD64 Ubuntu 24.04 C++ ASAN UBSAN + ubuntu: 24.04 - arch: arm64v8 clang-tools: 14 image: ubuntu-cpp @@ -108,7 +110,7 @@ jobs: UBUNTU: ${{ matrix.ubuntu }} steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive @@ -121,7 +123,7 @@ jobs: - name: Setup Python on hosted runner if: | matrix.runs-on == 'ubuntu-latest' - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3 - name: Setup Python on self-hosted runner @@ -161,7 +163,7 @@ jobs: timeout-minutes: 45 steps: - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 submodules: recursive @@ -216,7 +218,7 @@ jobs: sysctl -a | grep cpu sysctl -a | grep "hw.optional" - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 submodules: recursive @@ -233,7 +235,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5.5.0 + uses: actions/setup-python@v6.0.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -319,7 +321,6 @@ jobs: BOOST_SOURCE: BUNDLED CMAKE_CXX_STANDARD: "17" CMAKE_GENERATOR: Ninja - CMAKE_INSTALL_LIBDIR: bin CMAKE_INSTALL_PREFIX: /usr CMAKE_UNITY_BUILD: ON steps: @@ -332,7 +333,7 @@ jobs: /d 1 ` /f - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 submodules: recursive @@ -367,15 +368,16 @@ jobs: 
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 bash -c "ci/scripts/cpp_build.sh $(pwd) $(pwd)/build" - name: Test - shell: bash + shell: cmd run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 # For ORC - export TZDIR=/c/msys64/usr/share/zoneinfo - ci/scripts/cpp_test.sh $(pwd) $(pwd)/build + set TZDIR=C:\msys64\usr\share\zoneinfo + bash -c "ci/scripts/cpp_test.sh $(pwd) $(pwd)/build" windows-mingw: name: AMD64 Windows MinGW ${{ matrix.msystem_upper }} C++ - runs-on: windows-2019 + runs-on: windows-2022 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} # Build may take 1h+ without cache. timeout-minutes: 120 @@ -395,6 +397,7 @@ jobs: ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_FLIGHT_SQL: ON + ARROW_FLIGHT_SQL_ODBC: ON ARROW_GANDIVA: ON ARROW_GCS: ON ARROW_HDFS: OFF @@ -428,7 +431,7 @@ jobs: /d 1 ` /f - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 submodules: recursive @@ -462,10 +465,10 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v5.5.0 + uses: actions/setup-python@v6.0.0 id: python-install with: - python-version: 3.9 + python-version: '3.12' - name: Install Google Cloud Storage Testbench shell: msys2 {0} env: diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml new file mode 100644 index 00000000000..5b054ddfb58 --- /dev/null +++ b/.github/workflows/cpp_extra.yml @@ -0,0 +1,321 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: C++ Extra + +on: + push: + branches: + - '**' + - '!dependabot/**' + paths: + - '.dockerignore' + - '.github/workflows/check_labels.yml' + - '.github/workflows/cpp_extra.yml' + - '.github/workflows/report_ci.yml' + - 'ci/conda_env_*' + - 'ci/docker/**' + - 'ci/scripts/ccache_setup.sh' + - 'ci/scripts/cpp_*' + - 'ci/scripts/install_azurite.sh' + - 'ci/scripts/install_gcs_testbench.sh' + - 'ci/scripts/install_minio.sh' + - 'ci/scripts/msys2_*' + - 'ci/scripts/util_*' + - 'cpp/**' + - 'docker-compose.yml' + - 'format/Flight.proto' + - 'testing' + tags: + - '**' + pull_request: + paths: + - '.dockerignore' + - '.github/workflows/check_labels.yml' + - '.github/workflows/cpp_extra.yml' + - '.github/workflows/report_ci.yml' + - 'ci/conda_env_*' + - 'ci/docker/**' + - 'ci/scripts/ccache_setup.sh' + - 'ci/scripts/cpp_*' + - 'ci/scripts/install_azurite.sh' + - 'ci/scripts/install_gcs_testbench.sh' + - 'ci/scripts/install_minio.sh' + - 'ci/scripts/msys2_*' + - 'ci/scripts/util_*' + - 'cpp/**' + - 'docker-compose.yml' + - 'format/Flight.proto' + - 'testing' + types: + - labeled + - opened + - reopened + - synchronize + schedule: + - cron: | + 0 0 * * * + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + check-labels: + uses: ./.github/workflows/check_labels.yml + secrets: 
inherit + with: + parent-workflow: cpp_extra + + docker: + needs: check-labels + name: ${{ matrix.title }} + runs-on: ${{ matrix.runs-on }} + if: >- + needs.check-labels.outputs.force == 'true' || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: C++') + timeout-minutes: 75 + strategy: + fail-fast: false + matrix: + include: + - image: alpine-linux-cpp + runs-on: ubuntu-latest + title: AMD64 Alpine Linux + - image: conda-cpp + run-options: >- + -e ARROW_USE_MESON=ON + runs-on: ubuntu-latest + title: AMD64 Ubuntu Meson + # TODO: We should remove this "continue-on-error: true" once GH-47207 is resolved + - continue-on-error: true + envs: + - DEBIAN=13 + image: debian-cpp + run-options: >- + -e CMAKE_CXX_STANDARD=23 + runs-on: ubuntu-latest + title: AMD64 Debian C++23 + env: + ARCHERY_DEBUG: 1 + ARROW_ENABLE_TIMING_TESTS: OFF + DOCKER_VOLUME_PREFIX: ".docker/" + steps: + - name: Checkout Arrow + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + submodules: recursive + - name: Cache Docker Volumes + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: .docker + key: extra-${{ matrix.image }}-${{ hashFiles('cpp/**') }} + restore-keys: extra-${{ matrix.image }}- + - name: Setup Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: 3 + - name: Setup Archery + run: python3 -m pip install -e dev/archery[docker] + - name: Execute Docker Build + continue-on-error: ${{ matrix.continue-on-error || false }} + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + ENVS: ${{ toJSON(matrix.envs) }} + run: | + # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes + sudo sysctl -w vm.mmap_rnd_bits=28 + source ci/scripts/util_enable_core_dumps.sh + if [ "${ENVS}" != 
"null" ]; then + echo "${ENVS}" | jq -r '.[]' | while read env; do + echo "${env}" >> .env + done + fi + archery docker run ${{ matrix.run-options || '' }} ${{ matrix.image }} + - name: Docker Push + if: >- + success() && + github.event_name == 'push' && + github.repository == 'apache/arrow' && + github.ref_name == 'main' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + run: archery docker push ${{ matrix.image }} + + jni-linux: + needs: check-labels + name: JNI ${{ matrix.platform.runs-on }} ${{ matrix.platform.arch }} + runs-on: ${{ matrix.platform.runs-on }} + if: >- + needs.check-labels.outputs.force == 'true' || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: C++') + timeout-minutes: 240 + permissions: + # This is for using GitHub Packages for vcpkg cache + packages: write + strategy: + fail-fast: false + matrix: + platform: + - arch: "amd64" + runs-on: ubuntu-latest + - arch: "arm64v8" + runs-on: ubuntu-24.04-arm + env: + ARCH: ${{ matrix.platform.arch }} + REPO: ghcr.io/${{ github.repository }}-dev + steps: + - name: Checkout Arrow + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + submodules: recursive + - name: Cache Docker Volumes + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: .docker + key: jni-${{ matrix.platform.runs-on }}-${{ hashFiles('cpp/**') }} + restore-keys: jni-${{ matrix.platform.runs-on }}- + - name: Setup Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: 3 + - name: Setup Archery + run: python3 -m pip install -e dev/archery[docker] + - name: Execute Docker Build + env: + ARCHERY_DOCKER_USER: ${{ github.actor }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + 
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VCPKG_BINARY_SOURCES: "clear;nuget,GitHub,readwrite" + run: | + source ci/scripts/util_enable_core_dumps.sh + archery docker run cpp-jni + - name: Docker Push + if: >- + success() && + github.event_name == 'push' && + github.ref_name == 'main' + env: + ARCHERY_DOCKER_USER: ${{ github.actor }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + continue-on-error: true + run: archery docker push cpp-jni + + jni-macos: + needs: check-labels + name: JNI macOS + runs-on: macos-14 + if: >- + needs.check-labels.outputs.force == 'true' || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: C++') + timeout-minutes: 45 + env: + MACOSX_DEPLOYMENT_TARGET: "14.0" + steps: + - name: Checkout Arrow + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + submodules: recursive + - name: Install dependencies + run: | + brew bundle --file=cpp/Brewfile + # We want to link aws-sdk-cpp statically but Homebrew's + # aws-sdk-cpp provides only shared library. If we have + # Homebrew's aws-sdk-cpp, our build mix Homebrew's + # aws-sdk-cpp and bundled aws-sdk-cpp. We uninstall Homebrew's + # aws-sdk-cpp to ensure using only bundled aws-sdk-cpp. + brew uninstall aws-sdk-cpp + # We want to use bundled RE2 for static linking. If + # Homebrew's RE2 is installed, its header file may be used. + # We uninstall Homebrew's RE2 to ensure using bundled RE2. + brew uninstall grpc || : # gRPC depends on RE2 + brew uninstall grpc@1.54 || : # gRPC 1.54 may be installed too + brew uninstall re2 + # We want to use bundled Protobuf for static linking. If + # Homebrew's Protobuf is installed, its library file may be + # used on test We uninstall Homebrew's Protobuf to ensure using + # bundled Protobuf. 
+ brew uninstall protobuf + - name: Prepare ccache + run: | + echo "CCACHE_DIR=${PWD}/ccache" >> ${GITHUB_ENV} + - name: Cache ccache + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: ccache + key: jni-macos-${{ hashFiles('cpp/**') }} + restore-keys: jni-macos- + - name: CMake + run: | + cmake \ + -S cpp \ + -B cpp.build \ + --preset=ninja-release-jni-macos \ + -DARROW_BUILD_TESTS=ON \ + -DCMAKE_INSTALL_PREFIX=$PWD/cpp.install + - name: Build + run: | + cmake --build cpp.build + - name: Install + run: | + cmake --install cpp.build + - name: Test + env: + ARROW_TEST_DATA: ${{ github.workspace }}/testing/data + PARQUET_TEST_DATA: ${{ github.workspace }}/cpp/submodules/parquet-testing/data + run: | + # MinIO is required + exclude_tests="arrow-s3fs-test" + # unstable + exclude_tests="${exclude_tests}|arrow-acero-asof-join-node-test" + exclude_tests="${exclude_tests}|arrow-acero-hash-join-node-test" + ctest \ + --exclude-regex "${exclude_tests}" \ + --label-regex unittest \ + --output-on-failure \ + --parallel "$(sysctl -n hw.ncpu)" \ + --test-dir "cpp.build" \ + --timeout 300 + - name: Build example + run: | + cmake \ + -S cpp/examples/minimal_build/ \ + -B cpp/examples/minimal_build.build \ + -GNinja \ + -DCMAKE_INSTALL_PREFIX=$PWD/cpp.install + cmake --build cpp/examples/minimal_build.build + cd cpp/examples/minimal_build + ../minimal_build.build/arrow-example + + report-extra-cpp: + needs: + - docker + - jni-macos + uses: ./.github/workflows/report_ci.yml + secrets: inherit diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml deleted file mode 100644 index 0ebeaeb5b0b..00000000000 --- a/.github/workflows/csharp.yml +++ /dev/null @@ -1,212 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: C# - -on: - push: - branches: - - '**' - - '!dependabot/**' - tags: - - '**' - paths: - - '.github/workflows/csharp.yml' - - 'ci/scripts/csharp_*' - - 'csharp/**' - pull_request: - paths: - - '.github/workflows/csharp.yml' - - 'ci/scripts/csharp_*' - - 'csharp/**' - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - - ubuntu: - name: AMD64 Ubuntu 24.04 C# ${{ matrix.dotnet }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - dotnet: ['8.0.x'] - steps: - - name: Install C# - uses: actions/setup-dotnet@v4.3.1 - with: - dotnet-version: ${{ matrix.dotnet }} - - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: 3 - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Build - shell: bash - run: ci/scripts/csharp_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/csharp_test.sh $(pwd) - - windows: - name: AMD64 Windows 2019 18.04 C# ${{ matrix.dotnet }} - runs-on: windows-2019 - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - dotnet: ['8.0.x'] - steps: - - name: Install C# - 
uses: actions/setup-dotnet@v4.3.1 - with: - dotnet-version: ${{ matrix.dotnet }} - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Build - shell: bash - run: ci/scripts/csharp_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/csharp_test.sh $(pwd) - - macos: - name: AMD64 macOS 13 C# ${{ matrix.dotnet }} - runs-on: macos-13 - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - dotnet: ['8.0.x'] - steps: - - name: Install C# - uses: actions/setup-dotnet@v4.3.1 - with: - dotnet-version: ${{ matrix.dotnet }} - - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: 3.12 - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Build - shell: bash - run: ci/scripts/csharp_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/csharp_test.sh $(pwd) - - package: - name: Package - # Branch or RC tag - if: github.ref_type != 'tag' || contains(github.ref_name, 'rc') - runs-on: ubuntu-latest - timeout-minutes: 15 - permissions: - contents: write - steps: - - name: Checkout for utilities - if: github.ref_type == 'tag' - uses: actions/checkout@v4 - with: - path: arrow - - name: Download source archive - if: github.ref_type == 'tag' - run: | - arrow/dev/release/utils-watch-gh-workflow.sh \ - ${GITHUB_REF_NAME} \ - release_candidate.yml - gh release download ${GITHUB_REF_NAME} \ - --pattern "*.tar.gz" \ - --repo ${GITHUB_REPOSITORY} - tar -xf *.tar.gz --strip-components=1 - mv csharp/dummy.git .git - env: - GH_TOKEN: ${{ github.token }} - - name: Checkout - if: github.ref_type != 'tag' - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Prepare version - if: github.ref_type != 'tag' - run: | - # apache-arrow-20.0.0.dev-9-g758867f907 -> - # 20.0.0.dev-9-g758867f907 -> - # 20.0.0.dev-9 -> - # 20.0.0-dev-9 - semver="$(git describe --tags | \ 
- sed -E \ - -e 's/^apache-arrow-//' \ - -e 's/-[^-]*$//' \ - -e 's/^([0-9]*\.[0-9]*\.[0-9])\./\1-/')" - sed -i'' -E -e \ - "s/^ .+<\/Version>/ ${semver}<\/Version>/" \ - csharp/Directory.Build.props - - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: 3 - - name: Setup Archery - run: | - python3 -m pip install -e 'dev/archery[docker]' - - name: Build - run: | - archery docker run ubuntu-csharp - - name: Prepare artifacts - run: | - shopt -s globstar - cp csharp/artifacts/**/*.{,s}nupkg ./ - for artifact in *.{,s}nupkg; do - dev/release/utils-generate-checksum.sh "${artifact}" - done - - name: Upload - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: nuget - path: | - *.nupkg - *.sha256 - *.sha512 - *.snupkg - - name: Publish - if: github.ref_type == 'tag' - run: | - gh release upload ${GITHUB_REF_NAME} \ - --repo ${GITHUB_REPOSITORY} \ - *.nupkg \ - *.sha256 \ - *.sha512 \ - *.snupkg - env: - GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 465de645877..2c4e1ae9408 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -41,51 +41,32 @@ jobs: lint: name: Lint C++, Python, R, Docker, RAT - runs-on: ubuntu-24.04 + # Use Ubuntu 22.04 to ensure working pre-commit on Ubuntu 22.04. 
+ runs-on: ubuntu-22.04 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 15 steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 - - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: 3.12 - name: Install pre-commit run: | - python -m pip install pre-commit - pre-commit run --show-diff-on-failure --color=always + sudo apt update + sudo apt install -y -V \ + pre-commit \ + r-base \ + ruby-dev - name: Cache pre-commit uses: actions/cache@v4 with: - path: ~/.cache/pre-commit + path: | + ~/.cache/pre-commit + ~/.local/share/renv/cache key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} - name: Run pre-commit run: | pre-commit run --all-files --color=always --show-diff-on-failure - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - UBUNTU: 22.04 - run: | - source ci/scripts/util_enable_core_dumps.sh - archery docker run -e GITHUB_ACTIONS=true ubuntu-lint - - name: Docker Push - if: >- - success() && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - run: archery docker push ubuntu-lint release: name: Source Release and Merge Script on ${{ matrix.runs-on }} @@ -105,11 +86,11 @@ jobs: GIT_COMMITTER_EMAIL: "github-actions[bot]@users.noreply.github.com" steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 - name: Install 
Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: '3.12' - name: Install Ruby @@ -124,12 +105,11 @@ jobs: shell: bash run: | gem install test-unit - pip install "cython>=3" setuptools pytest requests setuptools-scm + pip install "cython>=3.1" setuptools pytest requests setuptools-scm - name: Run Release Test - env: - ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} shell: bash run: | + echo "GH_TOKEN=${{ secrets.GITHUB_TOKEN }}" > dev/release/.env ci/scripts/release_test.sh $(pwd) - name: Run Merge Script Test shell: bash diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 96bf3993f82..16fa1142b88 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -43,7 +43,7 @@ jobs: name: Process runs-on: ubuntu-latest steps: - - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: apache/arrow ref: main @@ -53,7 +53,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -64,7 +64,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -75,7 +75,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # 
v8.0.0 with: debug: true github-token: ${{ secrets.GITHUB_TOKEN }} @@ -87,7 +87,7 @@ jobs: if: | (github.event.action == 'opened' || github.event.action == 'synchronize') - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0 + uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b # v6.0.1 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: .github/workflows/dev_pr/labeler.yml diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 7ef92f0be9b..0950cacc2b9 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -25,21 +25,6 @@ - any-glob-to-any-file: - c_glib/**/* -"Component: C#": -- changed-files: - - any-glob-to-any-file: - - csharp/**/* - -"Component: Go": -- changed-files: - - any-glob-to-any-file: - - go/**/* - -"Component: JavaScript": -- changed-files: - - any-glob-to-any-file: - - js/**/* - "Component: MATLAB": - changed-files: - any-glob-to-any-file: @@ -60,11 +45,6 @@ - any-glob-to-any-file: - ruby/**/* -"Component: Swift": -- changed-files: - - any-glob-to-any-file: - - swift/**/* - "Component: FlightRPC": - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 8b618693dca..b09206485ac 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -39,7 +39,7 @@ jobs: JDK: 17 steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 - name: Free up disk space @@ -52,7 +52,7 @@ jobs: key: debian-docs-${{ hashFiles('cpp/**') }} restore-keys: debian-docs- - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml 
b/.github/workflows/docs_light.yml index 07b926df757..cf103466ab7 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -41,15 +41,15 @@ env: jobs: light: - name: AMD64 Conda Python 3.9 Sphinx Documentation + name: AMD64 Conda Python 3.12 Sphinx Documentation runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 env: - PYTHON: "3.9" + PYTHON: "3.12" steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 - name: Cache Docker Volumes @@ -59,7 +59,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index e81be37d23b..25c5181bb0c 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -30,12 +30,8 @@ on: - 'ci/**' - 'dev/archery/**' - 'docker-compose.yml' - - 'go/**' - 'integration/**' - - 'js/**' - 'cpp/**' - - 'java/**' - - 'csharp/**' - 'format/**' pull_request: paths: @@ -44,12 +40,8 @@ on: - 'ci/**' - 'dev/archery/**' - 'docker-compose.yml' - - 'go/**' - 'integration/**' - - 'js/**' - 'cpp/**' - - 'csharp/**' - - 'java/**' - 'format/**' concurrency: @@ -72,30 +64,40 @@ jobs: timeout-minutes: 60 steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive - name: Checkout Arrow Rust - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: 
actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: apache/arrow-rs path: rust - name: Checkout Arrow nanoarrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: apache/arrow-nanoarrow path: nanoarrow - name: Checkout Arrow Go - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: apache/arrow-go path: go - name: Checkout Arrow Java - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: apache/arrow-java path: java + - name: Checkout Arrow JS + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: apache/arrow-js + path: js + - name: Checkout Arrow .NET + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: apache/arrow-dotnet + path: dotnet - name: Free up disk space run: | ci/scripts/util_free_space.sh @@ -106,7 +108,7 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Setup Archery @@ -119,8 +121,10 @@ jobs: source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ + -e ARCHERY_INTEGRATION_WITH_DOTNET=1 \ -e ARCHERY_INTEGRATION_WITH_GO=1 \ -e ARCHERY_INTEGRATION_WITH_JAVA=1 \ + -e ARCHERY_INTEGRATION_WITH_JS=1 \ -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ -e ARCHERY_INTEGRATION_WITH_RUST=1 \ conda-integration diff --git a/.github/workflows/issue_bot.yml b/.github/workflows/issue_bot.yml index 
2725825b569..3d37423d4f4 100644 --- a/.github/workflows/issue_bot.yml +++ b/.github/workflows/issue_bot.yml @@ -32,7 +32,7 @@ jobs: if: github.event.issue.pull_request == null runs-on: ubuntu-latest steps: - - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | let split_body = context.payload.issue.body.split('### Component(s)'); diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml deleted file mode 100644 index da8202082c9..00000000000 --- a/.github/workflows/js.yml +++ /dev/null @@ -1,148 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: NodeJS - -on: - push: - branches: - - '**' - - '!dependabot/**' - tags: - - '**' - paths: - - '.dockerignore' - - '.github/workflows/js.yml' - - 'ci/docker/*js.dockerfile' - - 'ci/scripts/js_*' - - 'js/**' - pull_request: - paths: - - '.dockerignore' - - '.github/workflows/js.yml' - - 'ci/docker/*js.dockerfile' - - 'ci/scripts/js_*' - - 'js/**' - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -permissions: - contents: read - -env: - ARCHERY_DEBUG: 1 - -jobs: - - docker: - name: AMD64 Debian 12 NodeJS 18 - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 60 - steps: - - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - fetch-depth: 0 - - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: 3.12 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: | - source ci/scripts/util_enable_core_dumps.sh - archery docker run debian-js - - name: Docker Push - if: >- - success() && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - run: archery docker push debian-js - - macos: - name: AMD64 macOS 13 NodeJS ${{ matrix.node }} - runs-on: macos-13 - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 45 - strategy: - fail-fast: false - matrix: - node: [18] - steps: - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Jest Cache - uses: actions/cache@v4 - with: - path: js/.jest-cache - 
key: js-jest-cache-${{ runner.os }}-${{ hashFiles('js/src/**/*.ts', 'js/test/**/*.ts', 'js/yarn.lock') }} - restore-keys: js-jest-cache-${{ runner.os }}- - - name: Install NodeJS - uses: actions/setup-node@v4 - with: - node-version: ${{ matrix.node }} - - name: Build - shell: bash - run: ci/scripts/js_build.sh $(pwd) build - - name: Test - shell: bash - run: ci/scripts/js_test.sh $(pwd) build - - windows: - name: AMD64 Windows NodeJS ${{ matrix.node }} - runs-on: windows-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 45 - strategy: - fail-fast: false - matrix: - node: [18] - steps: - - name: Checkout Arrow - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Jest Cache - uses: actions/cache@v4 - with: - path: js/.jest-cache - key: js-jest-cache-${{ runner.os }}-${{ hashFiles('js/src/**/*.ts', 'js/test/**/*.ts', 'js/yarn.lock') }} - restore-keys: js-jest-cache-${{ runner.os }}- - - name: Install NodeJS - uses: actions/setup-node@v4 - with: - node-version: ${{ matrix.node }} - - name: Build - shell: bash - run: ci/scripts/js_build.sh $(pwd) build - - name: Test - shell: bash - run: ci/scripts/js_test.sh $(pwd) build diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 101724b3e2c..11a0da2a348 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -51,7 +51,7 @@ jobs: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Install ninja-build @@ -59,7 +59,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2024b + release: R2025a - name: Install ccache run: sudo apt-get install ccache - name: Setup ccache @@ -99,7 +99,7 @@ jobs: macos-version: "14" steps: - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Install ninja-build @@ -107,7 +107,7 @@ 
jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2024b + release: R2025a - name: Install ccache run: brew install ccache - name: Setup ccache @@ -140,13 +140,13 @@ jobs: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2024b + release: R2025a - name: Download Timezone Database shell: bash run: ci/scripts/download_tz_database.sh diff --git a/.github/workflows/package_linux.yml b/.github/workflows/package_linux.yml new file mode 100644 index 00000000000..75969615861 --- /dev/null +++ b/.github/workflows/package_linux.yml @@ -0,0 +1,320 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Package Linux + +on: + push: + branches: + - '**' + - '!dependabot/**' + - '!release-*' + paths: + - '.github/workflows/check_labels.yml' + - '.github/workflows/package_linux.yml' + - '.github/workflows/report_ci.yml' + - 'cpp/**' + - 'c_glib/**' + - 'dev/tasks/linux-packages/**' + - 'format/Flight.proto' + tags: + - "apache-arrow-*-rc*" + pull_request: + paths: + - '.github/workflows/check_labels.yml' + - '.github/workflows/package_linux.yml' + - '.github/workflows/report_ci.yml' + - 'cpp/**' + - 'c_glib/**' + - 'dev/tasks/linux-packages/**' + - 'format/Flight.proto' + types: + - labeled + - opened + - reopened + - synchronize + schedule: + - cron: "0 2 * * *" + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +permissions: + # Upload to GitHub Release + contents: write + +jobs: + check-labels: + uses: ./.github/workflows/check_labels.yml + secrets: inherit + with: + parent-workflow: package_linux + + package: + name: ${{ matrix.id }} + runs-on: ${{ contains(matrix.id, 'amd64') && 'ubuntu-latest' || 'ubuntu-24.04-arm' }} + needs: check-labels + if: >- + needs.check-labels.outputs.force == 'true' || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: Package: Linux') + timeout-minutes: 75 + strategy: + fail-fast: false + matrix: + id: + - almalinux-8-amd64 + - almalinux-8-arm64 + - almalinux-9-amd64 + - almalinux-9-arm64 + - almalinux-10-amd64 + - almalinux-10-arm64 + - amazon-linux-2023-amd64 + - amazon-linux-2023-arm64 + - centos-9-stream-amd64 + - centos-9-stream-arm64 + - centos-7-amd64 + - debian-bookworm-amd64 + - debian-bookworm-arm64 + - debian-trixie-amd64 + - debian-trixie-arm64 + - debian-forky-amd64 + - debian-forky-arm64 + - ubuntu-jammy-amd64 + - ubuntu-jammy-arm64 + - ubuntu-noble-amd64 + - ubuntu-noble-arm64 + env: + 
DOCKER_VOLUME_PREFIX: ".docker/" + steps: + - name: Checkout Arrow + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + submodules: recursive + - name: Free up disk space + run: | + ci/scripts/util_free_space.sh + - name: Prepare environment variables + env: + ID: ${{ matrix.id }} + run: | + set -x + + case "${ID}" in + centos-*) + # Example: centos-9-stream-amd64 -> centos + distribution="${ID%%-*}" + ;; + *) + # Example: almalinux-8-amd64 -> almalinux + # Example: amazon-linux-2023-amd64 -> amazon-linux + distribution="${ID%-*-*}" + ;; + esac + echo "DISTRIBUTION=${distribution}" >> "${GITHUB_ENV}" + + # Example: almalinux-8-amd64 -> amd64 + architecture="${ID##*-}" + echo "ARCHITECTURE=${architecture}" >> "${GITHUB_ENV}" + + # Example: almalinux-8-amd64 -> almalinux-8 + target="${ID%-*}" + case "${target}" in + almalinux-*|amazon-linux-*|centos-*) + echo "TASK_NAMESPACE=yum" >> "${GITHUB_ENV}" + # Example: centos-9-stream-amd64 -> centos-9-stream + # Example: amazon-linux-2023-amd64 -> amazon-linux-2023 + version="${ID%-*}" + # Example: centos-9-stream -> 9-stream + # Example: amazon-linux-2023 -> 2023 + version="${version##${distribution}-}" + echo "DISTRIBUTION_VERSION=${version}" >> "${GITHUB_ENV}" + if [ "${architecture}" = "arm64" ]; then + # Example: almalinux-8 -> almalinux-8-aarch64 + target="${target}-aarch64" + fi + echo "YUM_TARGETS=${target}" >> "${GITHUB_ENV}" + ;; + *) + echo "TASK_NAMESPACE=apt" >> "${GITHUB_ENV}" + # Example: debian-bookworm-amd64 -> debian-bookworm + code_name="${ID%-*}" + # Example: debian-bookworm -> bookworm + code_name="${code_name#*-}" + echo "DISTRIBUTION_CODE_NAME=${code_name}" >> "${GITHUB_ENV}" + if [ "${architecture}" = "arm64" ]; then + # Example: ubuntu-noble -> ubuntu-noble-arm64 + target="${target}-arm64" + fi + echo "APT_TARGETS=${target}" >> "${GITHUB_ENV}" + ;; + esac + echo "TARGET=${target}" >> "${GITHUB_ENV}" + + if [ "${GITHUB_REF_TYPE}" = "tag" ]; then + # 
Example: apache-arrow-21.0.0-rc0 -> 21.0.0-rc0 + version="${GITHUB_REF_NAME#apache-arrow-}" + echo "ARROW_VERSION=${version}" >> "${GITHUB_ENV}" + fi + - name: Cache Docker Volumes + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: .docker + key: package-linux-${{ matrix.id }}-${{ hashFiles('cpp/**', 'c_glib/**') }} + restore-keys: package-linux-${{ matrix.id }}- + - name: Set up Ruby + run: | + sudo apt update + sudo apt install -y \ + rake \ + ruby \ + ruby-dev + - name: Prepare apache-arrow-apt-source for arm64 + if: env.ARCHITECTURE == 'arm64' + run: | + pushd dev/tasks/linux-packages/apache-arrow-apt-source/apt + for target in *-*; do + cp -a ${target} ${target}-arm64 + done + popd + - name: Prepare apache-arrow-release for arm64 + if: env.ARCHITECTURE == 'arm64' + run: | + pushd dev/tasks/linux-packages/apache-arrow-release/yum + for target in *-*; do + cp -a ${target} ${target}-aarch64 + done + popd + - name: Update version + if: github.ref_type != 'tag' + run: | + pushd dev/tasks/linux-packages + rake version:update + popd + - name: Login to GitHub Container registry + uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Wait for creating GitHub Release + if: github.ref_type == 'tag' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + dev/release/utils-watch-gh-workflow.sh \ + ${GITHUB_REF_NAME} \ + release_candidate.yml + - name: Build + run: | + pushd dev/tasks/linux-packages + rake docker:pull || : + rake --trace ${TASK_NAMESPACE}:build BUILD_DIR=build + popd + - name: Docker Push + continue-on-error: true + if: >- + success() && + github.event_name == 'push' && + github.ref_name == 'main' + run: | + pushd dev/tasks/linux-packages + rake docker:push + popd + - name: Build artifact tarball + run: | + mkdir -p "${DISTRIBUTION}" + cp -a \ + 
dev/tasks/linux-packages/*/${TASK_NAMESPACE}/repositories/${DISTRIBUTION}/* \ + "${DISTRIBUTION}/" + set -x + # We use latest .deb/.rpm of + # apache-arrow-apt-source/apache-arrow-release built for + # amd64 because they are architecture independent. + if [ "${ARCHITECTURE}" = "amd64" ]; then + if [ "${TASK_NAMESPACE}" = "apt" ]; then + # Create + # https://packages.apache.org/artifactory/arrow/${DISTRIBUTION}/apache-arrow-apt-source-latest-${DISTRIBUTION_CODE_NAME}.deb + # for easy to install. + cp -a \ + ${DISTRIBUTION}/pool/${DISTRIBUTION_CODE_NAME}/*/a/apache-arrow-apt-source/*.deb \ + ${DISTRIBUTION}/apache-arrow-apt-source-latest-${DISTRIBUTION_CODE_NAME}.deb + else + # Create + # https://packages.apache.org/artifactory/arrow/${DISTRIBUTION}/${DISTRIBUTION_VERSION}/apache-arrow-release-latest.rpm + # for easy to install. + cp -a \ + ${DISTRIBUTION}/${DISTRIBUTION_VERSION}/x86_64/Packages/apache-arrow-release-*.rpm \ + ${DISTRIBUTION}/${DISTRIBUTION_VERSION}/apache-arrow-release-latest.rpm + fi + fi + tar cvzf ${{ matrix.id }}.tar.gz ${DISTRIBUTION} + dev/release/utils-generate-checksum.sh ${{ matrix.id }}.tar.gz + - name: Upload the artifacts to the job + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ matrix.id }} + path: ${{ matrix.id }}.tar.gz* + - name: Upload the artifacts to GitHub Release + if: github.ref_type == 'tag' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release upload ${GITHUB_REF_NAME} \ + --clobber \ + ${{ matrix.id }}.tar.gz* + - name: Set up test + run: | + sudo apt install -y \ + apt-utils \ + cpio \ + createrepo-c \ + devscripts \ + gpg \ + rpm \ + rsync + gem install --user-install apt-dists-merge + { + echo "Key-Type: RSA" + echo "Key-Length: 4096" + echo "Name-Real: Test" + echo "Name-Email: test@example.com" + echo "%no-protection" + } | gpg --full-generate-key --batch + GPG_KEY_ID=$(gpg --list-keys --with-colon test@example.com | grep fpr | cut -d: -f10) + echo 
"GPG_KEY_ID=${GPG_KEY_ID}" >> ${GITHUB_ENV} + if [ "${TASK_NAMESPACE}" = "yum" ]; then + repositories_dir=dev/tasks/linux-packages/apache-arrow-release/yum/repositories + rpm2cpio ${repositories_dir}/*/*/*/Packages/apache-arrow-release-*.noarch.rpm | + cpio -id + mv etc/pki/rpm-gpg/RPM-GPG-KEY-Apache-Arrow \ + dev/tasks/linux-packages/KEYS + fi + gpg --export --armor test@example.com >> dev/tasks/linux-packages/KEYS + - name: Test + run: | + pushd dev/tasks/linux-packages + rake --trace ${TASK_NAMESPACE}:test + popd + + report-package-linux: + needs: + - package + uses: ./.github/workflows/report_ci.yml + secrets: inherit diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index f8b189f95ad..631ef944970 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -40,7 +40,7 @@ jobs: - name: 'Download PR review payload' id: 'download' if: github.event_name == 'workflow_run' - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | const run_id = "${{ github.event.workflow_run.id }}"; @@ -73,7 +73,7 @@ jobs: curl -sL -o committers.yml $url echo "committers_path=$(pwd)/committers.yml" >> $GITHUB_OUTPUT - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: path: arrow repository: apache/arrow @@ -82,14 +82,14 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies run: pip install -e arrow/dev/archery[bot] - name: Handle PR workflow event env: - ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} run: | if [ "${GITHUB_EVENT_NAME}" = "workflow_run" ]; then # workflow_run is executed on PR review. Update to original event. diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index c550a7668b1..0d12accda4e 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -63,47 +63,47 @@ jobs: matrix: name: - conda-python-docs - - conda-python-3.10-nopandas - - conda-python-3.9-pandas-1.1.3 - - conda-python-3.11-pandas-latest - - conda-python-3.11-no-numpy + - conda-python-3.11-nopandas + - conda-python-3.10-pandas-1.3.4 + - conda-python-3.13-pandas-latest + - conda-python-3.12-no-numpy include: - name: conda-python-docs cache: conda-python-3.10 image: conda-python-docs title: AMD64 Conda Python 3.10 Sphinx & Numpydoc python: "3.10" - - name: conda-python-3.10-nopandas - cache: conda-python-3.10 + - name: conda-python-3.11-nopandas + cache: conda-python-3.11 image: conda-python - title: AMD64 Conda Python 3.10 Without Pandas - python: "3.10" - - name: conda-python-3.9-pandas-1.1.3 - cache: conda-python-3.9 + title: AMD64 Conda Python 3.11 Without Pandas + python: "3.11" + - name: conda-python-3.10-pandas-1.3.4 + cache: conda-python-3.10 image: conda-python-pandas - title: AMD64 Conda Python 3.9 Pandas 1.1.3 - python: 3.9 - pandas: "1.1.3" - numpy: 1.19.5 - - name: conda-python-3.11-pandas-latest - cache: conda-python-3.11 + title: AMD64 Conda Python 3.10 Pandas 1.3.4 + python: "3.10" + pandas: "1.3.4" + numpy: "1.21.2" + - name: conda-python-3.13-pandas-latest + cache: conda-python-3.13 image: conda-python-pandas - title: AMD64 Conda Python 3.11 Pandas latest - python: "3.11" + title: AMD64 Conda Python 3.13 Pandas latest + python: "3.13" pandas: latest - - name: conda-python-3.11-no-numpy - cache: conda-python-3.11 + - name: conda-python-3.12-no-numpy + cache: conda-python-3.12 image: conda-python-no-numpy - title: AMD64 Conda Python 3.11 without NumPy - python: "3.11" + title: AMD64 Conda Python 3.12 
without NumPy + python: "3.12" env: - PYTHON: ${{ matrix.python || 3.9 }} + PYTHON: ${{ matrix.python || 3.10 }} UBUNTU: ${{ matrix.ubuntu || 22.04 }} PANDAS: ${{ matrix.pandas || 'latest' }} NUMPY: ${{ matrix.numpy || 'latest' }} steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive @@ -114,7 +114,7 @@ jobs: key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Setup Archery @@ -178,12 +178,12 @@ jobs: MACOSX_DEPLOYMENT_TARGET: 12.0 steps: - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5.5.0 + uses: actions/setup-python@v6.0.0 with: python-version: '3.11' - name: Install Dependencies @@ -239,3 +239,60 @@ jobs: - name: Test shell: bash run: ci/scripts/python_test.sh $(pwd) $(pwd)/build + + windows: + name: AMD64 Windows 2022 Python 3.13 + runs-on: windows-2022 + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 + env: + PYTHON_CMD: "py -3.13" + steps: + - name: Disable Crash Dialogs + run: | + reg add ` + "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" ` + /v DontShowUI ` + /t REG_DWORD ` + /d 1 ` + /f + - name: Checkout Arrow + uses: actions/checkout@v5 + with: + fetch-depth: 0 + submodules: recursive + - name: Setup Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: 3.13 + - name: Install ccache + shell: bash + run: | + ci/scripts/install_ccache.sh 4.6.3 /usr + - name: Setup ccache + shell: bash + run: | + 
ci/scripts/ccache_setup.sh + - name: Generate path variables + id: path-info + shell: bash + run: | + echo "CCACHE_DIR=$(ccache --get-config cache_dir)" >> $GITHUB_ENV + echo "usr-windows-dir="$(cygpath --absolute --windows /usr)"" >> $GITHUB_OUTPUT + - name: Cache ccache + uses: actions/cache@v4 + with: + path: ${{ env.CCACHE_DIR }} + key: python-ccache-windows-${{ env.CACHE_VERSION }}-${{ hashFiles('cpp/**') }} + restore-keys: python-ccache-windows-${{ env.CACHE_VERSION }}- + env: + # We can invalidate the current cache by updating this. + CACHE_VERSION: "2025-09-16.1" + - name: Build Arrow C++ and PyArrow + shell: cmd + run: | + call "ci\scripts\python_build.bat" %cd% "${{ steps.path-info.outputs.usr-windows-dir }}" + - name: Test PyArrow + shell: cmd + run: | + call "ci\scripts\python_test.bat" %cd% diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index e5bf48c3e6f..b7b45adc734 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -62,65 +62,6 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: - ubuntu-minimum-cpp-version: - name: Check minimum supported Arrow C++ Version (${{ matrix.cpp_version }}) - # We don't provide Apache Arrow C++ 15.0.2 deb packages for Ubuntu 24.04. - # So we use ubuntu-22.04 here. - runs-on: ubuntu-22.04 - strategy: - matrix: - include: - - cpp_version: "15.0.2" - steps: - - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - path: src - submodules: recursive - - - name: Install Arrow C++ (${{ matrix.cpp_version }}) - run: | - sudo apt update - sudo apt install -y -V ca-certificates lsb-release wget - wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb - sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb - sudo apt update - # We have to list all packages to avoid version conflicts. 
- sudo apt install -y -V libarrow-dev=${{ matrix.cpp_version }}-1 \ - libarrow-acero-dev=${{ matrix.cpp_version }}-1 \ - libparquet-dev=${{ matrix.cpp_version }}-1 \ - libarrow-dataset-dev=${{ matrix.cpp_version }}-1 - - - name: Install checkbashisms - run: | - sudo apt-get install devscripts - - - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - install-r: false - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::rcmdcheck - needs: check - working-directory: src/r - - - uses: r-lib/actions/check-r-package@v2 - with: - working-directory: src/r - env: - LIBARROW_BINARY: "false" - LIBARROW_BUILD: "false" - ARROW_R_VERBOSE_TEST: "true" - ARROW_R_ALLOW_CPP_VERSION_MISMATCH: "true" - - - name: Show install output - if: always() - run: find src/r/check -name '00install.out*' -exec cat '{}' \; || true - shell: bash - - ubuntu: name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} Force-Tests ${{ matrix.force-tests }} runs-on: ubuntu-latest @@ -137,7 +78,7 @@ jobs: UBUNTU: ${{ matrix.ubuntu }} steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive @@ -155,7 +96,7 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Setup Archery @@ -209,12 +150,12 @@ jobs: R_TAG: ${{ matrix.config.tag }} steps: - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: 
actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Setup Archery @@ -255,7 +196,7 @@ jobs: windows-cpp: name: AMD64 Windows C++ RTools ${{ matrix.config.rtools }} ${{ matrix.config.arch }} - runs-on: windows-2019 + runs-on: windows-2022 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 90 strategy: @@ -266,7 +207,7 @@ jobs: steps: - run: git config --global core.autocrlf false - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Setup ccache @@ -307,7 +248,7 @@ jobs: windows-r: needs: [windows-cpp] name: AMD64 Windows R ${{ matrix.config.rversion }} - runs-on: windows-2019 + runs-on: windows-2022 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 75 strategy: @@ -322,7 +263,7 @@ jobs: steps: - run: git config --global core.autocrlf false - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 # This must be done before r-lib/actions/setup-r because curl in @@ -337,7 +278,7 @@ jobs: echo "$HOME/.local/bin" >> $GITHUB_PATH - run: mkdir r/windows - name: Download artifacts - uses: actions/download-artifact@v4.2.1 + uses: actions/download-artifact@v5.0.0 with: name: libarrow-rtools40-ucrt64.zip path: r/windows @@ -390,24 +331,6 @@ jobs: check_dir = 'check', timeout = 3600 ) - - name: Run lintr - if: ${{ matrix.config.rversion == 'release' }} - env: - NOT_CRAN: "true" - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - shell: Rscript {0} - working-directory: r - run: | - Sys.setenv( - RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"), - MAKEFLAGS = paste0("-j", parallel::detectCores()), - ARROW_R_DEV = TRUE, - "_R_CHECK_FORCE_SUGGESTS_" = FALSE - ) - # we use pak for package installation since it is faster, safer and more convenient - 
pak::local_install() - pak::pak("lintr") - lintr::expect_lint_free() - name: Dump install logs shell: cmd run: cat r/check/arrow.Rcheck/00install.out diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 6a8f6c75c21..4e12fce9545 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -1,193 +1,217 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Upload R Nightly builds -# This workflow downloads the (nightly) binaries created in crossbow and uploads them -# to nightlies.apache.org. Due to authorization requirements, this upload can't be done -# from the crossbow repository. - -on: - workflow_dispatch: - inputs: - prefix: - description: Job prefix to use. - required: false - default: '' - keep: - description: Number of versions to keep. 
- required: false - default: 14 - - schedule: - #Crossbow packaging runs at 0 8 * * * - - cron: '0 14 * * *' - -permissions: - contents: read - -jobs: - upload: - if: github.repository == 'apache/arrow' - runs-on: ubuntu-latest - steps: - - name: Checkout Arrow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - fetch-depth: 1 - path: arrow - repository: apache/arrow - ref: main - submodules: recursive - - name: Checkout Crossbow - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 - with: - fetch-depth: 0 - path: crossbow - repository: ursacomputing/crossbow - ref: main - - name: Set up Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - cache: 'pip' - python-version: 3.12 - - name: Install Archery - shell: bash - run: pip install -e arrow/dev/archery[all] - - run: mkdir -p binaries - - name: Download Artifacts - env: - PREFIX: ${{ github.event.inputs.prefix || ''}} - run: | - if [ -z $PREFIX ]; then - PREFIX=nightly-packaging-$(date +%Y-%m-%d)-0 - fi - echo $PREFIX - - archery crossbow download-artifacts -f r-binary-packages -t binaries $PREFIX - - if [ -n "$(ls -A binaries/*/*/)" ]; then - echo "Found files!" - else - echo "No files found. Stopping upload." 
- exit 1 - fi - - name: Cache Repo - uses: actions/cache@v4 - with: - path: repo - key: r-nightly-${{ github.run_id }} - restore-keys: r-nightly- - - name: Sync from Remote - uses: ./arrow/.github/actions/sync-nightlies - with: - switches: -avzh --update --delete --progress - local_path: repo - remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/arrow/r - remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }} - remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }} - remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }} - remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} - remote_host_key: ${{ secrets.NIGHTLIES_RSYNC_HOST_KEY }} - - run: tree repo - - uses: r-lib/actions/setup-r@v2 - - name: Build Repository - shell: Rscript {0} - run: | - # folder that we sync to nightlies.apache.org - repo_root <- "repo" - # The binaries are in a nested dir - # so we need to find the correct path. - art_path <- list.files("binaries", - recursive = TRUE, - include.dirs = TRUE, - pattern = "r-binary-packages$", - full.names = TRUE - ) - - current_path <- list.files(art_path, full.names = TRUE, recursive = TRUE) - files <- sub("r-(pkg|lib)", repo_root, current_path) - - # decode contrib.url from artifact name: - # bin__windows__contrib__4.1 -> bin/windows/contrib/4.1 - new_paths <- gsub("__", "/", files) - # strip superfluous nested dirs - new_paths <- sub(art_path, ".", new_paths) - dirs <- dirname(new_paths) - sapply(dirs, dir.create, recursive = TRUE, showWarnings = FALSE) - - # overwrite allows us to "force push" a new version with the same name - copy_result <- file.copy(current_path, new_paths, overwrite = TRUE) - - if (!all(copy_result)) { - stop("There was an issue while copying the files!") - } - - name: Prune Repository - shell: bash - env: - KEEP: ${{ github.event.inputs.keep || 14 }} - run: | - prune() { - # list files | retain $KEEP newest files | delete everything else - ls -t $1/arrow* | tail -n +$((KEEP + 1)) | xargs --no-run-if-empty rm - } - - # find leaf sub dirs - repo_dirs=$(find repo 
-type d -links 2) - - # We want to retain $keep (14) versions of each pkg/lib so we call - # prune on each leaf dir and not on repo/. - for dir in ${repo_dirs[@]}; do - prune $dir - done - - name: Update Repository Index - shell: Rscript {0} - run: | - # folder that we sync to nightlies.apache.org - repo_root <- "repo" - tools::write_PACKAGES(file.path(repo_root, "src/contrib"), - type = "source", - verbose = TRUE, - latestOnly = FALSE - ) - - repo_dirs <- list.dirs(repo_root) - # find dirs with binary R packages: e.g. */contrib/4.1 - pkg_dirs <- grep(".+contrib\\/\\d.+", repo_dirs, value = TRUE) - - - for (dir in pkg_dirs) { - on_win <- grepl("windows", dir) - tools::write_PACKAGES(dir, - type = ifelse(on_win, "win.binary", "mac.binary"), - verbose = TRUE, - latestOnly = FALSE - ) - } - - name: Show repo contents - run: tree repo - - name: Sync to Remote - uses: ./arrow/.github/actions/sync-nightlies - with: - upload: true - switches: -avzh --update --delete --progress - local_path: repo - remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/arrow/r - remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }} - remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }} - remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }} - remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} - remote_host_key: ${{ secrets.NIGHTLIES_RSYNC_HOST_KEY }} +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Upload R Nightly builds +# This workflow downloads the (nightly) binaries created in crossbow and uploads them +# to nightlies.apache.org. Due to authorization requirements, this upload can't be done +# from the crossbow repository. + +on: + workflow_dispatch: + inputs: + prefix: + description: Job prefix to use. + required: false + default: '' + keep: + description: Number of versions to keep. + required: false + default: 14 + + schedule: + #Crossbow packaging runs at 0 8 * * * + - cron: '0 14 * * *' + +permissions: + contents: read + +jobs: + upload: + if: github.repository == 'apache/arrow' + runs-on: ubuntu-latest + steps: + - name: Checkout Arrow + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 1 + path: arrow + repository: apache/arrow + ref: main + submodules: recursive + - name: Checkout Crossbow + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + path: crossbow + repository: ursacomputing/crossbow + ref: main + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + cache: 'pip' + python-version: 3.12 + - name: Install Archery + shell: bash + run: pip install -e arrow/dev/archery[all] + - run: mkdir -p binaries + - name: Download Artifacts + env: + PREFIX: ${{ github.event.inputs.prefix || ''}} + run: | + if [ -z $PREFIX ]; then + PREFIX=nightly-packaging-$(date +%Y-%m-%d)-0 + fi + echo $PREFIX + + archery crossbow download-artifacts -f r-binary-packages -t binaries $PREFIX + + if [ -n 
"$(ls -A binaries/*/*/)" ]; then + echo "Found files!" + else + echo "No files found. Stopping upload." + exit 1 + fi + - name: Cache Repo + uses: actions/cache@v4 + with: + path: repo + key: r-nightly-${{ github.run_id }} + restore-keys: r-nightly- + - name: Sync from Remote + uses: ./arrow/.github/actions/sync-nightlies + with: + switches: -avzh --update --delete --progress + local_path: repo + remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/arrow/r + remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }} + remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }} + remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }} + remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} + remote_host_key: ${{ secrets.NIGHTLIES_RSYNC_HOST_KEY }} + - run: tree repo + - uses: r-lib/actions/setup-r@v2 + - name: Build Repository + shell: Rscript {0} + run: | + # folder that we sync to nightlies.apache.org + repo_root <- "repo" + # The binaries are in a nested dir + # so we need to find the correct path. + art_path <- list.files("binaries", + recursive = TRUE, + include.dirs = TRUE, + pattern = "r-binary-packages$", + full.names = TRUE + ) + + current_pkg_path <- list.files(art_path, + full.names = TRUE, + pattern = "r-pkg", + recursive = TRUE + ) + current_lib_path <- list.files(art_path, + full.names = TRUE, + pattern = "r-lib", + recursive = TRUE + ) + files <- c( + sub("r-pkg", repo_root, current_pkg_path), + sub("r-lib", paste0(repo_root, "__r-lib"), current_lib_path), + ) + + # decode contrib.url from artifact name: + # bin__windows__contrib__4.1 -> bin/windows/contrib/4.1 + new_paths <- gsub("__", "/", files) + # strip superfluous nested dirs + new_paths <- sub(art_path, ".", new_paths) + dirs <- dirname(new_paths) + sapply(dirs, dir.create, recursive = TRUE, showWarnings = FALSE) + + # overwrite allows us to "force push" a new version with the same name + copy_result <- file.copy(current_path, new_paths, overwrite = TRUE) + + if (!all(copy_result)) { + stop("There was an issue while copying the 
files!") + } + - name: Prune Repository + shell: bash + env: + KEEP: ${{ github.event.inputs.keep || 14 }} + run: | + prune() { + # list files | retain $KEEP newest files | delete everything else + ls -t "$@" | tail -n +$((KEEP + 1)) | xargs --no-run-if-empty rm + } + + # find leaf sub dirs + repo_dirs=$(find repo -type d -links 2) + + # Old packages: repo/libarrow/bin/${TARGET}/arrow-${VERSION}.zip + # + # We want to retain $keep (14) versions of each pkg/lib so we call + # prune on each leaf dir and not on repo/. + for dir in "${repo_dirs[@]}"; do + prune $dir/arrow* + done + + # New packages: repo/libarrow/${TARGET}-arrow-${VERSION}.zip + prune repo/libarrow/r-libarrow-darwin-arm64-openssl-1.1-* || : + prune repo/libarrow/r-libarrow-darwin-arm64-openssl-3.0-* || : + prune repo/libarrow/r-libarrow-darwin-x86_64-openssl-1.1-* || : + prune repo/libarrow/r-libarrow-darwin-x86_64-openssl-3.0-* || : + prune repo/libarrow/r-libarrow-linux-x86_64-openssl-1.0-* || : + prune repo/libarrow/r-libarrow-linux-x86_64-openssl-1.1-* || : + prune repo/libarrow/r-libarrow-linux-x86_64-openssl-3.0-* || : + prune repo/libarrow/r-libarrow-windows-x86_64-* || : + - name: Update Repository Index + shell: Rscript {0} + run: | + # folder that we sync to nightlies.apache.org + repo_root <- "repo" + tools::write_PACKAGES(file.path(repo_root, "src/contrib"), + type = "source", + verbose = TRUE, + latestOnly = FALSE + ) + + repo_dirs <- list.dirs(repo_root) + # find dirs with binary R packages: e.g. 
*/contrib/4.1 + pkg_dirs <- grep(".+contrib\\/\\d.+", repo_dirs, value = TRUE) + + + for (dir in pkg_dirs) { + on_win <- grepl("windows", dir) + tools::write_PACKAGES(dir, + type = ifelse(on_win, "win.binary", "mac.binary"), + verbose = TRUE, + latestOnly = FALSE + ) + } + - name: Show repo contents + run: tree repo + - name: Sync to Remote + uses: ./arrow/.github/actions/sync-nightlies + with: + upload: true + switches: -avzh --update --delete --progress + local_path: repo + remote_path: ${{ secrets.NIGHTLIES_RSYNC_PATH }}/arrow/r + remote_host: ${{ secrets.NIGHTLIES_RSYNC_HOST }} + remote_port: ${{ secrets.NIGHTLIES_RSYNC_PORT }} + remote_user: ${{ secrets.NIGHTLIES_RSYNC_USER }} + remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} + remote_host_key: ${{ secrets.NIGHTLIES_RSYNC_HOST_KEY }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 655d67df69e..611e7616796 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,9 +20,8 @@ name: Release on: push: tags: - # Trigger workflow when a tag whose name matches the pattern - # pattern "apache-arrow-{MAJOR}.{MINOR}.{PATCH}" is pushed. - - "apache-arrow-[0-9]+.[0-9]+.[0-9]+" + - "apache-arrow-*" + - "!apache-arrow-*-rc*" permissions: contents: write diff --git a/.github/workflows/release_candidate.yml b/.github/workflows/release_candidate.yml index 5e222be06aa..34c3dd01f04 100644 --- a/.github/workflows/release_candidate.yml +++ b/.github/workflows/release_candidate.yml @@ -15,37 +15,59 @@ # specific language governing permissions and limitations # under the License. -name: Release +name: RC on: push: + branches: + - '**' + - '!dependabot/**' tags: - # Trigger workflow when a tag whose name matches the pattern - # "apache-arrow-{MAJOR}.{MINOR}.{PATCH}-rc{RC_NUM}" is pushed. 
- - "apache-arrow-[0-9]+.[0-9]+.[0-9]+-rc[0-9]+" + - "apache-arrow-*-rc*" + paths: + - ".github/workflows/release_candidate.sh" + - "dev/release/utils-create-release-tarball.sh" + - "dev/release/utils-generate-checksum.sh" + pull_request: + paths: + - ".github/workflows/release_candidate.sh" + - "dev/release/utils-create-release-tarball.sh" + - "dev/release/utils-generate-checksum.sh" permissions: contents: write -env: - GH_TOKEN: ${{ github.token }} - jobs: publish: name: Publish runs-on: ubuntu-latest - timeout-minutes: 5 + timeout-minutes: 10 steps: - name: Checkout Arrow - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y -V gpg reprotest - name: Store Version and Release Candidate Number run: | - version_with_rc=${GITHUB_REF_NAME#apache-arrow-} - version=${version_with_rc%-rc*} - rc_num=${version_with_rc#${version}-rc} - echo "VERSION_WITH_RC=${version_with_rc}" >> ${GITHUB_ENV} + if [ "${GITHUB_REF_TYPE}" = "tag" ]; then + version_with_rc=${GITHUB_REF_NAME#apache-arrow-} + version=${version_with_rc%-rc*} + rc_num=${version_with_rc#${version}-rc} + else + version=$(grep '^set(ARROW_VERSION ' cpp/CMakeLists.txt | \ + grep -E -o '[0-9]+\.[0-9]+\.[0-9]+') + rc_num=999 + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git tag \ + -a \ + "apache-arrow-${version}-rc${rc_num}" \ + -m "Apache Arrow ${version} RC${rc_num}" + fi echo "VERSION=${version}" >> ${GITHUB_ENV} echo "RC_NUM=${rc_num}" >> ${GITHUB_ENV} - name: Create Release Candidate Title @@ -57,11 +79,37 @@ jobs: release_notes="Release Candidate: ${VERSION} RC${RC_NUM}" echo "RELEASE_CANDIDATE_NOTES=${release_notes}" >> ${GITHUB_ENV} - name: Create Release tarball + env: + ARROW_GPG_KEY_UID: A2AC7132B5DA7C273A7A147665F4A8CA9769ECD7 + ARROW_GPG_SECRET_KEY: ${{ secrets.ARROW_GPG_SECRET_KEY }} run: | + sudo reprotest \ + 
"dev/release/utils-create-release-tarball.sh ${VERSION} ${RC_NUM}" \ + apache-arrow-${VERSION}.tar.gz dev/release/utils-create-release-tarball.sh ${VERSION} ${RC_NUM} - echo "RELEASE_TARBALL=apache-arrow-${VERSION}.tar.gz" >> ${GITHUB_ENV} - dev/release/utils-generate-checksum.sh "apache-arrow-${VERSION}.tar.gz" + RELEASE_TARBALL=apache-arrow-${VERSION}.tar.gz + echo "RELEASE_TARBALL=${RELEASE_TARBALL}" >> ${GITHUB_ENV} + dev/release/run-rat.sh "${RELEASE_TARBALL}" + dev/release/utils-generate-checksum.sh "${RELEASE_TARBALL}" + if [ -n "${ARROW_GPG_SECRET_KEY}" ]; then + echo "${ARROW_GPG_SECRET_KEY}" | gpg --import + gpg \ + --armor \ + --detach-sign \ + --local-user "${ARROW_GPG_KEY_UID}" \ + --output "${RELEASE_TARBALL}.asc" \ + "${RELEASE_TARBALL}" + fi + - name: Upload Artifacts + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: release-candidate + path: ${{ env.RELEASE_TARBALL }}* - name: Create GitHub Release + if: | + github.ref_type == 'tag' + env: + GH_TOKEN: ${{ github.token }} run: | gh release create ${GITHUB_REF_NAME} \ --verify-tag \ diff --git a/.github/workflows/report_ci.yml b/.github/workflows/report_ci.yml new file mode 100644 index 00000000000..8d3e6ffc04b --- /dev/null +++ b/.github/workflows/report_ci.yml @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Report CI results + +on: + workflow_call: + +jobs: + report-ci: + runs-on: ubuntu-latest + # We don't have the job id as part of the context neither the job name. + # The GitHub API exposes numeric id or job name but not the github.job (report-ci). + # We match github.job to the name so we can pass it via context in order to be ignored on the report. + # The job is still running. + name: ${{ github.job }} + if: github.event_name == 'schedule' && always() + steps: + - name: Checkout Arrow + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + - name: Setup Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: 3 + - name: Setup Archery + run: python3 -m pip install -e dev/archery[crossbow] + - name: Prepare common options + run: | + if [ "${GITHUB_REPOSITORY}" = "apache/arrow" ]; then + echo "COMMON_OPTIONS=--send" >> "${GITHUB_ENV}" + else + echo "COMMON_OPTIONS=--dry-run" >> "${GITHUB_ENV}" + fi + - name: Send email + env: + GH_TOKEN: ${{ github.token }} + SMTP_PASSWORD: ${{ secrets.ARROW_SMTP_PASSWORD }} + run: | + archery ci report-email \ + --ignore ${{ github.job }} \ + --recipient-email 'builds@arrow.apache.org' \ + --repository ${{ github.repository }} \ + --sender-email 'arrow@commit-email.info' \ + --sender-name Arrow \ + --smtp-port 587 \ + --smtp-server 'commit-email.info' \ + --smtp-user arrow \ + ${COMMON_OPTIONS} \ + ${{ github.run_id }} + - name: Send chat message + if: always() + env: + GH_TOKEN: ${{ github.token }} + 
CHAT_WEBHOOK: ${{ secrets.ARROW_ZULIP_WEBHOOK }} + run: | + archery ci report-chat \ + --ignore ${{ github.job }} \ + --repository ${{ github.repository }} \ + ${COMMON_OPTIONS} \ + ${{ github.run_id }} diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index be90eeee39b..aa9df362f11 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -29,6 +29,7 @@ on: - '.github/workflows/ruby.yml' - 'ci/docker/**' - 'ci/scripts/c_glib_*' + - 'ci/scripts/ccache_setup.sh' - 'ci/scripts/cpp_*' - 'ci/scripts/msys2_*' - 'ci/scripts/ruby_*' @@ -43,6 +44,7 @@ on: - '.github/workflows/ruby.yml' - 'ci/docker/**' - 'ci/scripts/c_glib_*' + - 'ci/scripts/ccache_setup.sh' - 'ci/scripts/cpp_*' - 'ci/scripts/msys2_*' - 'ci/scripts/ruby_*' @@ -79,7 +81,7 @@ jobs: UBUNTU: ${{ matrix.ubuntu }} steps: - name: Checkout Arrow - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive @@ -90,7 +92,7 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3.12 - name: Setup Archery @@ -148,18 +150,16 @@ jobs: ARROW_WITH_ZSTD: ON steps: - name: Checkout Arrow - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive - name: Install Homebrew Dependencies shell: bash run: | - # pkg-config formula is deprecated but it's still installed - # in GitHub Actions runner now. We can remove this once - # pkg-config formula is removed from GitHub Actions runner. 
- brew uninstall pkg-config || : - brew uninstall pkg-config@0.29.2 || : + # We can remove this once GitHub hosted runners include + # Meson 1.8.4 or later by default. + brew update brew bundle --file=cpp/Brewfile brew bundle --file=c_glib/Brewfile # For Meson. @@ -201,7 +201,7 @@ jobs: windows-mingw: name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} GLib & Ruby - runs-on: windows-2019 + runs-on: windows-2022 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 90 strategy: @@ -248,7 +248,7 @@ jobs: /d 1 ` /f - name: Checkout Arrow - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive @@ -366,7 +366,7 @@ jobs: /d 1 ` /f - name: Checkout Arrow - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 submodules: recursive @@ -396,7 +396,7 @@ jobs: # We can invalidate the current cache by updating this. CACHE_VERSION: "2024-05-09" - name: Checkout vcpkg - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 path: vcpkg diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml deleted file mode 100644 index 15c599bad7b..00000000000 --- a/.github/workflows/swift.yml +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Swift - -on: - push: - branches: - - '**' - - '!dependabot/**' - tags: - - '**' - paths: - - '.dockerignore' - - '.github/workflows/swift.yml' - - 'ci/docker/*swift*' - - 'ci/scripts/swift_*' - - 'docker-compose.yml' - - 'swift/**' - pull_request: - paths: - - '.dockerignore' - - '.github/workflows/swift.yml' - - 'ci/docker/*swift*' - - 'ci/scripts/swift_*' - - 'docker-compose.yml' - - 'swift/**' - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -permissions: - contents: read - -env: - ARCHERY_DEBUG: 1 - DOCKER_VOLUME_PREFIX: ".docker/" - -jobs: - docker: - name: AMD 64 Ubuntu Swift 5.10 - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 15 - steps: - - name: Checkout Arrow - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - submodules: recursive - - name: Setup Python on hosted runner - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: 3 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: | - source ci/scripts/util_enable_core_dumps.sh - archery docker run ubuntu-swift - - name: Docker Push - if: >- - success() && - github.event_name == 'push' && - github.repository == 'apache/arrow' && - github.ref_name == 'main' - env: - 
ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - run: archery docker push ubuntu-swift diff --git a/.github/workflows/verify_rc.yml b/.github/workflows/verify_rc.yml index dceb04a4923..b0eaa1924c5 100644 --- a/.github/workflows/verify_rc.yml +++ b/.github/workflows/verify_rc.yml @@ -38,11 +38,45 @@ permissions: env: TEST_DEFAULT: "0" VERBOSE: "1" - RC_TAG: "${{ inputs.rc_tag || github.event_name == 'pull_request' && 'apache-arrow-20.0.0-rc0' || github.ref_name }}" jobs: + target: + name: Target + runs-on: ubuntu-latest + timeout-minutes: 5 + outputs: + version: ${{ steps.detect.outputs.version }} + rc: ${{ steps.detect.outputs.rc }} + steps: + - name: Detect + id: detect + env: + GH_TOKEN: ${{ github.token }} + run: | + case "${GITHUB_EVENT_NAME}" in + workflow_dispatch) + tag="${{ inputs.rc_tag }}" + ;; + pull_request) + tag="$(gh release list \ + --jq '.[] | select(.isPrerelease) | .tagName' \ + --json tagName,isPrerelease \ + --repo ${GITHUB_REPOSITORY} | \ + head -n1)" + ;; + *) + tag="${GITHUB_REF_NAME}" + ;; + esac + package_id=${tag%-rc*} + version=${package_id#apache-arrow-} + rc=${tag#*-rc} + echo "version=${version}" >> "${GITHUB_OUTPUT}" + echo "rc=${rc}" >> "${GITHUB_OUTPUT}" + apt: name: APT + needs: target runs-on: ${{ matrix.runs-on }} timeout-minutes: 30 strategy: @@ -52,33 +86,61 @@ jobs: - ubuntu-latest - ubuntu-24.04-arm env: + RC: ${{ needs.target.outputs.rc }} TEST_APT: "1" + VERSION: ${{ needs.target.outputs.version }} steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 - name: Run run: | - package_id=${RC_TAG%-rc*} - version=${package_id#apache-arrow-} - rc=${RC_TAG#*-rc} - dev/release/verify-release-candidate.sh ${version} ${rc} + dev/release/verify-release-candidate.sh ${VERSION} ${RC} + - name: Verify the previous version 
+ # TODO: We may re-enable this in the future. + # There are some problems for now: + # * We need to specify the previous versions for all + # dependencies explicitly. For example, "apt install + # libarrow-glib-dev=20.0.0-1" doesn't work. We need "apt + # install libarrow-glib-dev=20.0.0-1 + # libarrow-acero-dev=20.0.0-1 libparquet-dev=20.0.0-1 + # gir1.2-arrow-1.0=20.0.0-1 libparquet-dev=20.0.0-1 + # libarrow-dev=20.0.0-1" + continue-on-error: true + run: | + major_version=${VERSION%%.*} + previous_major_version=$((major_version - 1)) + previous_version=${previous_major_version}.0.0 + previous_tag=apache-arrow-${previous_version} + git checkout ${previous_tag} + # This is workaround. dev/release/verify-release-candidate.sh + # in < 21.0.0 doesn't accept 20.0.0 as the VERSION argument. + # We can remove this workaround after 21.0.0 release. + sed \ + -i \ + -e 's/^\(ensure_source_directory\)$/# \1/' \ + -e 's/^\(test_source_distribution\)$/# \1/' \ + dev/release/verify-release-candidate.sh + dev/release/verify-release-candidate.sh ${previous_version} binary: name: Binary + needs: target runs-on: ubuntu-latest timeout-minutes: 30 env: + RC: ${{ needs.target.outputs.rc }} TEST_BINARY: "1" + VERSION: ${{ needs.target.outputs.version }} steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Run run: | - package_id=${RC_TAG%-rc*} - version=${package_id#apache-arrow-} - rc=${RC_TAG#*-rc} - dev/release/verify-release-candidate.sh ${version} ${rc} + dev/release/verify-release-candidate.sh ${VERSION} ${RC} wheels-linux: name: Wheels Linux + needs: target runs-on: ubuntu-latest timeout-minutes: 30 strategy: @@ -90,19 +152,18 @@ jobs: - ubuntu-22.04 - ubuntu-24.04 env: + RC: ${{ needs.target.outputs.rc }} TEST_WHEELS: "1" + VERSION: ${{ needs.target.outputs.version }} steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: 
actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: 3 - name: Setup Archery run: python -m pip install -e dev/archery[docker] - name: Prepare run: | - package_id=${RC_TAG%-rc*} - echo "VERSION=${package_id#apache-arrow-}" >> ${GITHUB_ENV} - echo "RC=${RC_TAG#*-rc}" >> ${GITHUB_ENV} distro=${{ matrix.distro }} if [ "${distro}" = "conda" ]; then echo "SERVICE=${distro}-verify-rc" >> ${GITHUB_ENV} @@ -127,6 +188,7 @@ jobs: wheels-macos: name: Wheels macOS + needs: target runs-on: ${{ matrix.runs-on }} timeout-minutes: 30 strategy: @@ -136,36 +198,32 @@ jobs: - macos-13 - macos-14 env: + RC: ${{ needs.target.outputs.rc }} TEST_WHEELS: "1" + VERSION: ${{ needs.target.outputs.version }} steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Run env: GH_TOKEN: ${{ github.token }} run: | - package_id=${RC_TAG%-rc*} - version=${package_id#apache-arrow-} - rc=${RC_TAG#*-rc} - dev/release/verify-release-candidate.sh ${version} ${rc} + dev/release/verify-release-candidate.sh ${VERSION} ${RC} wheels-windows: name: Wheels Windows + needs: target runs-on: windows-latest timeout-minutes: 45 env: PYARROW_TEST_GDB: "OFF" + RC: ${{ needs.target.outputs.rc }} TEST_WHEELS: "1" + VERSION: ${{ needs.target.outputs.version }} steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: submodules: recursive - - name: Prepare - shell: bash - run: | - package_id=${RC_TAG%-rc*} - echo "VERSION=${package_id#apache-arrow-}" >> ${GITHUB_ENV} - echo "RC=${RC_TAG#*-rc}" >> ${GITHUB_ENV} - - uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3.1.1 + - 
uses: conda-incubator/setup-miniconda@835234971496cad1653abb28a638a281cf32541f # v3.2.0 - name: Install System Dependencies run: | choco install --no-progress --yes boost-msvc-14.1 @@ -182,6 +240,7 @@ jobs: yum: name: Yum + needs: target runs-on: ${{ matrix.runs-on }} timeout-minutes: 30 strategy: @@ -191,12 +250,37 @@ jobs: - ubuntu-latest - ubuntu-24.04-arm env: + RC: ${{ needs.target.outputs.rc }} TEST_YUM: "1" + VERSION: ${{ needs.target.outputs.version }} steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 - name: Run run: | - package_id=${RC_TAG%-rc*} - version=${package_id#apache-arrow-} - rc=${RC_TAG#*-rc} - dev/release/verify-release-candidate.sh ${version} ${rc} + dev/release/verify-release-candidate.sh ${VERSION} ${RC} + - name: Verify the previous version + # TODO: We may re-enable this in the future. + # There are some problems for now: + # * x86_64: libLLVM.so.18.1 needed by gandiva2000-libs on AlmaLinux 9 + # * arm64: libarrow.so.2000.0.0: refers the + # "std::condition_variable::wait(std::unique_lock&)@GLIBCXX_3.4.30" + # (not "...@@GLIBCXX_3.4.30" nor "...@GLIBCXX_3.4.11") symbol on + # AmazonLinux 2023. + continue-on-error: true + run: | + major_version=${VERSION%%.*} + previous_major_version=$((major_version - 1)) + previous_version=${previous_major_version}.0.0 + previous_tag=apache-arrow-${previous_version} + git checkout ${previous_tag} + # This is workaround. dev/release/verify-release-candidate.sh + # in < 21.0.0 doesn't accept 20.0.0 as the VERSION argument. + # We can remove this workaround after 21.0.0 release. 
+ sed \ + -i \ + -e 's/^\(ensure_source_directory\)$/# \1/' \ + -e 's/^\(test_source_distribution\)$/# \1/' \ + dev/release/verify-release-candidate.sh + dev/release/verify-release-candidate.sh ${previous_version} diff --git a/.gitignore b/.gitignore index 52ffa6c6124..8354aa8f816 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,7 @@ # under the License. apache-rat-*.jar -arrow-src.tar -arrow-src.tar.gz +apache-arrow.tar.gz # Compiled source *.a diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2df35dd0837..502bb70c9ad 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,13 +26,20 @@ repos: - id: rat name: Release Audit Tool language: system - entry: bash -c "git archive HEAD --prefix=apache-arrow/ --output=arrow-src.tar && ./dev/release/run-rat.sh arrow-src.tar" + entry: | + bash -c " \ + git archive HEAD \ + --prefix=apache-arrow/ \ + --output=apache-arrow.tar.gz && \ + dev/release/run-rat.sh apache-arrow.tar.gz && \ + rm -f apache-arrow.tar.gz" always_run: true pass_filenames: false - repo: https://github.com/hadolint/hadolint rev: v2.12.0 hooks: - id: hadolint-docker + alias: docker name: Docker Format # We can enable this after we fix all existing lint failures. 
# files: (/Dockerfile|\.dockerfile)$ @@ -42,28 +49,12 @@ repos: ?^ci/docker/python-.*-wheel-windows-test-vs2022.*\.dockerfile$| ) types: [] - - repo: https://github.com/pycqa/flake8 - rev: 6.1.0 - hooks: - - id: flake8 - name: Python Format - files: ^(python|dev|c_glib|integration)/ - types: - - file - - python - exclude: vendored - args: [--config, python/setup.cfg] - - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.12.5 - hooks: - - id: cython-lint - args: [--no-pycodestyle] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v14.0.6 hooks: - id: clang-format name: C++ Format - alias: cpp-format + alias: cpp types_or: - c++ # - json @@ -83,8 +74,10 @@ repos: rev: 1.6.1 hooks: - id: cpplint + alias: cpp name: C++ Lint args: + - "--quiet" - "--verbose=2" types_or: - c++ @@ -103,8 +96,8 @@ repos: rev: v14.0.6 hooks: - id: clang-format + alias: c-glib name: C/GLib Format - alias: c-glib-cpp-format files: >- ^c_glib/ - repo: https://github.com/pre-commit/mirrors-clang-format @@ -115,12 +108,59 @@ repos: alias: matlab-cpp-format files: >- ^matlab/src/cpp/ + - repo: https://github.com/hhatto/autopep8 + rev: v2.3.2 + hooks: + - id: autopep8 + alias: python + name: Python Format + args: + - "--global-config" + - "python/setup.cfg" + - "--ignore-local-config" + - "--in-place" + files: >- + ^(c_glib|dev|python)/ + types: + - file + types_or: + - cython + - python + exclude: >- + ( + ?^python/pyarrow/vendored/| + ) + - repo: https://github.com/pycqa/flake8 + rev: 6.1.0 + hooks: + - id: flake8 + alias: python + name: Python Lint + args: + - "--config" + - "python/setup.cfg" + files: >- + ^(c_glib|dev|python)/ + exclude: >- + ( + ?^python/pyarrow/vendored/| + ) + - repo: https://github.com/MarcoGorelli/cython-lint + rev: v0.16.2 + hooks: + - id: cython-lint + alias: python + name: Python (Cython) Lint + args: + - "--no-pycodestyle" + files: >- + ^python/ - repo: https://github.com/pre-commit/mirrors-clang-format rev: v14.0.6 hooks: - id: clang-format + 
alias: python name: Python (C++) Format - alias: python-cpp-format files: >- ^python/pyarrow/src/ exclude: >- @@ -129,12 +169,75 @@ repos: ?.pb\.(cc|h)$| ?^cpp/src/generated/| ) + - repo: https://github.com/numpy/numpydoc + rev: v1.8.0 + hooks: + - id: numpydoc-validation + name: Python (NumPy doc) Lint + alias: python-doc-lint + args: + - "--config=python" + files: >- + ^python/pyarrow/ + exclude: >- + ( + ?^python/pyarrow/interchange/from_dataframe\.py$| + ?^python/pyarrow/jvm\.py$| + ?^python/pyarrow/pandas_compat\.py$| + ?^python/pyarrow/tests/| + ?^python/pyarrow/util\.py$| + ?^python/pyarrow/vendored/| + ) + - repo: local + hooks: + - id: lintr + alias: r + name: R Lint + language: r + additional_dependencies: + - cyclocomp + - lintr + - testthat + entry: | + Rscript -e "Sys.setenv(NOT_CRAN = 'TRUE'); lintr::expect_lint_free('r')" + pass_filenames: false + files: >- + ^r/.*\.(R|Rmd)$ + - repo: local + hooks: + - id: styler + alias: r + name: R Format + language: r + additional_dependencies: + - roxygen2 + - styler + entry: | + Rscript -e "styler::style_file(commandArgs(TRUE)[1])" + files: >- + ^r/.*\.(R|Rmd)$ - repo: https://github.com/pre-commit/mirrors-clang-format rev: v14.0.6 hooks: - id: clang-format + alias: r name: R (C++) Format - alias: r-cpp-format + files: >- + ^r/src/ + exclude: >- + ( + ?^r/src/arrowExports\.cpp$| + ) + - repo: https://github.com/cpplint/cpplint + rev: 1.6.1 + hooks: + - id: cpplint + alias: r + name: R (C++) Lint + args: + - "--verbose=2" + types_or: + - c++ files: >- ^r/src/ exclude: >- @@ -146,7 +249,7 @@ repos: hooks: - id: rubocop name: Ruby Format - alias: ruby-format + alias: ruby args: - "--autocorrect" exclude: >- @@ -157,6 +260,7 @@ repos: rev: v0.6.13 hooks: - id: cmake-format + alias: cpp name: CMake Format files: >- ( @@ -177,6 +281,7 @@ repos: rev: v0.9.1 hooks: - id: sphinx-lint + alias: docs files: ^docs/source exclude: ^docs/source/python/generated args: [ @@ -189,23 +294,86 @@ repos: rev: v0.10.0 hooks: - id: 
shellcheck + alias: shell # TODO: Remove this when we fix all lint failures files: >- ( + ?^c_glib/test/run-test\.sh$| ?^ci/scripts/c_glib_build\.sh$| ?^ci/scripts/c_glib_test\.sh$| - ?^c_glib/test/run-test\.sh$| + ?^ci/scripts/ccache_setup\.sh$| + ?^ci/scripts/conan_build\.sh$| + ?^ci/scripts/conan_setup\.sh$| + ?^ci/scripts/cpp_test\.sh$| + ?^ci/scripts/download_tz_database\.sh$| + ?^ci/scripts/install_azurite\.sh$| + ?^ci/scripts/install_ccache\.sh$| + ?^ci/scripts/install_ceph\.sh$| + ?^ci/scripts/install_chromedriver\.sh$| + ?^ci/scripts/install_cmake\.sh$| + ?^ci/scripts/install_conda\.sh$| + ?^ci/scripts/install_dask\.sh$| + ?^ci/scripts/install_emscripten\.sh$| + ?^ci/scripts/install_gcs_testbench\.sh$| + ?^ci/scripts/install_iwyu\.sh$| + ?^ci/scripts/install_minio\.sh$| + ?^ci/scripts/install_ninja\.sh$| + ?^ci/scripts/install_numba\.sh$| + ?^ci/scripts/install_numpy\.sh$| + ?^ci/scripts/install_pandas\.sh$| + ?^ci/scripts/install_python\.sh$| + ?^ci/scripts/install_sccache\.sh$| + ?^ci/scripts/install_spark\.sh$| + ?^ci/scripts/install_vcpkg\.sh$| + ?^ci/scripts/integration_arrow_build\.sh$| + ?^ci/scripts/integration_arrow\.sh$| + ?^ci/scripts/integration_dask\.sh$| + ?^ci/scripts/integration_hdfs\.sh$| + ?^ci/scripts/integration_spark\.sh$| + ?^ci/scripts/matlab_build\.sh$| + ?^ci/scripts/msys2_setup\.sh$| + ?^ci/scripts/msys2_system_clean\.sh$| + ?^ci/scripts/msys2_system_upgrade\.sh$| + ?^ci/scripts/nanoarrow_build\.sh$| + ?^ci/scripts/python_build_emscripten\.sh$| + ?^ci/scripts/python_build\.sh$| + ?^ci/scripts/python_sdist_build\.sh$| + ?^ci/scripts/python_sdist_test\.sh$| + ?^ci/scripts/python_wheel_unix_test\.sh$| + ?^ci/scripts/r_build\.sh$| + ?^ci/scripts/r_revdepcheck\.sh$| + ?^ci/scripts/release_test\.sh$| + ?^ci/scripts/ruby_test\.sh$| + ?^ci/scripts/rust_build\.sh$| + ?^ci/scripts/util_enable_core_dumps\.sh$| + ?^ci/scripts/util_free_space\.sh$| + ?^ci/scripts/util_log\.sh$| + ?^cpp/build-support/build-lz4-lib\.sh$| + 
?^cpp/build-support/build-zstd-lib\.sh$| + ?^cpp/build-support/get-upstream-commit\.sh$| + ?^cpp/build-support/update-thrift\.sh$| + ?^cpp/examples/minimal_build/run\.sh$| + ?^cpp/examples/tutorial_examples/run\.sh$| ?^dev/release/05-binary-upload\.sh$| ?^dev/release/07-binary-verify\.sh$| + ?^dev/release/binary-recover\.sh$| ?^dev/release/post-03-binary\.sh$| - ?^dev/release/post-10-docs\.sh$| - ?^dev/release/post-11-python\.sh$| + ?^dev/release/post-08-docs\.sh$| + ?^dev/release/post-09-python\.sh$| + ?^dev/release/setup-rhel-rebuilds\.sh$| ?^dev/release/utils-generate-checksum\.sh$| + ?^python/asv-install\.sh$| + ?^python/asv-uninstall\.sh$| + ?^swift/gen-protobuffers\.sh$| ) - repo: https://github.com/scop/pre-commit-shfmt - rev: v3.11.0-1 + # v3.11.0-1 or later requires pre-commit 3.2.0 or later but Ubuntu + # 22.04 ships pre-commit 2.17.0. We can update this rev after + # Ubuntu 22.04 reaches EOL (June 2027). + rev: v3.10.0-1 hooks: - id: shfmt + alias: shell args: # The default args are "--write --simplify" but we don't use # "--simplify" because it conflicts with ShellCheck.
@@ -214,12 +382,19 @@ repos: files: >- ( ?^dev/release/05-binary-upload\.sh$| + ?^dev/release/binary-recover\.sh$| ?^dev/release/post-03-binary\.sh$| - ?^dev/release/post-10-docs\.sh$| - ?^dev/release/post-11-python\.sh$| + ?^dev/release/post-08-docs\.sh$| + ?^dev/release/post-09-python\.sh$| ) - repo: https://github.com/trim21/pre-commit-mirror-meson - rev: v1.6.1 + rev: v1.9.0 hooks: - id: meson-fmt + alias: cpp args: ['--inplace'] + files: >- + ( + ?.*meson\.build$| + ?.*meson\.options$| + ) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6101f5d3cac..3fb888dee23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,227 @@ +# Apache Arrow 22.0.0 (2025-10-20) + +## Bug Fixes + +* [GH-26727](https://github.com/apache/arrow/issues/26727) - [C++][Flight] Use ipc::RecordBatchWriter with custom IpcPayloadWriter for TransportMessageWriter (DoExchange) (#47410) +* [GH-31603](https://github.com/apache/arrow/issues/31603) - [C++] Wrap Parquet encryption keys in SecureString (#46017) +* [GH-40911](https://github.com/apache/arrow/issues/40911) - [C++][Compute] Fix the decimal division kernel dispatching (#47445) +* [GH-41011](https://github.com/apache/arrow/issues/41011) - [C++][Compute] Fix the issue that comparison function could not handle decimal arguments with different scales (#47459) +* [GH-41110](https://github.com/apache/arrow/issues/41110) - [C#] Handle empty stream in ArrowStreamReaderImplementation (#47098) +* [GH-41336](https://github.com/apache/arrow/issues/41336) - [C++][Compute] Fix case_when kernel dispatch for decimals with different precisions and scales (#47479) +* [GH-42971](https://github.com/apache/arrow/issues/42971) - [C++] Parquet stream writer: Allow writing BYTE_ARRAY with converted type NONE (#44739) +* [GH-43355](https://github.com/apache/arrow/issues/43355) - [C++] Don't require `__once_proxy` in `symbols.map` (#47354) +* [GH-46629](https://github.com/apache/arrow/issues/46629) - [Python] Add options to DatasetFactory.inspect (#46961) +* 
[GH-46690](https://github.com/apache/arrow/issues/46690) - [GLib][CI] Use Meson 1.8.4 or later (#47425) +* [GH-46739](https://github.com/apache/arrow/issues/46739) - [C++] Fix Float16 signed zero/NaN equality comparisons (#46973) +* [GH-46897](https://github.com/apache/arrow/issues/46897) - [Docs][C++][Python] Fix asof join documentation (#46898) +* [GH-46928](https://github.com/apache/arrow/issues/46928) - [C++] Retry on EINTR while opening file in FileOpenReadable (#47629) +* [GH-46942](https://github.com/apache/arrow/issues/46942) - [Docs] Replace the directive versionadded with note (#46997) +* [GH-46946](https://github.com/apache/arrow/issues/46946) - [Python] PyArrow fails compiling without CSV enabled +* [GH-47009](https://github.com/apache/arrow/issues/47009) - [C#] ExportedAllocationOwner should use 64-bit integer to track total allocated memory. (#47011) +* [GH-47016](https://github.com/apache/arrow/issues/47016) - [C++][FlightSQL] Fix negative timestamps to date types (#47017) +* [GH-47027](https://github.com/apache/arrow/issues/47027) - [C++][Parquet] Fix repeated column pages not being written when reaching page size limit (#47032) +* [GH-47029](https://github.com/apache/arrow/issues/47029) - [Archery][Integration] Fix generation of run-end-encoded data (#47653) +* [GH-47039](https://github.com/apache/arrow/issues/47039) - [C++] Bump RapidJSON dependency in Meson configuration (#47041) +* [GH-47051](https://github.com/apache/arrow/issues/47051) - [Python][Release] verify-rc-source-windows Python tests are failing due to MSVC compiler bug +* [GH-47052](https://github.com/apache/arrow/issues/47052) - [CI][C++] Use Alpine Linux 3.22 instead of 3.18 (#47148) +* [GH-47096](https://github.com/apache/arrow/issues/47096) - [CI][R] Drop support for R 4.0 (#47285) +* [GH-47101](https://github.com/apache/arrow/issues/47101) - [Statistics][C++] Implement Statistics specification attribute ARROW:distinct_count:approximate (#47183) +* 
[GH-47124](https://github.com/apache/arrow/issues/47124) - [C++][Dataset] Fix DatasetWriter deadlock on concurrent WriteRecordBatch (#47129) +* [GH-47128](https://github.com/apache/arrow/issues/47128) - [Python] Numba-CUDA interop with NVIDIA bindings (#47150) +* [GH-47130](https://github.com/apache/arrow/issues/47130) - [Packaging][deb] Fix upgrade from 20.0.0-1 (#47343) +* [GH-47131](https://github.com/apache/arrow/issues/47131) - [C#] Fix day off by 1 in Date64Array (#47132) +* [GH-47143](https://github.com/apache/arrow/issues/47143) - [Dev] Ignore `apache-arrow.tar.gz` (#47145) +* [GH-47162](https://github.com/apache/arrow/issues/47162) - [Dev][Release][GLib] Fix indent in generate-version-header.py (#47163) +* [GH-47165](https://github.com/apache/arrow/issues/47165) - [Python] Update s3 test with new non-existent bucket (#47166) +* [GH-47175](https://github.com/apache/arrow/issues/47175) - [C++] Require xsimd 13.0.0 or later (#47221) +* [GH-47179](https://github.com/apache/arrow/issues/47179) - [Python] Revert FileSystem.from_uri to be a staticmethod again (#47178) +* [GH-47203](https://github.com/apache/arrow/issues/47203) - [C++] Restore CMAKE_DEBUG_POSTFIX in building bundled Apache Thrift (#47209) +* [GH-47213](https://github.com/apache/arrow/issues/47213) - [R] Require CMake 3.26 or later (#47217) +* [GH-47229](https://github.com/apache/arrow/issues/47229) - [C++][Arm] Force mimalloc to generate armv8.0 binary (#47766) +* [GH-47234](https://github.com/apache/arrow/issues/47234) - [C++][Python] Add test for fill_null regression on Windows (#47249) +* [GH-47241](https://github.com/apache/arrow/issues/47241) - [C++][Parquet] Fix VariantExtensionType conversion (#47242) +* [GH-47243](https://github.com/apache/arrow/issues/47243) - [C++] Initialize arrow::compute in execution_plan_documentation_examples (#47227) +* [GH-47256](https://github.com/apache/arrow/issues/47256) - [Python] Do not use cffi in free-threaded 3.13 builds (#47313) +* 
[GH-47257](https://github.com/apache/arrow/issues/47257) - [R] Fix truncation of time variables to work with numeric subseconds time with hms bindings (#47278) +* [GH-47265](https://github.com/apache/arrow/issues/47265) - [Ruby] Fix wrong `Time` object detection (#47267) +* [GH-47268](https://github.com/apache/arrow/issues/47268) - [C++][Compute] Fix discarded bad status for call binding (#47284) +* [GH-47277](https://github.com/apache/arrow/issues/47277) - [C++] r-binary-packages nightly failures due to incompatibility with old compiler (#47299) +* [GH-47283](https://github.com/apache/arrow/issues/47283) - [C++] Fix flight visibility issue in Meson configuration (#47298) +* [GH-47287](https://github.com/apache/arrow/issues/47287) - [C++][Compute] Add constraint for kernel signature matching and use it for binary decimal arithmetic kernels (#47297) +* [GH-47301](https://github.com/apache/arrow/issues/47301) - [Python] Fix FileFragment.open() seg fault behavior for file-like objects (#47302) +* [GH-47303](https://github.com/apache/arrow/issues/47303) - [C++] Don't install arrow-compute.pc twice (#47304) +* [GH-47323](https://github.com/apache/arrow/issues/47323) - [R][CI] test-r-rhub-debian-gcc-release-custom-ccache nightly job fails due to update in Debian (#47611) +* [GH-47332](https://github.com/apache/arrow/issues/47332) - [C++][Compute] Fix the issue that the arguments of function call become invalid before wrapping results (#47333) +* [GH-47356](https://github.com/apache/arrow/issues/47356) - [R] NEWS file states version 20.0.0.1 but release package number on CRAN is 20.0.0.2 (#47421) +* [GH-47367](https://github.com/apache/arrow/issues/47367) - [Packaging][Python] Patch vcpkg to show logs and install newer Windows SDK for vs_buildtools (#47484) +* [GH-47373](https://github.com/apache/arrow/issues/47373) - [C++] Raise for invalid decimal precision input from the C Data Interface (#47414) +* [GH-47380](https://github.com/apache/arrow/issues/47380) - [Python] 
Apply maps_as_pydicts to Nested MapScalar Values (#47454) +* [GH-47399](https://github.com/apache/arrow/issues/47399) - [C++] Update bundled Apache ORC to 2.2.0 with Protobuf patch (#47408) +* [GH-47431](https://github.com/apache/arrow/issues/47431) - [C++] Improve Meson configuration for WrapDB distribution (#47541) +* [GH-47434](https://github.com/apache/arrow/issues/47434) - [C++] Fix issue preventing running of tests on Windows (#47455) +* [GH-47440](https://github.com/apache/arrow/issues/47440) - [C++] Accept gflags::gflags as system gflags CMake target (#47468) +* [GH-47446](https://github.com/apache/arrow/issues/47446) - [C++] Update Meson configuration with compute swizzle change (#47448) +* [GH-47451](https://github.com/apache/arrow/issues/47451) - [Python][CI] Install tzdata-legacy in newer python-wheel-manylinux-test images (#47452) +* [GH-47453](https://github.com/apache/arrow/issues/47453) - [Packaging][CI] Token expired to upload nightly wheels +* [GH-47485](https://github.com/apache/arrow/issues/47485) - [C++][CI] Work around Valgrind failure on Azure tests (#47496) +* [GH-47486](https://github.com/apache/arrow/issues/47486) - [Dev][R] Define default R_UPDATE_CLANG (#47487) +* [GH-47491](https://github.com/apache/arrow/issues/47491) - [C++] Don't set include directories to found targets (#47492) +* [GH-47506](https://github.com/apache/arrow/issues/47506) - [CI][Packaging] Fix Amazon Linux 2023 packages verification (#47507) +* [GH-47534](https://github.com/apache/arrow/issues/47534) - [C++] Detect conda-installed packages in Meson CI (#47535) +* [GH-47537](https://github.com/apache/arrow/issues/47537) - [C++] Use pkgconfig name for benchmark in Meson (#47538) +* [GH-47539](https://github.com/apache/arrow/issues/47539) - [C++] Detect Snappy and bzip2 in Meson CI (#47540) +* [GH-47554](https://github.com/apache/arrow/issues/47554) - [C++] Fix Meson Parquet symbol visibility issues (#47556) +* [GH-47560](https://github.com/apache/arrow/issues/47560) - 
[C++] Fix host handling for default HDFS URI (#47458) +* [GH-47570](https://github.com/apache/arrow/issues/47570) - [CI] Don't notify nightly "CI: Extra" result from forks (#47571) +* [GH-47590](https://github.com/apache/arrow/issues/47590) - [C++] Use W functions explicitly for Windows UNICODE compatibility (#47593) +* [GH-47591](https://github.com/apache/arrow/issues/47591) - [C++] Fix passing zlib compression level (#47594) +* [GH-47596](https://github.com/apache/arrow/issues/47596) - [C++][Parquet] Fix printing of large Decimal statistics (#47619) +* [GH-47602](https://github.com/apache/arrow/issues/47602) - [Python] Make Schema hashable even when it has metadata (#47601) +* [GH-47614](https://github.com/apache/arrow/issues/47614) - [CI] Upgrade vcpkg on our CI (#47627) +* [GH-47620](https://github.com/apache/arrow/issues/47620) - [CI][C++] Use Ubuntu 24.04 for ASAN UBSAN job (#47623) +* [GH-47625](https://github.com/apache/arrow/issues/47625) - [Python] Free-threaded musllinux and manylinux wheels started failing with cffi 2.0.0 (#47626) +* [GH-47655](https://github.com/apache/arrow/issues/47655) - [C++][Parquet][CI] Fix failure to generate seed corpus (#47656) +* [GH-47659](https://github.com/apache/arrow/issues/47659) - [C++] Fix Arrow Flight Testing's unresolved external symbol error (#47660) +* [GH-47673](https://github.com/apache/arrow/issues/47673) - [CI][Integration] Fix Go build failure (#47674) +* [GH-47682](https://github.com/apache/arrow/issues/47682) - [R] `install_pyarrow(nightly = TRUE)` installs old pyarrow (#47699) +* [GH-47695](https://github.com/apache/arrow/issues/47695) - [CI][Release] Link arrow-io hdfs_test to c++fs on compilers where std:::filesystem is not default present (#47701) +* [GH-47740](https://github.com/apache/arrow/issues/47740) - [C++][Parquet] Fix undefined behavior when reading invalid Parquet data (#47741) +* [GH-47742](https://github.com/apache/arrow/issues/47742) - [C++][CI] Silence Valgrind leak on protobuf 
initialization (#47743) +* [GH-47748](https://github.com/apache/arrow/issues/47748) - [C++][Dataset] Fix link error on macOS (#47749) +* [GH-47795](https://github.com/apache/arrow/issues/47795) - [Archery] Add support for custom Docker registry (#47796) +* [GH-47803](https://github.com/apache/arrow/issues/47803) - [C++][Parquet] Fix read out of bounds on invalid RLE data (#47804) +* [GH-47809](https://github.com/apache/arrow/issues/47809) - [CI][Release] Fix Windows verification job trying to install patch from conda (#47810) +* [GH-47819](https://github.com/apache/arrow/issues/47819) - [CI][Packaging][Release] Avoid triggering Linux packages on release branch push (#47826) +* [GH-47838](https://github.com/apache/arrow/issues/47838) - [C++][Parquet] Set Variant specification version to 1 to align with the variant spec (#47835) + + +## New Features and Improvements + +* [GH-20125](https://github.com/apache/arrow/issues/20125) - [Docs][Python] Restructure developers/python.rst (#47334) +* [GH-30036](https://github.com/apache/arrow/issues/30036) - [C++] Timezone-aware kernels should handle offset strings (e.g. "+04:30") (#12865) +* [GH-38211](https://github.com/apache/arrow/issues/38211) - [MATLAB] Add support for creating an empty `arrow.tabular.RecordBatch` by calling `arrow.recordBatch` with no input arguments (#47060) +* [GH-38213](https://github.com/apache/arrow/issues/38213) - [MATLAB] Create a superclass for tabular type MATLAB tests (i.e. for `Table` and `RecordBatch`) (#47107) +* [GH-38422](https://github.com/apache/arrow/issues/38422) - [MATLAB] Add `NumNulls` property to `arrow.array.Array` class (#47116) +* [GH-38532](https://github.com/apache/arrow/issues/38532) - [MATLAB] Add a `validate` method to all `arrow.array.Array` classes (#47059) +* [GH-38572](https://github.com/apache/arrow/issues/38572) - [Docs][MATLAB] Update `arrow/matlab/README.md` with the latest change. 
(#47109) +* [GH-39875](https://github.com/apache/arrow/issues/39875) - [C++] Why arrow decimal divide precision and scale is not correct? +* [GH-41108](https://github.com/apache/arrow/issues/41108) - [Docs] Remove Sphinx pin (#47326) +* [GH-41239](https://github.com/apache/arrow/issues/41239) - [C++] Support to write csv header without quotes (#47524) +* [GH-41476](https://github.com/apache/arrow/issues/41476) - [Python][C++] Impossible to specify `is_adjusted_to_utc` for `Time` type when writing to Parquet (#47316) +* [GH-42137](https://github.com/apache/arrow/issues/42137) - [CI][Python] Add Python Windows GitHub Action and remove AppVeyor (#47567) +* [GH-43662](https://github.com/apache/arrow/issues/43662) - [R] Add binding to stringr::str_replace_na() (#47521) +* [GH-43694](https://github.com/apache/arrow/issues/43694) - [C++] Add `Executor *` Option to `arrow::dataset::ScanOptions` (#43698) +* [GH-43904](https://github.com/apache/arrow/issues/43904) - [CI][Python] Stop uploading nightly wheels to gemfury (#47470) +* [GH-44345](https://github.com/apache/arrow/issues/44345) - [C++][Parquet] Add Decimal32/64 support to Parquet (#47427) +* [GH-44800](https://github.com/apache/arrow/issues/44800) - [C#] Implement Flight SQL Client (#44783) +* [GH-45055](https://github.com/apache/arrow/issues/45055) - [C++][Flight] Update Flight Server RecordBatchStreamImpl to reuse ipc::RecordBatchWriter with custom IpcPayloadWriter instead of manually generating FlightPayload (#47115) +* [GH-45056](https://github.com/apache/arrow/issues/45056) - [C++][Flight] Fully support dictionary replacement in Flight +* [GH-45382](https://github.com/apache/arrow/issues/45382) - [Python] Add support for pandas DataFrame.attrs (#47147) +* [GH-45639](https://github.com/apache/arrow/issues/45639) - [C++][Statistics] Add support for ARROW:average_byte_width:{exac,approximate} (#46385) +* [GH-45860](https://github.com/apache/arrow/issues/45860) - [C++] Respect CPU affinity in cpu_count and 
ThreadPool default capacity (#47152) +* [GH-45921](https://github.com/apache/arrow/issues/45921) - [Release][R] Use GitHub Release not apache.jfrog.io (#45964) +* [GH-46137](https://github.com/apache/arrow/issues/46137) - [C++] Replace grpc-cpp conda package with libgrpc (#47606) +* [GH-46272](https://github.com/apache/arrow/issues/46272) - [C++] Build Arrow libraries with `-Wmissing-definitions` on gcc (#47042) +* [GH-46374](https://github.com/apache/arrow/issues/46374) - [Python][Doc] Improve docs to specify that source argument on parquet.read_table can also be a list of strings (#47142) +* [GH-46410](https://github.com/apache/arrow/issues/46410) - [C++] Add parquet options to Meson configuration (#46647) +* [GH-46669](https://github.com/apache/arrow/issues/46669) - [CI][Archery] Automate Zulip and email notifications for Extra CI (#47546) +* [GH-46728](https://github.com/apache/arrow/issues/46728) - [Python] Skip test_gdb.py tests if PyArrow wasn't built debug (#46755) +* [GH-46835](https://github.com/apache/arrow/issues/46835) - [C++] Add more configuration options to arrow::EqualOptions (#47204) +* [GH-46860](https://github.com/apache/arrow/issues/46860) - [C++] Making HalfFloatBuilder accept Float16 as well as uint16_t (#46981) +* [GH-46905](https://github.com/apache/arrow/issues/46905) - [C++][Parquet] Expose Statistics.is_{min/max}_value_exact and default set to true if min/max are set (#46992) +* [GH-46908](https://github.com/apache/arrow/issues/46908) - [Docs][Format] Add variant extension type docs (#47456) +* [GH-46937](https://github.com/apache/arrow/issues/46937) - [C++] Enable arrow::EqualOptions for arrow::Table (#47164) +* [GH-46938](https://github.com/apache/arrow/issues/46938) - [C++] Enhance arrow::ChunkedArray::Equals to support floating-point comparison when values share the same memory (#47044) +* [GH-46939](https://github.com/apache/arrow/issues/46939) - [C++] Add support for shared memory comparison in arrow::RecordBatch (#47149) +* 
[GH-46962](https://github.com/apache/arrow/issues/46962) - [C++][Parquet] Generic xsimd function and dynamic dispatch for Byte Stream Split (#46963) +* [GH-46971](https://github.com/apache/arrow/issues/46971) - [C++][Parquet] Use temporary buffers when decrypting Parquet data pages (#46972) +* [GH-46982](https://github.com/apache/arrow/issues/46982) - [C++] Remove Boost dependency from hdfs_test (#47200) +* [GH-47005](https://github.com/apache/arrow/issues/47005) - [C++] Disable exporting CMake packages (#47006) +* [GH-47012](https://github.com/apache/arrow/issues/47012) - [C++][Parquet] Reserve values correctly when reading BYTE_ARRAY and FLBA (#47013) +* [GH-47040](https://github.com/apache/arrow/issues/47040) - [C++] Refine reset of Span to be reusable (#47004) +* [GH-47045](https://github.com/apache/arrow/issues/47045) - [CI][C++] Use Fedora 42 instead of 39 (#47046) +* [GH-47047](https://github.com/apache/arrow/issues/47047) - [CI][C++] Use Google Cloud Storage Testbench v0.55.0 (#47048) +* [GH-47058](https://github.com/apache/arrow/issues/47058) - [Release] Update Release Management Guide to reflect status in preparation for Arrow 22 (#47474) +* [GH-47075](https://github.com/apache/arrow/issues/47075) - [Release][Dev] Use GH_TOKEN as GitHub token environment variable (#47181) +* [GH-47084](https://github.com/apache/arrow/issues/47084) - [Release] Stop using https://dist.apache.org/repos/dist/dev/arrow/KEYS (#47182) +* [GH-47088](https://github.com/apache/arrow/issues/47088) - [CI][Dev] Fix shellcheck errors in the ci/scripts/integration_arrow.sh (#47089) +* [GH-47102](https://github.com/apache/arrow/issues/47102) - [Statistics][C++] Implement Statistics specification attribute ARROW:max_byte_width:{exact,approximate} Component: C++ (#47463) +* [GH-47106](https://github.com/apache/arrow/issues/47106) - [R] Update R package to use R 4.1+ native forward pipe syntax (#47622) +* [GH-47112](https://github.com/apache/arrow/issues/47112) - [Parquet][C++] Rle 
BitPacked parser (#47294) +* [GH-47120](https://github.com/apache/arrow/issues/47120) - [R] Update NEWS for 21.0.0 (#47121) +* [GH-47123](https://github.com/apache/arrow/issues/47123) - [Python] Add Enums to PyArrow Types (#47139) +* [GH-47125](https://github.com/apache/arrow/issues/47125) - [CI][Dev] Fix shellcheck errors in the ci/scripts/integration_hdfs.sh (#47126) +* [GH-47137](https://github.com/apache/arrow/issues/47137) - [Python][dependency-groups] ` (#47176) +* [GH-47153](https://github.com/apache/arrow/issues/47153) - [Docs][C++] Update cmake target table in build_system.rst with newly added targets (#47154) +* [GH-47157](https://github.com/apache/arrow/issues/47157) - [Docs] Improve presentation of Other available packages section in build_system.rst (#47411) +* [GH-47172](https://github.com/apache/arrow/issues/47172) - [Python] Add a utility function to create Arrow table instead of pandas df (#47199) +* [GH-47184](https://github.com/apache/arrow/issues/47184) - [Parquet][C++] Avoid multiplication overflow in FixedSizeBinaryBuilder::Reserve (#47185) +* [GH-47191](https://github.com/apache/arrow/issues/47191) - [R] Turn GCS back on by default on MacOS source builds (#47192) +* [GH-47193](https://github.com/apache/arrow/issues/47193) - [R] Update R Makefile to exclude flight odbc from cpp sync (#47194) +* [GH-47205](https://github.com/apache/arrow/issues/47205) - [C++] Suppress GNU variadic macro warnings (#47286) +* [GH-47208](https://github.com/apache/arrow/issues/47208) - [C++][CI] Add a CI job for C++23 (#47261) +* [GH-47208](https://github.com/apache/arrow/issues/47208) - [C++] Update bundled s2n-tls to 1.5.23 (#47220) +* [GH-47211](https://github.com/apache/arrow/issues/47211) - [CI][R] Disable non-system memory allocators when on linux-devel (#47212) +* [GH-47218](https://github.com/apache/arrow/issues/47218) - [C++] Update bundled s2n-tls +* [GH-47222](https://github.com/apache/arrow/issues/47222) - [CI][C++] Add a CI job that uses the same build 
options for JNI on macOS (#47305) +* [GH-47223](https://github.com/apache/arrow/issues/47223) - [Release] Use "upstream" as apache/arrow{,-site} remote name (#47224) +* [GH-47225](https://github.com/apache/arrow/issues/47225) - [C++] Remove Skyhook (#47262) +* [GH-47232](https://github.com/apache/arrow/issues/47232) - [Ruby] Suppress warnings in test with Ruby 3.5 (#47233) +* [GH-47244](https://github.com/apache/arrow/issues/47244) - [CI][Dev] Fix shellcheck errors in the ci/scripts/msys2_setup.sh (#47245) +* [GH-47258](https://github.com/apache/arrow/issues/47258) - [Release] Set `date:` for apache/arrow-site's `_release/${VERSION}.md` (#47260) +* [GH-47263](https://github.com/apache/arrow/issues/47263) - [MATLAB] Add `NumNulls` property to `arrow.array.ChunkedArray` class (#47264) +* [GH-47289](https://github.com/apache/arrow/issues/47289) - [CI][Dev] Fix shellcheck errors in the ci/scripts/python_build_emscripten.sh (#47290) +* [GH-47291](https://github.com/apache/arrow/issues/47291) - [C++] Update bundled aws-c-common to 0.12.4 (#47292) +* [GH-47306](https://github.com/apache/arrow/issues/47306) - [CI][Dev] Fix shellcheck errors in the ci/scripts/python_build.sh (#47307) +* [GH-47312](https://github.com/apache/arrow/issues/47312) - [Packaging] Add support for Debian forky (#47342) +* [GH-47317](https://github.com/apache/arrow/issues/47317) - [C++][C++23][Gandiva] Use pointer for Cache test (#47318) +* [GH-47319](https://github.com/apache/arrow/issues/47319) - [CI] Fix actions/checkout hash version comments (#47320) +* [GH-47321](https://github.com/apache/arrow/issues/47321) - [CI][Dev] Fix shellcheck errors in the ci/scripts/python_sdist_test.sh (#47322) +* [GH-47338](https://github.com/apache/arrow/issues/47338) - [C++][Python] Remove deprecated string-based Parquet encryption methods (#47339) +* [GH-47349](https://github.com/apache/arrow/issues/47349) - [C++] Include request ID in AWS S3 Error (#47351) +* 
[GH-47358](https://github.com/apache/arrow/issues/47358) - [Python] IPC and Flight options representation (#47461) +* [GH-47370](https://github.com/apache/arrow/issues/47370) - [Python] Require Cython 3.1 (#47396) +* [GH-47375](https://github.com/apache/arrow/issues/47375) - [C++][Compute] Move scatter function into compute core (#47378) +* [GH-47384](https://github.com/apache/arrow/issues/47384) - [C++][Acero] Isolate BackpressureHandler from ExecNode (#47386) +* [GH-47395](https://github.com/apache/arrow/issues/47395) - [R] Update fedora-clang to install latest clang version to match CRAN setup (#47206) +* [GH-47401](https://github.com/apache/arrow/issues/47401) - [C++] Remove needless Snappy patch (#47407) +* [GH-47404](https://github.com/apache/arrow/issues/47404) - [Ruby] Remove needless `require "extpp/setup"` (#47405) +* [GH-47412](https://github.com/apache/arrow/issues/47412) - [C++] Use inlineshidden visibility in Meson configuration (#47413) +* [GH-47422](https://github.com/apache/arrow/issues/47422) - [Python][C++][Flight] Expose ipc::ReadStats in Flight MetadataRecordBatchReader (#47432) +* [GH-47438](https://github.com/apache/arrow/issues/47438) - [Python][Packaging] Set up wheel building for Python 3.14 (#47616) +* [GH-47443](https://github.com/apache/arrow/issues/47443) - [Python][Packaging] Drop Python 3.9 support (#47478) +* [GH-47449](https://github.com/apache/arrow/issues/47449) - [C++][Parquet] Do not drop all Statistics if SortOrder is UNKNOWN (#47466) +* [GH-47469](https://github.com/apache/arrow/issues/47469) - [C++][Gandiva] Add support for LLVM 21.1.0 (#47473) +* [GH-47483](https://github.com/apache/arrow/issues/47483) - [C++] Bump vendored xxhash to 0.8.3 (#47476) +* [GH-47500](https://github.com/apache/arrow/issues/47500) - [C++] Add QualifierAlignment to clang-format options (#47501) +* [GH-47505](https://github.com/apache/arrow/issues/47505) - [CI][C#][Integration] Use apache/arrow-dotnet (#47508) +* 
[GH-47509](https://github.com/apache/arrow/issues/47509) - [CI][Packaging][Linux] Enable Docker build cache (#47510) +* [GH-47512](https://github.com/apache/arrow/issues/47512) - [C++] Bump meson-fmt in pre-commit to 1.9.0 (#47513) +* [GH-47514](https://github.com/apache/arrow/issues/47514) - [C++][Parquet] Add unpack tests and benchmarks (#47515) +* [GH-47516](https://github.com/apache/arrow/issues/47516) - [C++][FlightRPC] Initial ODBC driver framework (#47517) +* [GH-47518](https://github.com/apache/arrow/issues/47518) - [C++][FlightRPC] Replace `spdlogs` with Arrow's Internal Logging (#47645) +* [GH-47523](https://github.com/apache/arrow/issues/47523) - [C#] Remove csharp/ (#47547) +* [GH-47543](https://github.com/apache/arrow/issues/47543) - [C++] Search for system install of Azure libraries with Meson (#47544) +* [GH-47552](https://github.com/apache/arrow/issues/47552) - [C++] Fix creating wrong object by `FixedShapeTensorType::MakeArray()` (#47533) +* [GH-47575](https://github.com/apache/arrow/issues/47575) - [Python] add quoting_header option to pyarrow WriterOptions (#47610) +* [GH-47582](https://github.com/apache/arrow/issues/47582) - [CI][Packaging] Move linux-packaging tasks to apache/arrow repository (#47600) +* [GH-47584](https://github.com/apache/arrow/issues/47584) - [C++][CI] Remove "large memory" mark from TestListArray::TestOverflowCheck (#47585) +* [GH-47588](https://github.com/apache/arrow/issues/47588) - [C++] Bump mimalloc version to 3.1.5 (#47589) +* [GH-47597](https://github.com/apache/arrow/issues/47597) - [C++][Parquet] Fuzz more data types (#47621) +* [GH-47632](https://github.com/apache/arrow/issues/47632) - [CI][C++] Add a CI job for JNI on Linux (#47746) +* [GH-47633](https://github.com/apache/arrow/issues/47633) - [Dev][Integration] Write all files with `--write_generated_json` (#47634) +* [GH-47639](https://github.com/apache/arrow/issues/47639) - [Benchmarking] Clean up conbench config (#47638) +* 
[GH-47646](https://github.com/apache/arrow/issues/47646) - [C++][FlightRPC] Follow Naming Convention (#47658) +* [GH-47648](https://github.com/apache/arrow/issues/47648) - [Archery][Integration] More granularity in JSON test cases (#47649) +* [GH-47650](https://github.com/apache/arrow/issues/47650) - [Archery][Integration] Add option to generate gold files (#47651) +* [GH-47679](https://github.com/apache/arrow/issues/47679) - [C++] Register arrow compute calls in ODBC (#47680) +* [GH-47704](https://github.com/apache/arrow/issues/47704) - [R] Update paths in nightly libarrow upload job (#47727) +* [GH-47705](https://github.com/apache/arrow/issues/47705) - [R][CI] Migrate rhub debian-gcc-release to equivalent supported image (#47730) +* [GH-47738](https://github.com/apache/arrow/issues/47738) - [R] Update NEWS.md for 22.0.0 (#47739) + + + # Apache Arrow 6.0.1 (2021-11-18) ## Bug Fixes diff --git a/CPPLINT.cfg b/CPPLINT.cfg index 2f47b4dbf57..dd1139ac7f8 100644 --- a/CPPLINT.cfg +++ b/CPPLINT.cfg @@ -26,5 +26,7 @@ filter = -readability/alt_tokens filter = -readability/casting filter = -readability/todo filter = -runtime/references +# Let the formatter do the job for whitespaces filter = -whitespace/comments +filter = -whitespace/braces linelength = 90 diff --git a/LICENSE.txt b/LICENSE.txt index 7d5de9e3bfe..2c90f0313d7 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -783,16 +783,6 @@ License: http://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This project includes code from the Google styleguide. - -* cpp/build-support/cpplint.py is based on the scripts from the Google styleguide. - -Copyright: 2009 Google Inc. All rights reserved. -Homepage: https://github.com/google/styleguide -License: 3-clause BSD - --------------------------------------------------------------------------------- - This project includes code from Snappy. 
* cpp/cmake_modules/{SnappyCMakeLists.txt,SnappyConfig.h} are based on code @@ -2290,3 +2280,46 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- +The files cpp/src/arrow/vendored/whereami/whereami.h, +cpp/src/arrow/vendored/whereami/whereami.cc are adapted from +Grégory Pakosz's whereami library (https://github.com/gpakosz/whereami) +It is dual licensed under both the WTFPLv2 and MIT licenses. + +The WTFPLv2 License + DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE + Version 2, December 2004 + + Copyright (C) 2004 Sam Hocevar + + Everyone is permitted to copy and distribute verbatim or modified + copies of this license document, and changing it is allowed as long + as the name is changed. + + DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. You just DO WHAT THE FUCK YOU WANT TO. + 1. Bla bla bla + 2. Montesqieu et camembert, vive la France, zut alors! + +The MIT License (MIT) +Copyright Gregory Pakosz + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/NOTICE.txt b/NOTICE.txt index 2089c6fb203..9b98364d2ab 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -17,9 +17,6 @@ https://github.com/libdynd This product includes software from the LLVM project * distributed under the University of Illinois Open Source -This product includes software from the google-lint project - * Copyright (c) 2009 Google Inc. All rights reserved. - This product includes software from the mman-win32 project * Copyright https://code.google.com/p/mman-win32/ * Licensed under the MIT License; diff --git a/README.md b/README.md index c557716a4a8..49d56fe1099 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/arrow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:arrow) [![License](http://img.shields.io/:license-Apache%202-blue.svg)](https://github.com/apache/arrow/blob/main/LICENSE.txt) -[![Twitter Follow](https://img.shields.io/twitter/follow/apachearrow.svg?style=social&label=Follow)](https://twitter.com/apachearrow) +[![BlueSky Follow](https://img.shields.io/badge/bluesky-Follow-blue?logo=bluesky)](https://bsky.app/profile/arrow.apache.org) ## Powering In-Memory Analytics @@ -31,26 +31,33 @@ enable data systems to efficiently store, process, and move data. 
Major components of the project include: - - [The Arrow Columnar In-Memory Format](https://arrow.apache.org/docs/dev/format/Columnar.html): + - [The Arrow Columnar Format](https://arrow.apache.org/docs/dev/format/Columnar.html): a standard and efficient in-memory representation of various datatypes, plain or nested - [The Arrow IPC Format](https://arrow.apache.org/docs/dev/format/Columnar.html#serialization-and-interprocess-communication-ipc): an efficient serialization of the Arrow format and associated metadata, for communication between processes and heterogeneous environments + - [ADBC (Arrow Database Connectivity)](https://github.com/apache/arrow-adbc/) `↗`: Arrow-powered API, + drivers, and libraries for access to databases and query engines - [The Arrow Flight RPC protocol](https://github.com/apache/arrow/tree/main/format/Flight.proto): based on the Arrow IPC format, a building block for remote services exchanging Arrow data with application-defined semantics (for example a storage server or a database) - [C++ libraries](https://github.com/apache/arrow/tree/main/cpp) - [C bindings using GLib](https://github.com/apache/arrow/tree/main/c_glib) - - [C# .NET libraries](https://github.com/apache/arrow/tree/main/csharp) + - [.NET libraries](https://github.com/apache/arrow-dotnet) - [Gandiva](https://github.com/apache/arrow/tree/main/cpp/src/gandiva): an [LLVM](https://llvm.org)-based Arrow expression compiler, part of the C++ codebase - - [Go libraries](https://github.com/apache/arrow-go) - - [Java libraries](https://github.com/apache/arrow-java) - - [JavaScript libraries](https://github.com/apache/arrow/tree/main/js) + - [Go libraries](https://github.com/apache/arrow-go) `↗` + - [Java libraries](https://github.com/apache/arrow-java) `↗` + - [JavaScript libraries](https://github.com/apache/arrow-js) `↗` + - [Julia implementation](https://github.com/apache/arrow-julia) `↗` - [Python libraries](https://github.com/apache/arrow/tree/main/python) - [R 
libraries](https://github.com/apache/arrow/tree/main/r) - [Ruby libraries](https://github.com/apache/arrow/tree/main/ruby) - - [Rust libraries](https://github.com/apache/arrow-rs) + - [Rust libraries](https://github.com/apache/arrow-rs) `↗` + - [Swift libraries](https://github.com/apache/arrow-swift) `↗` + +The `↗` icon denotes that this component of the project is maintained in a separate +repository. Arrow is an [Apache Software Foundation](https://www.apache.org) project. Learn more at [arrow.apache.org](https://arrow.apache.org). diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 9e4582f1d8d..00000000000 --- a/appveyor.yml +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# Operating system (build VM template) -os: Visual Studio 2019 - -only_commits: - # Skip commits not related to Python or C++ - files: - - appveyor.yml - - ci/appveyor* - - ci/conda* - - ci/scripts/*.bat - - cpp/ - - format/ - - python/ - -cache: - - C:\Users\appveyor\AppData\Local\ccache - -matrix: - fast_finish: true - -environment: - global: - APPVEYOR_SAVE_CACHE_ON_ERROR: true - MSVC_DEFAULT_OPTIONS: ON - - ARCH: "64" - ARROW_BUILD_FLIGHT: "ON" - ARROW_BUILD_FLIGHT_SQL: "ON" - ARROW_BUILD_GANDIVA: "ON" - ARROW_GCS: "ON" - ARROW_ORC: "ON" - ARROW_S3: "ON" - GENERATOR: Ninja - PYTHON: "3.10" - -before_build: - - call ci\appveyor-cpp-setup.bat - -build_script: - - call ci\appveyor-cpp-build.bat - -# Disable test discovery -test: off - -after_build: - - ccache -sv diff --git a/c_glib/arrow-cuda-glib/meson.build b/c_glib/arrow-cuda-glib/meson.build index 0f93d95ca01..d80145aedda 100644 --- a/c_glib/arrow-cuda-glib/meson.build +++ b/c_glib/arrow-cuda-glib/meson.build @@ -76,8 +76,7 @@ endif if have_gi gir_dependencies = [declare_dependency(sources: arrow_glib_gir)] - gir_extra_args = [ - '--warn-all', + gir_extra_args = gir_scanner_extra_args + [ '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', ] arrow_cuda_glib_gir = gnome.generate_gir( diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 5cb61fc462c..62cc7b16627 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -119,8 +119,7 @@ if have_gi libarrow_dataset_glib, dependencies: declare_dependency(sources: arrow_glib_gir), export_packages: 'arrow-dataset-glib', - extra_args: [ - '--warn-all', + extra_args: gir_scanner_extra_args + [ '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', ], header: 'arrow-dataset-glib/arrow-dataset-glib.h', diff --git a/c_glib/arrow-flight-glib/meson.build b/c_glib/arrow-flight-glib/meson.build index 87fc4734312..be89cf772a3 100644 --- a/c_glib/arrow-flight-glib/meson.build +++ 
b/c_glib/arrow-flight-glib/meson.build @@ -83,8 +83,7 @@ if have_gi libarrow_flight_glib, dependencies: declare_dependency(sources: arrow_glib_gir), export_packages: 'arrow-flight-glib', - extra_args: [ - '--warn-all', + extra_args: gir_scanner_extra_args + [ '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', ], header: 'arrow-flight-glib/arrow-flight-glib.h', diff --git a/c_glib/arrow-flight-sql-glib/meson.build b/c_glib/arrow-flight-sql-glib/meson.build index aa6798e763c..0b8a158bbc8 100644 --- a/c_glib/arrow-flight-sql-glib/meson.build +++ b/c_glib/arrow-flight-sql-glib/meson.build @@ -81,8 +81,7 @@ if have_gi libarrow_flight_sql_glib, dependencies: arrow_flight_sql_glib_gir_dependencies, export_packages: 'arrow-flight-sql-glib', - extra_args: [ - '--warn-all', + extra_args: gir_scanner_extra_args + [ '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', '--include-uninstalled=./arrow-flight-glib/ArrowFlight-1.0.gir', ], diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index 8127344b3a5..8320e30c99c 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -27,6 +27,7 @@ #include +#include #include G_BEGIN_DECLS @@ -476,6 +477,98 @@ garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics) } } +/** + * garrow_array_statistics_has_distinct_count: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: %TRUE if the distinct count is available, %FALSE otherwise. + * + * Since: 21.0.0 + */ +gboolean +garrow_array_statistics_has_distinct_count(GArrowArrayStatistics *statistics) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); + return priv->statistics.distinct_count.has_value(); +} + +/** + * garrow_array_statistics_is_distinct_count_exact: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: %TRUE if the distinct count is available and exact, %FALSE otherwise. 
+ * + * Since: 22.0.0 + */ +gboolean +garrow_array_statistics_is_distinct_count_exact(GArrowArrayStatistics *statistics) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); + return priv->statistics.distinct_count.has_value() && + std::holds_alternative(priv->statistics.distinct_count.value()); +} + +/** + * garrow_array_statistics_get_distinct_count: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: 0 or larger value if @statistics has a valid distinct count value, + * -1 otherwise. + * + * Since: 21.0.0 + * + * Deprecated: 22.0.0. Use garrow_array_statistics_is_distinct_count_exact(), + * garrow_array_statistics_get_distinct_count_exact() and + * garrow_array_statistics_get_distinct_count_approximate() instead. + */ +gint64 +garrow_array_statistics_get_distinct_count(GArrowArrayStatistics *statistics) +{ + return garrow_array_statistics_get_distinct_count_exact(statistics); +} + +/** + * garrow_array_statistics_get_distinct_count_exact: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: 0 or larger value if @statistics has a valid exact distinct count + * value, -1 otherwise. + * + * Since: 22.0.0 + */ +gint64 +garrow_array_statistics_get_distinct_count_exact(GArrowArrayStatistics *statistics) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); + const auto &distinct_count = priv->statistics.distinct_count; + if (distinct_count && std::holds_alternative(distinct_count.value())) { + return std::get(distinct_count.value()); + } else { + return -1; + } +} + +/** + * garrow_array_statistics_get_distinct_count_approximate: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: Non `NaN` value if @statistics has a valid approximate distinct count + * value, `NaN` otherwise. 
+ * + * Since: 22.0.0 + */ +gdouble +garrow_array_statistics_get_distinct_count_approximate(GArrowArrayStatistics *statistics) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); + const auto &distinct_count = priv->statistics.distinct_count; + if (distinct_count && std::holds_alternative(distinct_count.value())) { + return std::get(distinct_count.value()); + } else { + return std::nan(""); + } +} + typedef struct GArrowArrayPrivate_ { std::shared_ptr array; diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h index 615d6793b08..4021c16f060 100644 --- a/c_glib/arrow-glib/basic-array.h +++ b/c_glib/arrow-glib/basic-array.h @@ -58,6 +58,25 @@ GARROW_AVAILABLE_IN_20_0 gint64 garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics); +GARROW_AVAILABLE_IN_21_0 +gboolean +garrow_array_statistics_has_distinct_count(GArrowArrayStatistics *statistics); +GARROW_AVAILABLE_IN_22_0 +gboolean +garrow_array_statistics_is_distinct_count_exact(GArrowArrayStatistics *statistics); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_AVAILABLE_IN_21_0 +GARROW_DEPRECATED_IN_22_0_FOR(garrow_array_statistics_get_distinct_count_exact) +gint64 +garrow_array_statistics_get_distinct_count(GArrowArrayStatistics *statistics); +#endif +GARROW_AVAILABLE_IN_22_0 +gint64 +garrow_array_statistics_get_distinct_count_exact(GArrowArrayStatistics *statistics); +GARROW_AVAILABLE_IN_22_0 +gdouble +garrow_array_statistics_get_distinct_count_approximate(GArrowArrayStatistics *statistics); + GARROW_AVAILABLE_IN_6_0 GArrowArray * garrow_array_import(gpointer c_abi_array, GArrowDataType *data_type, GError **error); diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index c195af7de03..51fffb73693 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -26,6 +26,8 @@ #include #include +#include +#include G_BEGIN_DECLS @@ -131,6 +133,10 @@ G_BEGIN_DECLS * #GArrowStringViewDataType is a 
class for the string view data type. * * #GArrowBinaryViewDataType is a class for the binary view data type. + * + * #GArrowFixedShapeTensorDataType is a class for the fixed shape tensor data type. + * + * #GArrowUUIDDataType is a class for UUID data type. */ struct GArrowDataTypePrivate @@ -2267,6 +2273,253 @@ garrow_string_view_data_type_new(void) return data_type; } +enum { + PROP_N_DIMENSIONS = 1 +}; + +G_DEFINE_TYPE(GArrowFixedShapeTensorDataType, + garrow_fixed_shape_tensor_data_type, + GARROW_TYPE_EXTENSION_DATA_TYPE) + +static void +garrow_fixed_shape_tensor_data_type_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + switch (prop_id) { + case PROP_N_DIMENSIONS: + { + auto arrow_data_type = + std::static_pointer_cast( + garrow_data_type_get_raw(GARROW_DATA_TYPE(object))); + g_value_set_uint64(value, arrow_data_type->ndim()); + } + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_fixed_shape_tensor_data_type_init(GArrowFixedShapeTensorDataType *object) +{ +} + +static void +garrow_fixed_shape_tensor_data_type_class_init(GArrowFixedShapeTensorDataTypeClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->get_property = garrow_fixed_shape_tensor_data_type_get_property; + + /** + * GArrowFixedShapeTensorDataType::n-dimensions: + * + * The number of dimensions of tensor elements. + * + * Since: 21.0.0 + */ + spec = g_param_spec_uint64("n_dimensions", + "N dimensions", + "Number of dimensions of tensor elements", + 0, + G_MAXUINT64, + 0, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_N_DIMENSIONS, spec); +} + +/** + * garrow_fixed_shape_tensor_data_type_new: + * @value_type: A #GArrowDataType of individual tensor elements. + * @shape: (array length=shape_length): A physical shape of the contained tensors as an + * array. + * @shape_length: The length of `shape`. 
+ * @permutation: (array length=permutation_length) (nullable): An indices of the desired + * ordering of the original dimensions, defined as an array. This must be `NULL` or + * the same length array of `shape`. + * @permutation_length: The length of `permutation`. + * @dim_names: (array length=n_dim_names) (nullable): Explicit names to tensor dimensions + * as an array. This must be `NULL` or the same length array of `shape`. + * @n_dim_names. The length of `dim_names`. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: The newly created fixed shape tensor data type. + * + * Since: 21.0.0 + */ +GArrowFixedShapeTensorDataType * +garrow_fixed_shape_tensor_data_type_new(GArrowDataType *value_type, + const gint64 *shape, + gsize shape_length, + const gint64 *permutation, + gsize permutation_length, + const gchar **dim_names, + gsize n_dim_names, + GError **error) +{ + std::vector arrow_shape; + std::vector arrow_permutation; + std::vector arrow_dim_names; + + auto arrow_value_type = garrow_data_type_get_raw(value_type); + + for (gsize i = 0; i < shape_length; i++) { + arrow_shape.push_back(shape[i]); + } + + for (gsize i = 0; i < permutation_length; i++) { + arrow_permutation.push_back(permutation[i]); + } + + for (gsize i = 0; i < n_dim_names; i++) { + arrow_dim_names.push_back(dim_names[i]); + } + + auto arrow_data_type_result = + arrow::extension::FixedShapeTensorType::Make(arrow_value_type, + arrow_shape, + arrow_permutation, + arrow_dim_names); + if (!garrow::check(error, arrow_data_type_result, "[fixed-shape-tensor][new]")) { + return NULL; + } + + auto arrow_data_type = *arrow_data_type_result; + auto data_type = GARROW_FIXED_SHAPE_TENSOR_DATA_TYPE( + g_object_new(GARROW_TYPE_FIXED_SHAPE_TENSOR_DATA_TYPE, + "data-type", + &arrow_data_type, + NULL)); + return data_type; +} + +/** + * garrow_fixed_shape_tensor_data_type_get_shape: + * @data_type: A #GArrowFixedShapeTensorDataType. 
+ * @length: (out): Return location for the number of dimensions of the tensor. + * + * Returns: (array length=length): Shape of the tensor. + * + * Since: 21.0.0 + */ +const gint64 * +garrow_fixed_shape_tensor_data_type_get_shape(GArrowFixedShapeTensorDataType *data_type, + gsize *length) +{ + auto arrow_data_type = std::static_pointer_cast( + garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type))); + + const auto &arrow_shape = arrow_data_type->shape(); + *length = arrow_shape.size(); + return arrow_shape.data(); +} + +/** + * garrow_fixed_shape_tensor_data_type_get_permutation: + * @data_type: A #GArrowFixedShapeTensorDataType. + * @length: (out): Return location for the number of elements of permutation. + * + * Returns: (array length=length): Permutation of the tensor. + * + * Since: 21.0.0 + */ +const gint64 * +garrow_fixed_shape_tensor_data_type_get_permutation( + GArrowFixedShapeTensorDataType *data_type, gsize *length) +{ + auto arrow_data_type = std::static_pointer_cast( + garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type))); + + const auto &arrow_permutation = arrow_data_type->permutation(); + *length = arrow_permutation.size(); + return arrow_permutation.data(); +} + +/** + * garrow_fixed_shape_tensor_data_type_get_dim_names: + * @data_type: A #GArrowFixedShapeTensorDataType. + * + * Returns: (array zero-terminated=1) (element-type utf8) (transfer full): + * Dimention names of the tensor. + * + * It's a %NULL-terminated string array. It must be freed with + * g_strfreev() when no longer needed. 
+ * + * Since: 21.0.0 + */ +gchar ** +garrow_fixed_shape_tensor_data_type_get_dim_names( + GArrowFixedShapeTensorDataType *data_type) +{ + auto arrow_data_type = std::static_pointer_cast( + garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type))); + const auto &arrow_dim_names = arrow_data_type->dim_names(); + auto n = arrow_dim_names.size(); + auto dim_names = g_new(gchar *, n + 1); + for (size_t i = 0; i < n; ++i) { + dim_names[i] = g_strndup(arrow_dim_names[i].data(), arrow_dim_names[i].size()); + } + dim_names[n] = nullptr; + return dim_names; +} + +/** + * garrow_fixed_shape_tensor_data_type_get_strides: + * @data_type: A #GArrowFixedShapeTensorDataType. + * @length: (out): Return location for the number of strides of tensor shape. + * + * Returns: (array length=length): Strides in bytes for each tensor dimension. + * + * Since: 21.0.0 + */ +const gint64 * +garrow_fixed_shape_tensor_data_type_get_strides(GArrowFixedShapeTensorDataType *data_type, + gsize *length) +{ + auto arrow_data_type = std::static_pointer_cast( + garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type))); + const auto &arrow_strides = arrow_data_type->strides(); + *length = arrow_strides.size(); + return arrow_strides.data(); +} + +G_DEFINE_TYPE(GArrowUUIDDataType, garrow_uuid_data_type, GARROW_TYPE_EXTENSION_DATA_TYPE) + +static void +garrow_uuid_data_type_init(GArrowUUIDDataType *object) +{ +} + +static void +garrow_uuid_data_type_class_init(GArrowUUIDDataTypeClass *klass) +{ +} + +/* + * garrow_uuid_data_type_new: + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): + * The newly created UUID data type on success, %NULL on error. 
+ * + * Since: 21.0.0 + */ +GArrowUUIDDataType * +garrow_uuid_data_type_new(GError **error) +{ + auto arrow_data_type_result = arrow::extension::UuidType::Make(); + if (garrow::check(error, arrow_data_type_result, "[uuid-data-type][new]")) { + auto arrow_data_type = *arrow_data_type_result; + return GARROW_UUID_DATA_TYPE( + g_object_new(GARROW_TYPE_UUID_DATA_TYPE, "data-type", &arrow_data_type, NULL)); + } else { + return NULL; + } +} G_END_DECLS GArrowDataType * @@ -2399,6 +2652,9 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) } type = GARROW_TYPE_EXTENSION_DATA_TYPE; break; + case arrow::Type::type::FIXED_SIZE_LIST: + type = GARROW_TYPE_FIXED_SIZE_LIST_DATA_TYPE; + break; case arrow::Type::type::RUN_END_ENCODED: type = GARROW_TYPE_RUN_END_ENCODED_DATA_TYPE; break; diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index b692395e481..d3998bd81b8 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -802,4 +802,63 @@ GARROW_AVAILABLE_IN_19_0 GArrowStringViewDataType * garrow_string_view_data_type_new(void); +#define GARROW_TYPE_FIXED_SHAPE_TENSOR_DATA_TYPE \ + (garrow_fixed_shape_tensor_data_type_get_type()) +GARROW_AVAILABLE_IN_21_0 +G_DECLARE_DERIVABLE_TYPE(GArrowFixedShapeTensorDataType, + garrow_fixed_shape_tensor_data_type, + GARROW, + FIXED_SHAPE_TENSOR_DATA_TYPE, + GArrowExtensionDataType) +struct _GArrowFixedShapeTensorDataTypeClass +{ + GArrowExtensionDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_21_0 +GArrowFixedShapeTensorDataType * +garrow_fixed_shape_tensor_data_type_new(GArrowDataType *value_type, + const gint64 *shape, + gsize shape_length, + const gint64 *permutation, + gsize permutation_length, + const gchar **dim_names, + gsize n_dim_names, + GError **error); + +GARROW_AVAILABLE_IN_21_0 +const gint64 * +garrow_fixed_shape_tensor_data_type_get_shape(GArrowFixedShapeTensorDataType *data_type, + gsize *length); + +GARROW_AVAILABLE_IN_21_0 +const gint64 
* +garrow_fixed_shape_tensor_data_type_get_permutation( + GArrowFixedShapeTensorDataType *data_type, gsize *length); + +GARROW_AVAILABLE_IN_21_0 +gchar ** +garrow_fixed_shape_tensor_data_type_get_dim_names( + GArrowFixedShapeTensorDataType *data_type); + +GARROW_AVAILABLE_IN_21_0 +const gint64 * +garrow_fixed_shape_tensor_data_type_get_strides(GArrowFixedShapeTensorDataType *data_type, + gsize *length); + +#define GARROW_TYPE_UUID_DATA_TYPE (garrow_uuid_data_type_get_type()) +GARROW_AVAILABLE_IN_21_0 +G_DECLARE_DERIVABLE_TYPE(GArrowUUIDDataType, + garrow_uuid_data_type, + GARROW, + UUID_DATA_TYPE, + GArrowExtensionDataType) +struct _GArrowUUIDDataTypeClass +{ + GArrowExtensionDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_21_0 +GArrowUUIDDataType * +garrow_uuid_data_type_new(GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/chunked-array.cpp b/c_glib/arrow-glib/chunked-array.cpp index 39c8faad6c0..8870e71894a 100644 --- a/c_glib/arrow-glib/chunked-array.cpp +++ b/c_glib/arrow-glib/chunked-array.cpp @@ -23,6 +23,8 @@ #include #include +#include + #include G_BEGIN_DECLS @@ -406,6 +408,56 @@ garrow_chunked_array_combine(GArrowChunkedArray *chunked_array, GError **error) } } +/** + * garrow_chunked_array_import: + * @c_abi_array_stream: (not nullable): A `struct ArrowArrayStream *`. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): An imported chunked array on success, + * %NULL on error. + * + * Since: 21.0.0 + */ +GArrowChunkedArray * +garrow_chunked_array_import(gpointer c_abi_array_stream, GError **error) +{ + auto arrow_chunked_array_result = + arrow::ImportChunkedArray(static_cast(c_abi_array_stream)); + if (garrow::check(error, arrow_chunked_array_result, "[chunked-array][import]")) { + return garrow_chunked_array_new_raw(&(*arrow_chunked_array_result)); + } else { + return NULL; + } +} + +/** + * garrow_chunked_array_export: + * @chunked_array: A #GArrowChunkedArray to be exported. 
+ * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): An exported chunked array as + * `struct ArrowArrayStream *` on success, %NULL on error. + * It should be freed with the `ArrowArrayStream::release` callback then + * g_free() when no longer needed. + * + * Since: 21.0.0 + */ +gpointer +garrow_chunked_array_export(GArrowChunkedArray *chunked_array, GError **error) +{ + const auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + auto c_abi_array_stream = g_new(struct ArrowArrayStream, 1); + auto status = + arrow::ExportChunkedArray(arrow_chunked_array, + static_cast(c_abi_array_stream)); + if (garrow::check(error, status, "[chunked-array][export]")) { + return c_abi_array_stream; + } else { + g_free(c_abi_array_stream); + return NULL; + } +} + G_END_DECLS GArrowChunkedArray * diff --git a/c_glib/arrow-glib/chunked-array.h b/c_glib/arrow-glib/chunked-array.h index 712d16504f6..27fedb6a4fe 100644 --- a/c_glib/arrow-glib/chunked-array.h +++ b/c_glib/arrow-glib/chunked-array.h @@ -83,4 +83,12 @@ GARROW_AVAILABLE_IN_4_0 GArrowArray * garrow_chunked_array_combine(GArrowChunkedArray *chunked_array, GError **error); +GARROW_AVAILABLE_IN_21_0 +GArrowChunkedArray * +garrow_chunked_array_import(gpointer c_abi_array_stream, GError **error); + +GARROW_AVAILABLE_IN_21_0 +gpointer +garrow_chunked_array_export(GArrowChunkedArray *chunked_array, GError **error); + G_END_DECLS diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index 914bb3196fa..3c216867da2 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -32,6 +32,8 @@ G_BEGIN_DECLS * @title: Composite data type classes * @include: arrow-glib/arrow-glib.h * + * #GArrowBaseListDataType is an abstract class for list data type. + * * #GArrowListDataType is a class for list data type. 
* * #GArrowLargeListDataType is a class for 64-bit offsets list data type. @@ -49,9 +51,43 @@ G_BEGIN_DECLS * #GArrowDictionaryDataType is a class for dictionary data type. * * #GArrowRunEndEncodedDataType is a class for run end encoded data type. + * + * #GArrowFixedSizeListDataType is a class for fixed size list data type. */ -G_DEFINE_TYPE(GArrowListDataType, garrow_list_data_type, GARROW_TYPE_DATA_TYPE) +G_DEFINE_TYPE(GArrowBaseListDataType, garrow_base_list_data_type, GARROW_TYPE_DATA_TYPE) + +static void +garrow_base_list_data_type_init(GArrowBaseListDataType *object) +{ +} + +static void +garrow_base_list_data_type_class_init(GArrowBaseListDataTypeClass *klass) +{ +} + +/** + * garrow_base_list_data_type_get_field: + * @base_list_data_type: A #GArrowBaseListDataType. + * + * Returns: (transfer full): The field of value. + * + * Since: 21.0.0 + */ +GArrowField * +garrow_base_list_data_type_get_field(GArrowBaseListDataType *base_list_data_type) +{ + auto data_type = GARROW_DATA_TYPE(base_list_data_type); + auto arrow_data_type = garrow_data_type_get_raw(data_type); + auto arrow_base_list_data_type = + std::static_pointer_cast(arrow_data_type); + + auto arrow_field = arrow_base_list_data_type->value_field(); + return garrow_field_new_raw(&arrow_field, nullptr); +} + +G_DEFINE_TYPE(GArrowListDataType, garrow_list_data_type, GARROW_TYPE_BASE_LIST_DATA_TYPE) static void garrow_list_data_type_init(GArrowListDataType *object) @@ -102,16 +138,14 @@ garrow_list_data_type_get_value_field(GArrowListDataType *list_data_type) * Returns: (transfer full): The field of value. * * Since: 0.13.0 + * + * Deprecated: 21.0.0: + * Use garrow_base_list_data_type_get_field() instead. 
*/ GArrowField * garrow_list_data_type_get_field(GArrowListDataType *list_data_type) { - auto data_type = GARROW_DATA_TYPE(list_data_type); - auto arrow_data_type = garrow_data_type_get_raw(data_type); - auto arrow_list_data_type = static_cast(arrow_data_type.get()); - - auto arrow_field = arrow_list_data_type->value_field(); - return garrow_field_new_raw(&arrow_field, nullptr); + return garrow_base_list_data_type_get_field(GARROW_BASE_LIST_DATA_TYPE(list_data_type)); } G_DEFINE_TYPE(GArrowLargeListDataType, garrow_large_list_data_type, GARROW_TYPE_DATA_TYPE) @@ -753,4 +787,93 @@ garrow_run_end_encoded_data_type_get_value_data_type( return garrow_data_type_new_raw(&arrow_value_data_type); } +enum { + PROP_LIST_SIZE = 1 +}; + +G_DEFINE_TYPE(GArrowFixedSizeListDataType, + garrow_fixed_size_list_data_type, + GARROW_TYPE_BASE_LIST_DATA_TYPE) + +static void +garrow_fixed_size_list_data_type_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(object)); + const auto arrow_fixed_size_list_type = + std::static_pointer_cast(arrow_data_type); + + switch (prop_id) { + case PROP_LIST_SIZE: + g_value_set_int(value, arrow_fixed_size_list_type->list_size()); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_fixed_size_list_data_type_class_init(GArrowFixedSizeListDataTypeClass *klass) +{ + GObjectClass *gobject_class; + GParamSpec *spec; + + gobject_class = G_OBJECT_CLASS(klass); + gobject_class->get_property = garrow_fixed_size_list_data_type_get_property; + + spec = g_param_spec_int("list-size", + "List size", + "The list size of the elements", + 0, + G_MAXINT, + 0, + G_PARAM_READABLE); + g_object_class_install_property(gobject_class, PROP_LIST_SIZE, spec); +} + +static void +garrow_fixed_size_list_data_type_init(GArrowFixedSizeListDataType *object) +{ +} + +/** + * 
garrow_fixed_size_list_data_type_new_data_type: + * @value_type: The data type of an element of each list. + * @list_size: The size of each list. + * + * Returns: A newly created fixed size list data type. + * + * Since: 21.0.0 + */ +GArrowFixedSizeListDataType * +garrow_fixed_size_list_data_type_new_data_type(GArrowDataType *value_type, + gint32 list_size) +{ + auto arrow_value_type = garrow_data_type_get_raw(value_type); + auto arrow_fixed_size_list_data_type = + arrow::fixed_size_list(arrow_value_type, list_size); + return GARROW_FIXED_SIZE_LIST_DATA_TYPE( + garrow_data_type_new_raw(&arrow_fixed_size_list_data_type)); +} + +/** + * garrow_fixed_size_list_data_type_new_field: + * @field: The field of lists. + * @list_size: The size of value. + * + * Returns: A newly created fixed size list data type. + * + * Since: 21.0.0 + */ +GArrowFixedSizeListDataType * +garrow_fixed_size_list_data_type_new_field(GArrowField *field, gint32 list_size) +{ + auto arrow_field = garrow_field_get_raw(field); + auto arrow_fixed_size_list_data_type = arrow::fixed_size_list(arrow_field, list_size); + return GARROW_FIXED_SIZE_LIST_DATA_TYPE( + garrow_data_type_new_raw(&arrow_fixed_size_list_data_type)); +} G_END_DECLS diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h index 7a0a462af00..207647bd46a 100644 --- a/c_glib/arrow-glib/composite-data-type.h +++ b/c_glib/arrow-glib/composite-data-type.h @@ -26,13 +26,32 @@ G_BEGIN_DECLS +#define GARROW_TYPE_BASE_LIST_DATA_TYPE (garrow_base_list_data_type_get_type()) +GARROW_AVAILABLE_IN_21_0 +G_DECLARE_DERIVABLE_TYPE(GArrowBaseListDataType, + garrow_base_list_data_type, + GARROW, + BASE_LIST_DATA_TYPE, + GArrowDataType) +struct _GArrowBaseListDataTypeClass +{ + GArrowDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_21_0 +GArrowField * +garrow_base_list_data_type_get_field(GArrowBaseListDataType *base_list_data_type); + #define GARROW_TYPE_LIST_DATA_TYPE (garrow_list_data_type_get_type()) 
GARROW_AVAILABLE_IN_ALL -G_DECLARE_DERIVABLE_TYPE( - GArrowListDataType, garrow_list_data_type, GARROW, LIST_DATA_TYPE, GArrowDataType) +G_DECLARE_DERIVABLE_TYPE(GArrowListDataType, + garrow_list_data_type, + GARROW, + LIST_DATA_TYPE, + GArrowBaseListDataType) struct _GArrowListDataTypeClass { - GArrowDataTypeClass parent_class; + GArrowBaseListDataTypeClass parent_class; }; GARROW_AVAILABLE_IN_ALL @@ -241,4 +260,25 @@ GArrowDataType * garrow_run_end_encoded_data_type_get_value_data_type( GArrowRunEndEncodedDataType *data_type); +#define GARROW_TYPE_FIXED_SIZE_LIST_DATA_TYPE \ + (garrow_fixed_size_list_data_type_get_type()) +GARROW_AVAILABLE_IN_21_0 +G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeListDataType, + garrow_fixed_size_list_data_type, + GARROW, + FIXED_SIZE_LIST_DATA_TYPE, + GArrowBaseListDataType) +struct _GArrowFixedSizeListDataTypeClass +{ + GArrowBaseListDataTypeClass parent_class; +}; + +GARROW_AVAILABLE_IN_21_0 +GArrowFixedSizeListDataType * +garrow_fixed_size_list_data_type_new_data_type(GArrowDataType *value_type, + gint32 list_size); + +GARROW_AVAILABLE_IN_21_0 +GArrowFixedSizeListDataType * +garrow_fixed_size_list_data_type_new_field(GArrowField *field, gint32 list_size); G_END_DECLS diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 9b9faeb4495..5f494f3bc7b 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -36,6 +36,7 @@ #include #include +#include template typename ArrowType::c_type @@ -160,6 +161,9 @@ G_BEGIN_DECLS * @title: Computation on data * @include: arrow-glib/arrow-glib.h * + * You must call garrow_compute_initialize() explicitly before you use + * computation related features. + * * #GArrowExecuteContext is a class to customize how to execute a * function. * @@ -250,6 +254,25 @@ G_BEGIN_DECLS * There are many functions to compute data on an array. */ +/** + * garrow_compute_initialize: + * @error: (nullable): Return location for a #GError or %NULL. 
+ * + * You must call this explicitly before you use computation related + * features. + * + * Returns: %TRUE if initializing the compute module completed successfully, + * %FALSE otherwise. + * + * Since: 21.0.0 + */ +gboolean +garrow_compute_initialize(GError **error) +{ + auto status = arrow::compute::Initialize(); + return garrow::check(error, status, "[compute][initialize]"); +} + typedef struct GArrowExecuteContextPrivate_ { arrow::compute::ExecContext context; diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 54b0ddb014f..0f689d147e3 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -25,6 +25,10 @@ G_BEGIN_DECLS +GARROW_AVAILABLE_IN_21_0 +gboolean +garrow_compute_initialize(GError **error); + #define GARROW_TYPE_EXECUTE_CONTEXT (garrow_execute_context_get_type()) GARROW_AVAILABLE_IN_1_0 G_DECLARE_DERIVABLE_TYPE( diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index a5e67463102..b755ffb56ac 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -223,7 +223,7 @@ gio = cxx.find_library('gio-2.0', dirs: [gobject_libdir], required: false) if not gio.found() gio = dependency('gio-2.0') endif -dependencies = [arrow, arrow_acero, gobject, gio] +dependencies = [arrow_acero, arrow_compute, arrow, gobject, gio] libarrow_glib = library( 'arrow-glib', sources: sources + enums, @@ -266,7 +266,7 @@ if have_gi arrow_glib_gir = gnome.generate_gir( libarrow_glib, export_packages: 'arrow-glib', - extra_args: ['--warn-all'], + extra_args: gir_scanner_extra_args, header: 'arrow-glib/arrow-glib.h', identifier_prefix: 'GArrow', includes: ['GObject-2.0', 'Gio-2.0'], diff --git a/c_glib/gandiva-glib/meson.build b/c_glib/gandiva-glib/meson.build index 267b01344f5..c609526a6c8 100644 --- a/c_glib/gandiva-glib/meson.build +++ b/c_glib/gandiva-glib/meson.build @@ -123,8 +123,7 @@ if have_gi libgandiva_glib, dependencies: declare_dependency(sources: arrow_glib_gir), 
export_packages: 'gandiva-glib', - extra_args: [ - '--warn-all', + extra_args: gir_scanner_extra_args + [ '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', ], header: 'gandiva-glib/gandiva-glib.h', diff --git a/c_glib/meson.build b/c_glib/meson.build index fd931cc6a48..f10a8042545 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -34,7 +34,7 @@ project( # * 22.04: 0.61.2 # * 24.04: 1.3.2 meson_version: '>=0.61.2', - version: '20.0.0-SNAPSHOT', + version: '22.0.0', ) version = meson.project_version() @@ -147,7 +147,13 @@ if arrow_cpp_build_lib_dir == '' modules: ['ArrowCUDA::arrow_cuda_shared'], required: false, ) - # we do not support compiling GLib without Acero engine + # we do not support compiling GLib without Compute and Acero engine + arrow_compute = dependency( + 'arrow-compute', + 'ArrowCompute', + kwargs: common_args, + modules: ['ArrowCompute::arrow_compute_shared'], + ) arrow_acero = dependency( 'arrow-acero', 'ArrowAcero', @@ -215,6 +221,11 @@ main(void) dirs: [arrow_cpp_build_lib_dir], required: false, ) + arrow_compute = cpp_compiler.find_library( + 'arrow_compute', + dirs: [arrow_cpp_build_lib_dir], + required: true, + ) arrow_acero = cpp_compiler.find_library( 'arrow_acero', dirs: [arrow_cpp_build_lib_dir], @@ -258,6 +269,19 @@ python = import('python') python3 = python.find_installation('python3') generate_version_header_py = project_source_root / 'tool' / 'generate-version-header.py' +c_compiler = meson.get_compiler('c') +# Specify -fPIE explicitly for g-ir-scanner because PIE is disabled by +# default on AlmaLinux 9 RPM build by +# LDFLAGS="-specs=/usr/lib/rpm/redhat/redhat-hardened-ld". 
+gir_scanner_cflags = c_compiler.get_supported_arguments('-fPIE') +if gir_scanner_cflags.length() == 0 + gir_scanner_extra_args = [] +else + gir_scanner_extra_args = ['--cflags-begin'] + gir_scanner_cflags + [ + '--cflags-end', + ] +endif + subdir('arrow-glib') if arrow_cuda.found() subdir('arrow-cuda-glib') diff --git a/c_glib/parquet-glib/meson.build b/c_glib/parquet-glib/meson.build index a85ba18f30f..4d1931199fb 100644 --- a/c_glib/parquet-glib/meson.build +++ b/c_glib/parquet-glib/meson.build @@ -97,8 +97,7 @@ if have_gi libparquet_glib, dependencies: declare_dependency(sources: arrow_glib_gir), export_packages: 'parquet-glib', - extra_args: [ - '--warn-all', + extra_args: gir_scanner_extra_args + [ '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', ], header: 'parquet-glib/parquet-glib.h', diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb index 46d2ebe3f6e..9fdcdcdce0e 100755 --- a/c_glib/test/run-test.rb +++ b/c_glib/test/run-test.rb @@ -31,6 +31,7 @@ Gio = GI.load("Gio") Arrow = GI.load("Arrow") +Arrow.compute_initialize module Arrow class Buffer alias_method :initialize_raw, :initialize diff --git a/c_glib/test/test-array-statistics.rb b/c_glib/test/test-array-statistics.rb index bf470b4e722..03407b7e340 100644 --- a/c_glib/test/test-array-statistics.rb +++ b/c_glib/test/test-array-statistics.rb @@ -48,4 +48,26 @@ def setup test("#null_count") do assert_equal(1, @statistics.null_count) end + + test("#has_distinct_count?") do + assert do + not @statistics.has_distinct_count? + end + end + + test("#distinct_count_exact?") do + assert do + not @statistics.distinct_count_exact? + end + end + + test ("#distinct_count_exact") do + assert_equal(-1, @statistics.distinct_count_exact) + end + + test ("#distinct_count_approximate") do + assert do + @statistics.distinct_count_approximate.nan? 
+ end + end end diff --git a/c_glib/test/test-chunked-array.rb b/c_glib/test/test-chunked-array.rb index 86bd23af6f5..2d5dd5cd642 100644 --- a/c_glib/test/test-chunked-array.rb +++ b/c_glib/test/test-chunked-array.rb @@ -144,4 +144,15 @@ def test_combine assert_equal(build_boolean_array([true, false, nil]), chunked_array.combine) end + + def test_export_import + chunks = [ + build_boolean_array([true, false, true]), + build_boolean_array([false, nil]), + ] + original_chunked_array = Arrow::ChunkedArray.new(chunks) + c_abi_array_stream = original_chunked_array.export + assert_equal(original_chunked_array, + Arrow::ChunkedArray.import(c_abi_array_stream)) + end end diff --git a/c_glib/test/test-decimal128.rb b/c_glib/test/test-decimal128.rb index d032afd510d..c9405326fae 100644 --- a/c_glib/test/test-decimal128.rb +++ b/c_glib/test/test-decimal128.rb @@ -123,7 +123,7 @@ def test_divide_zero decimal1 = Arrow::Decimal128.new(23423445) decimal2 = Arrow::Decimal128.new(0) message = - "[decimal128][divide]: Invalid: Division by 0 in Decimal128" + "[decimal128][divide]: Invalid: Division by 0 in Decimal" assert_raise(Arrow::Error::Invalid.new(message)) do decimal1.divide(decimal2) end @@ -236,7 +236,7 @@ def test_rescale_fail decimal = Arrow::Decimal128.new(10) message = "[decimal128][rescale]: Invalid: " + - "Rescaling Decimal128 value would cause data loss" + "Rescaling Decimal value would cause data loss" assert_raise(Arrow::Error::Invalid.new(message)) do decimal.rescale(1, -1) end diff --git a/c_glib/test/test-decimal256.rb b/c_glib/test/test-decimal256.rb index 24fd3b5552b..0592972286b 100644 --- a/c_glib/test/test-decimal256.rb +++ b/c_glib/test/test-decimal256.rb @@ -110,7 +110,7 @@ def test_divide_zero decimal1 = Arrow::Decimal256.new(23423445) decimal2 = Arrow::Decimal256.new(0) message = - "[decimal256][divide]: Invalid: Division by 0 in Decimal256" + "[decimal256][divide]: Invalid: Division by 0 in Decimal" assert_raise(Arrow::Error::Invalid.new(message)) do 
decimal1.divide(decimal2) end @@ -223,7 +223,7 @@ def test_rescale_fail decimal = Arrow::Decimal256.new(10) message = "[decimal256][rescale]: Invalid: " + - "Rescaling Decimal256 value would cause data loss" + "Rescaling Decimal value would cause data loss" assert_raise(Arrow::Error::Invalid.new(message)) do decimal.rescale(1, -1) end diff --git a/c_glib/test/test-decimal32.rb b/c_glib/test/test-decimal32.rb index 33b84ccc6b5..83b719251f7 100644 --- a/c_glib/test/test-decimal32.rb +++ b/c_glib/test/test-decimal32.rb @@ -106,7 +106,7 @@ def test_divide_zero decimal1 = Arrow::Decimal32.new(23423445) decimal2 = Arrow::Decimal32.new(0) message = - "[decimal32][divide]: Invalid: Division by 0 in Decimal32" + "[decimal32][divide]: Invalid: Division by 0 in Decimal" assert_raise(Arrow::Error::Invalid.new(message)) do decimal1.divide(decimal2) end @@ -214,7 +214,7 @@ def test_rescale_fail decimal = Arrow::Decimal32.new(10) message = "[decimal32][rescale]: Invalid: " + - "Rescaling Decimal32 value would cause data loss" + "Rescaling Decimal value would cause data loss" assert_raise(Arrow::Error::Invalid.new(message)) do decimal.rescale(1, -1) end diff --git a/c_glib/test/test-decimal64.rb b/c_glib/test/test-decimal64.rb index add4f3e0b49..3fd7f3b4198 100644 --- a/c_glib/test/test-decimal64.rb +++ b/c_glib/test/test-decimal64.rb @@ -106,7 +106,7 @@ def test_divide_zero decimal1 = Arrow::Decimal64.new(23423445) decimal2 = Arrow::Decimal64.new(0) message = - "[decimal64][divide]: Invalid: Division by 0 in Decimal64" + "[decimal64][divide]: Invalid: Division by 0 in Decimal" assert_raise(Arrow::Error::Invalid.new(message)) do decimal1.divide(decimal2) end @@ -214,7 +214,7 @@ def test_rescale_fail decimal = Arrow::Decimal64.new(10) message = "[decimal64][rescale]: Invalid: " + - "Rescaling Decimal64 value would cause data loss" + "Rescaling Decimal value would cause data loss" assert_raise(Arrow::Error::Invalid.new(message)) do decimal.rescale(1, -1) end diff --git 
a/c_glib/test/test-fixed-shape-tensor-data-type.rb b/c_glib/test/test-fixed-shape-tensor-data-type.rb new file mode 100644 index 00000000000..abf8bf00db9 --- /dev/null +++ b/c_glib/test/test-fixed-shape-tensor-data-type.rb @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+class TestFixedShapeTensorDataType < Test::Unit::TestCase
+  def setup
+    @data_type = Arrow::FixedShapeTensorDataType.new(Arrow::UInt64DataType.new,
+                                                     [3, 4],
+                                                     [1, 0],
+                                                     ["x", "y"])
+  end
+
+  def test_id
+    assert_equal(Arrow::Type::EXTENSION, @data_type.id)
+  end
+
+  def test_name
+    assert_equal(["extension", "arrow.fixed_shape_tensor"],
+                 [@data_type.name, @data_type.extension_name])
+  end
+
+  def test_n_dimensions
+    assert_equal(2, @data_type.n_dimensions)
+  end
+
+  def test_shape
+    assert_equal([3, 4], @data_type.shape)
+  end
+
+  def test_permutation
+    assert_equal([1, 0], @data_type.permutation)
+  end
+
+  def test_strides
+    assert_equal([8, 32], @data_type.strides)
+  end
+
+  def test_dim_names
+    assert_equal(["x", "y"], @data_type.dim_names)
+  end
+
+  def test_to_s
+    assert do
+      @data_type.to_s.start_with?("extension<arrow.fixed_shape_tensor")
+    end
+  end
+
+  def test_list_size
+    assert_equal(@list_size, @data_type.list_size)
+  end
+
+  def test_field
+    field = Arrow::Field.new("item", @value_type)
+    assert_equal(field, @data_type.field)
+  end
+end
diff --git a/c_glib/test/test-uuid-data-type.rb b/c_glib/test/test-uuid-data-type.rb
new file mode 100644
index 00000000000..74db32c6eb9
--- /dev/null
+++ b/c_glib/test/test-uuid-data-type.rb
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestUUIDDataType < Test::Unit::TestCase + def setup + @data_type = Arrow::UUIDDataType.new + end + + def test_id + assert_equal(Arrow::Type::EXTENSION, @data_type.id) + end + + def test_name + assert_equal(["extension", "arrow.uuid"], + [@data_type.name, @data_type.extension_name]) + end + + def test_to_s + assert_equal("extension", @data_type.to_s) + end +end diff --git a/c_glib/tool/generate-version-header.py b/c_glib/tool/generate-version-header.py index 6a8976204c0..b72e4f284f1 100755 --- a/c_glib/tool/generate-version-header.py +++ b/c_glib/tool/generate-version-header.py @@ -26,32 +26,32 @@ def main(): parser = argparse.ArgumentParser( - description="Generate C header with version macros") + description="Generate C header with version macros") parser.add_argument( - "--library", - required=True, - help="The library name to use in macro prefixes") + "--library", + required=True, + help="The library name to use in macro prefixes") parser.add_argument( - "--version", - required=True, - help="The library version number") + "--version", + required=True, + help="The library version number") parser.add_argument( - "--input", - type=Path, - required=True, - help="Path to the input template file") + "--input", + type=Path, + required=True, + help="Path to the input template file") parser.add_argument( - "--output", - type=Path, - required=True, - help="Path to the output file to generate") + "--output", + type=Path, + required=True, + help="Path to the output file to generate") args = parser.parse_args() with open(args.input, "r", encoding="utf-8") as input_file, \ open(args.output, "w", encoding="utf-8") as output_file: write_header( - input_file, output_file, args.library, args.version) + input_file, output_file, args.library, args.version) def write_header( @@ -70,13 +70,13 @@ def write_header( availability_macros = 
generate_availability_macros(library_name) replacements = { - "VERSION_MAJOR": str(version_major), - "VERSION_MINOR": str(version_minor), - "VERSION_MICRO": str(version_micro), - "VERSION_TAG": version_tag, - "ENCODED_VERSIONS": encoded_versions, - "VISIBILITY_MACROS": visibility_macros, - "AVAILABILITY_MACROS": availability_macros, + "VERSION_MAJOR": str(version_major), + "VERSION_MINOR": str(version_minor), + "VERSION_MICRO": str(version_micro), + "VERSION_TAG": version_tag, + "ENCODED_VERSIONS": encoded_versions, + "VISIBILITY_MACROS": visibility_macros, + "AVAILABILITY_MACROS": availability_macros, } output_file.write(re.sub( @@ -140,34 +140,36 @@ def generate_availability_macros(library: str) -> str: ALL_VERSIONS = [ - (20, 0), - (19, 0), - (18, 0), - (17, 0), - (16, 0), - (15, 0), - (14, 0), - (13, 0), - (12, 0), - (11, 0), - (10, 0), - (9, 0), - (8, 0), - (7, 0), - (6, 0), - (5, 0), - (4, 0), - (3, 0), - (2, 0), - (1, 0), - (0, 17), - (0, 16), - (0, 15), - (0, 14), - (0, 13), - (0, 12), - (0, 11), - (0, 10), + (22, 0), + (21, 0), + (20, 0), + (19, 0), + (18, 0), + (17, 0), + (16, 0), + (15, 0), + (14, 0), + (13, 0), + (12, 0), + (11, 0), + (10, 0), + (9, 0), + (8, 0), + (7, 0), + (6, 0), + (5, 0), + (4, 0), + (3, 0), + (2, 0), + (1, 0), + (0, 17), + (0, 16), + (0, 15), + (0, 14), + (0, 13), + (0, 12), + (0, 11), + (0, 10), ] diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json index b05f7e8d1f2..150f54a1d41 100644 --- a/c_glib/vcpkg.json +++ b/c_glib/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow-glib", - "version-string": "20.0.0-SNAPSHOT", + "version-string": "22.0.0", "$comment:dependencies": "We can enable gobject-introspection again once it's updated", "dependencies": [ "glib", diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat deleted file mode 100644 index eb6a2cb7201..00000000000 --- a/ci/appveyor-cpp-build.bat +++ /dev/null @@ -1,163 +0,0 @@ -@rem Licensed to the Apache Software Foundation (ASF) under one -@rem or more contributor license 
agreements. See the NOTICE file -@rem distributed with this work for additional information -@rem regarding copyright ownership. The ASF licenses this file -@rem to you under the Apache License, Version 2.0 (the -@rem "License"); you may not use this file except in compliance -@rem with the License. You may obtain a copy of the License at -@rem -@rem http://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, -@rem software distributed under the License is distributed on an -@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -@rem KIND, either express or implied. See the License for the -@rem specific language governing permissions and limitations -@rem under the License. - -@echo on - -git config core.symlinks true -git reset --hard - -@rem Retrieve git submodules, configure env var for Parquet unit tests -git submodule update --init || exit /B - -set ARROW_TEST_DATA=%CD%\testing\data -set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data - -@rem Enable memory debug checks if the env is not set already -IF "%ARROW_DEBUG_MEMORY_POOL%"=="" ( - set ARROW_DEBUG_MEMORY_POOL=trap -) - -set CMAKE_BUILD_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% -set CTEST_PARALLEL_LEVEL=%NUMBER_OF_PROCESSORS% - -call activate arrow - -@rem The "main" C++ build script for Windows CI -@rem (i.e. for usual configurations) - -set ARROW_CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA -DARROW_WITH_BZ2=ON - -@rem Enable warnings-as-errors -set ARROW_CXXFLAGS=/WX /MP - -@rem Install GCS testbench -set PIPX_BIN_DIR=C:\Windows\ -call %CD%\ci\scripts\install_gcs_testbench.bat -storage-testbench -h || exit /B - -@rem -@rem Build and test Arrow C++ libraries (including Parquet) -@rem - -mkdir cpp\build -pushd cpp\build - -@rem XXX Without forcing CMAKE_CXX_COMPILER, CMake can re-run itself and -@rem unfortunately switch from Release to Debug mode... 
-@rem -@rem In release mode, disable optimizations (/Od) for faster compiling -@rem and enable runtime assertions. - -cmake -G "%GENERATOR%" %ARROW_CMAKE_ARGS% ^ - -DARROW_ACERO=ON ^ - -DARROW_BOOST_USE_SHARED=ON ^ - -DARROW_BUILD_EXAMPLES=ON ^ - -DARROW_BUILD_STATIC=OFF ^ - -DARROW_BUILD_TESTS=ON ^ - -DARROW_COMPUTE=ON ^ - -DARROW_CSV=ON ^ - -DARROW_CXXFLAGS="%ARROW_CXXFLAGS%" ^ - -DARROW_DATASET=ON ^ - -DARROW_ENABLE_TIMING_TESTS=OFF ^ - -DARROW_FILESYSTEM=ON ^ - -DARROW_FLIGHT=%ARROW_BUILD_FLIGHT% ^ - -DARROW_FLIGHT_SQL=%ARROW_BUILD_FLIGHT_SQL% ^ - -DARROW_GANDIVA=%ARROW_BUILD_GANDIVA% ^ - -DARROW_GCS=%ARROW_GCS% ^ - -DARROW_HDFS=ON ^ - -DARROW_JSON=ON ^ - -DARROW_MIMALLOC=ON ^ - -DARROW_ORC=%ARROW_ORC% ^ - -DARROW_PARQUET=ON ^ - -DARROW_S3=%ARROW_S3% ^ - -DARROW_SUBSTRAIT=ON ^ - -DARROW_VERBOSE_THIRDPARTY_BUILD=OFF ^ - -DARROW_WITH_BROTLI=ON ^ - -DARROW_WITH_LZ4=ON ^ - -DARROW_WITH_SNAPPY=ON ^ - -DARROW_WITH_ZLIB=ON ^ - -DARROW_WITH_ZSTD=ON ^ - -DCMAKE_BUILD_TYPE="Release" ^ - -DCMAKE_CXX_FLAGS_RELEASE="/MD /Od /UNDEBUG" ^ - -DCMAKE_CXX_STANDARD=17 ^ - -DCMAKE_INSTALL_PREFIX=%CONDA_PREFIX%\Library ^ - -DCMAKE_UNITY_BUILD=ON ^ - -DCMAKE_VERBOSE_MAKEFILE=OFF ^ - -DPARQUET_BUILD_EXECUTABLES=ON ^ - -DPARQUET_REQUIRE_ENCRYPTION=ON ^ - .. || exit /B -cmake --build . 
--target install --config Release || exit /B - -@rem For ORC C++ -set TZDIR=%CONDA_PREFIX%\share\zoneinfo - -@rem For finding Python executable for GCS tests -set PYTHON=python - -ctest --output-on-failure || exit /B - -popd - -pushd python - -@rem -@rem Build and install pyarrow -@rem - -set PYARROW_CMAKE_GENERATOR=%GENERATOR% -set PYARROW_CXXFLAGS=%ARROW_CXXFLAGS% -set PYARROW_PARALLEL=2 -set PYARROW_WITH_ACERO=ON -set PYARROW_WITH_DATASET=ON -set PYARROW_WITH_FLIGHT=%ARROW_BUILD_FLIGHT% -set PYARROW_WITH_GANDIVA=%ARROW_BUILD_GANDIVA% -set PYARROW_WITH_GCS=%ARROW_GCS% -set PYARROW_WITH_ORC=%ARROW_ORC% -set PYARROW_WITH_PARQUET=ON -set PYARROW_WITH_PARQUET_ENCRYPTION=ON -set PYARROW_WITH_S3=%ARROW_S3% -set PYARROW_WITH_SUBSTRAIT=ON - -set ARROW_HOME=%CONDA_PREFIX%\Library -@rem ARROW-3075; pkgconfig is broken for Parquet for now -set PARQUET_HOME=%CONDA_PREFIX%\Library - -pip install --no-deps --no-build-isolation -vv --editable . - -@rem -@rem Run pyarrow tests -@rem - -@rem Download IANA Timezone Database to a non-standard location to -@rem test the configurability of the timezone database path -curl https://data.iana.org/time-zones/releases/tzdata2024b.tar.gz --output tzdata.tar.gz || exit /B -mkdir %USERPROFILE%\Downloads\test\tzdata -tar --extract --file tzdata.tar.gz --directory %USERPROFILE%\Downloads\test\tzdata -curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^ - --output %USERPROFILE%\Downloads\test\tzdata\windowsZones.xml || exit /B -@rem Remove the database from the default location -rmdir /s /q %USERPROFILE%\Downloads\tzdata -@rem Set the env var for the non-standard location of the database -@rem (only needed for testing purposes) -set PYARROW_TZDATA_PATH=%USERPROFILE%\Downloads\test\tzdata - -set AWS_EC2_METADATA_DISABLED=true -set PYTHONDEVMODE=1 - -popd - -python -m pytest -r sxX --durations=15 --pyargs pyarrow || exit /B diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat deleted 
file mode 100644 index ff159bd0b4b..00000000000 --- a/ci/appveyor-cpp-setup.bat +++ /dev/null @@ -1,102 +0,0 @@ -@rem Licensed to the Apache Software Foundation (ASF) under one -@rem or more contributor license agreements. See the NOTICE file -@rem distributed with this work for additional information -@rem regarding copyright ownership. The ASF licenses this file -@rem to you under the Apache License, Version 2.0 (the -@rem "License"); you may not use this file except in compliance -@rem with the License. You may obtain a copy of the License at -@rem -@rem http://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, -@rem software distributed under the License is distributed on an -@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -@rem KIND, either express or implied. See the License for the -@rem specific language governing permissions and limitations -@rem under the License. - -@echo on - -@rem -@rem The miniconda install on AppVeyor is very outdated, use Mambaforge instead -@rem - -appveyor DownloadFile https://github.com/conda-forge/miniforge/releases/download/24.9.2-0/Mambaforge-Windows-x86_64.exe || exit /B -start /wait "" Mambaforge-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /D=C:\Mambaforge -set "PATH=C:\Mambaforge\scripts;C:\Mambaforge\condabin;%PATH%" - -@rem -@rem Avoid picking up AppVeyor-installed OpenSSL (linker errors with gRPC) -@rem XXX Perhaps there is a smarter way of solving this issue? 
-@rem -rd /s /q C:\OpenSSL-Win32 -rd /s /q C:\OpenSSL-Win64 -rd /s /q C:\OpenSSL-v11-Win32 -rd /s /q C:\OpenSSL-v11-Win64 -rd /s /q C:\OpenSSL-v111-Win32 -rd /s /q C:\OpenSSL-v111-Win64 -rd /s /q C:\OpenSSL-v30-Win32 -rd /s /q C:\OpenSSL-v30-Win64 - -@rem -@rem Configure conda -@rem -conda config --set auto_update_conda false -conda config --set show_channel_urls true -conda config --set always_yes true -@rem Help with SSL timeouts to S3 -conda config --set remote_connect_timeout_secs 12 - -conda info -a || exit /B - -@rem -@rem Create conda environment -@rem - -set CONDA_PACKAGES= - -if "%ARROW_BUILD_GANDIVA%" == "ON" ( - @rem Install llvmdev in the toolchain if building gandiva.dll - set CONDA_PACKAGES=%CONDA_PACKAGES% --file=ci\conda_env_gandiva_win.txt -) -@rem Install pre-built "toolchain" packages for faster builds -set CONDA_PACKAGES=%CONDA_PACKAGES% --file=ci\conda_env_cpp.txt -@rem Arrow conda environment -conda create -n arrow ^ - --file=ci\conda_env_python.txt ^ - %CONDA_PACKAGES% ^ - "ccache" ^ - "cmake" ^ - "ninja" ^ - "nomkl" ^ - "pandas" ^ - "python=%PYTHON%" ^ - || exit /B -conda list -n arrow - -@rem -@rem Configure compiler -@rem -call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64 -set CC=cl.exe -set CXX=cl.exe - -@rem -@rem Download Minio somewhere on PATH, for unit tests -@rem -if "%ARROW_S3%" == "ON" ( - appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2025-01-20T14-49-07Z -FileName C:\Windows\Minio.exe || exit /B -) - -@rem -@rem Download IANA Timezone Database for unit tests -@rem -@rem (Doc section: Download timezone database) -curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output tzdata.tar.gz -mkdir tzdata -tar --extract --file tzdata.tar.gz --directory tzdata -move tzdata %USERPROFILE%\Downloads\tzdata -@rem Also need Windows timezone mapping -curl 
https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^ - --output %USERPROFILE%\Downloads\tzdata\windowsZones.xml -@rem (Doc section: Download timezone database) diff --git a/ci/conan/all/conandata.yml b/ci/conan/all/conandata.yml index a13b31c2e82..3bd3ec61eb9 100644 --- a/ci/conan/all/conandata.yml +++ b/ci/conan/all/conandata.yml @@ -21,6 +21,12 @@ # SOFTWARE. sources: + "20.0.0": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-20.0.0/apache-arrow-20.0.0.tar.gz?action=download" + sha256: "89efbbf852f5a1f79e9c99ab4c217e2eb7f991837c005cba2d4a2fbd35fad212" + "19.0.1": + url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-19.0.1/apache-arrow-19.0.1.tar.gz?action=download" + sha256: "acb76266e8b0c2fbb7eb15d542fbb462a73b3fd1e32b80fad6c2fafd95a51160" "18.1.0": url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-18.1.0/apache-arrow-18.1.0.tar.gz?action=download" sha256: "2dc8da5f8796afe213ecc5e5aba85bb82d91520eff3cf315784a52d0fa61d7fc" @@ -40,6 +46,17 @@ sources: url: "https://www.apache.org/dyn/closer.lua/arrow/arrow-14.0.2/apache-arrow-14.0.2.tar.gz?action=download" sha256: "1304dedb41896008b89fe0738c71a95d9b81752efc77fa70f264cb1da15d9bc2" patches: + "20.0.0": + - patch_file: "patches/20.0.0-0001-fix-downloaded-mimalloc.patch" + patch_description: "use cci package" + patch_type: "conan" + "19.0.1": + - patch_file: "patches/19.0.1-0001-fix-cmake.patch" + patch_description: "use cci package" + patch_type: "conan" + - patch_file: "patches/19.0.1-0002-fix-downloaded-mimalloc.patch" + patch_description: "use cci package" + patch_type: "conan" "18.1.0": - patch_file: "patches/18.0.0-0001-fix-cmake.patch" patch_description: "use cci package" @@ -64,4 +81,4 @@ patches: - patch_file: "patches/11.0.0-0001-fix-cmake.patch" patch_description: "use cci package" patch_type: "conan" - \ No newline at end of file + diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py index 5db9fe35672..7dab8c82f69 100644 --- 
a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -20,6 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import os + from conan import ConanFile from conan.errors import ConanInvalidConfiguration, ConanException from conan.tools.build import check_min_cppstd, cross_building @@ -28,11 +30,9 @@ from conan.tools.microsoft import is_msvc, is_msvc_static_runtime from conan.tools.scm import Version -import os -import glob - required_conan_version = ">=2.1.0" + class ArrowConan(ConanFile): name = "arrow" description = "Apache Arrow is a cross-language development platform for in-memory data" @@ -102,7 +102,7 @@ class ArrowConan(ConanFile): "dataset_modules": False, "deprecated": True, "encryption": False, - "filesystem_layer": False, + "filesystem_layer": True, "hdfs_bridgs": False, "plasma": "deprecated", "simd_level": "default", @@ -142,7 +142,7 @@ class ArrowConan(ConanFile): def _min_cppstd(self): # arrow >= 10.0.0 requires C++17. 
# https://github.com/apache/arrow/pull/13991 - return "11" if Version(self.version) < "10.0.0" else "17" + return "17" def export_sources(self): export_conandata_patches(self) @@ -151,10 +151,10 @@ def export_sources(self): def config_options(self): if self.settings.os == "Windows": del self.options.fPIC - if Version(self.version) < "8.0.0": - del self.options.substrait if is_msvc(self): self.options.with_boost = True + if Version(self.version) >= "19.0.0": + self.options.with_mimalloc = True def configure(self): if self.options.shared: @@ -209,9 +209,6 @@ def requirements(self): self.requires("snappy/1.1.9") if self.options.get_safe("simd_level") != None or \ self.options.get_safe("runtime_simd_level") != None: - if Version(self.version) < 8: - self.requires("xsimd/9.0.1") - else: self.requires("xsimd/13.0.0") if self.options.with_zlib: self.requires("zlib/[>=1.2.11 <2]") @@ -242,6 +239,15 @@ def validate(self): raise ConanException("'with_boost' option should be True when 'gandiva=True'") if not self.options.with_utf8proc: raise ConanException("'with_utf8proc' option should be True when 'gandiva=True'") + if self.options.with_orc: + if not self.options.with_lz4: + raise ConanException("'with_lz4' option should be True when 'orc=True'") + if not self.options.with_snappy: + raise ConanException("'with_snappy' option should be True when 'orc=True'") + if not self.options.with_zlib: + raise ConanException("'with_zlib' option should be True when 'orc=True'") + if not self.options.with_zstd: + raise ConanException("'with_zstd' option should be True when 'orc=True'") if self.options.with_thrift and not self.options.with_boost: raise ConanException("'with_boost' option should be True when 'thrift=True'") if self.options.parquet: @@ -250,17 +256,7 @@ def validate(self): if self.options.with_flight_rpc and not self.options.with_protobuf: raise ConanException("'with_protobuf' option should be True when 'with_flight_rpc=True'") - if self.settings.compiler.get_safe("cppstd"): 
- check_min_cppstd(self, self._min_cppstd) - - if ( - Version(self.version) < "10.0.0" - and self.settings.compiler == "clang" - and Version(self.settings.compiler.version) < "3.9" - ): - raise ConanInvalidConfiguration( - f"{self.ref} requires C++11, which needs at least clang-3.9" - ) + check_min_cppstd(self, self._min_cppstd) if self.options.get_safe("skyhook", False): raise ConanInvalidConfiguration("CCI has no librados recipe (yet)") @@ -280,7 +276,9 @@ def validate(self): raise ConanInvalidConfiguration("arrow:parquet requires arrow:with_thrift") def build_requirements(self): - if Version(self.version) >= "13.0.0": + if Version(self.version) >= "20.0.0": + self.tool_requires("cmake/[>=3.25 <4]") + else: self.tool_requires("cmake/[>=3.16 <4]") def source(self): @@ -306,6 +304,7 @@ def source(self): # END get(self, **self.conan_data["sources"][self.version], filename=f"apache-arrow-{self.version}.tar.gz", strip_root=True) + self._patch_sources() def generate(self): tc = CMakeToolchain(self) @@ -386,6 +385,7 @@ def generate(self): if self.options.with_zstd: tc.variables["ARROW_ZSTD_USE_SHARED"] = bool(self.dependencies["zstd"].options.shared) tc.variables["ORC_SOURCE"] = "SYSTEM" + tc.variables["ARROW_ORC"] = bool(self.options.with_orc) tc.variables["ARROW_WITH_THRIFT"] = bool(self.options.with_thrift) tc.variables["ARROW_THRIFT"] = bool(self.options.with_thrift) tc.variables["Thrift_SOURCE"] = "SYSTEM" @@ -425,31 +425,13 @@ def generate(self): tc.generate() deps = CMakeDeps(self) + deps.set_property("mimalloc", "cmake_target_name", "mimalloc::mimalloc") deps.generate() def _patch_sources(self): apply_conandata_patches(self) - if Version(self.version) < "10.0.0": - for filename in glob.glob(os.path.join(self.source_folder, "cpp", "cmake_modules", "Find*.cmake")): - if os.path.basename(filename) not in [ - "FindArrow.cmake", - "FindArrowAcero.cmake", - "FindArrowCUDA.cmake", - "FindArrowDataset.cmake", - "FindArrowFlight.cmake", - "FindArrowFlightSql.cmake", - 
"FindArrowFlightTesting.cmake", - "FindArrowPython.cmake", - "FindArrowPythonFlight.cmake", - "FindArrowSubstrait.cmake", - "FindArrowTesting.cmake", - "FindGandiva.cmake", - "FindParquet.cmake", - ]: - os.remove(filename) def build(self): - self._patch_sources() cmake = CMake(self) cmake.configure(build_script_folder=os.path.join(self.source_folder, "cpp")) cmake.build() @@ -464,29 +446,6 @@ def package(self): rmdir(self, os.path.join(self.package_folder, "lib", "pkgconfig")) rmdir(self, os.path.join(self.package_folder, "share")) - cmake_suffix = "shared" if self.options.shared else "static" - - alias_map = { f"Arrow::arrow_{cmake_suffix}": f"arrow::arrow_{cmake_suffix}" } - - if self.options.parquet: - alias_map[f"Parquet::parquet_{cmake_suffix}"] = f"arrow::parquet_{cmake_suffix}" - - if self.options.get_safe("substrait"): - alias_map[f"Arrow::arrow_substrait_{cmake_suffix}"] = f"arrow::arrow_substrait_{cmake_suffix}" - - if self.options.acero: - alias_map[f"Arrow::arrow_acero_{cmake_suffix}"] = f"arrow::arrow_acero_{cmake_suffix}" - - if self.options.gandiva: - alias_map[f"Gandiva::gandiva_{cmake_suffix}"] = f"arrow::gandiva_{cmake_suffix}" - - if self.options.with_flight_rpc: - alias_map[f"ArrowFlight::arrow_flight_sql_{cmake_suffix}"] = f"arrow::arrow_flight_sql_{cmake_suffix}" - - @property - def _module_subfolder(self): - return os.path.join("lib", "cmake") - def package_info(self): # FIXME: fix CMake targets of components @@ -556,6 +515,8 @@ def package_info(self): self.cpp_info.components["dataset"].libs = ["arrow_dataset"] if self.options.parquet: self.cpp_info.components["dataset"].requires = ["libparquet"] + if self.options.acero and Version(self.version) >= "19.0.0": + self.cpp_info.components["dataset"].requires = ["libacero"] if self.options.cli and (self.options.with_cuda or self.options.with_flight_rpc or self.options.parquet): binpath = os.path.join(self.package_folder, "bin") diff --git a/ci/conan/all/patches/19.0.1-0001-fix-cmake.patch 
b/ci/conan/all/patches/19.0.1-0001-fix-cmake.patch new file mode 100644 index 00000000000..0d37465a0eb --- /dev/null +++ b/ci/conan/all/patches/19.0.1-0001-fix-cmake.patch @@ -0,0 +1,79 @@ +MIT License + +Copyright (c) 2025 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +diff --git a/cpp/cmake_modules/FindThriftAlt.cmake b/cpp/cmake_modules/FindThriftAlt.cmake +index 98a706d..edf195e 100644 +--- a/cpp/cmake_modules/FindThriftAlt.cmake ++++ b/cpp/cmake_modules/FindThriftAlt.cmake +@@ -45,22 +45,20 @@ endif() + # * https://github.com/apache/thrift/pull/2725 + # * https://github.com/apache/thrift/pull/2726 + # * https://github.com/conda-forge/thrift-cpp-feedstock/issues/68 +-if(NOT WIN32) +- set(find_package_args "") +- if(ThriftAlt_FIND_VERSION) +- list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) +- endif() +- if(ThriftAlt_FIND_QUIETLY) +- list(APPEND find_package_args QUIET) +- endif() +- find_package(Thrift ${find_package_args}) +- if(Thrift_FOUND) +- set(ThriftAlt_FOUND TRUE) +- add_executable(thrift::compiler IMPORTED) +- set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION +- "${THRIFT_COMPILER}") +- return() +- endif() ++set(find_package_args "") ++if(ThriftAlt_FIND_VERSION) ++ list(APPEND find_package_args ${ThriftAlt_FIND_VERSION}) ++endif() ++if(ThriftAlt_FIND_QUIETLY) ++ list(APPEND find_package_args QUIET) ++endif() ++find_package(Thrift ${find_package_args}) ++if(Thrift_FOUND) ++ set(ThriftAlt_FOUND TRUE) ++ add_executable(thrift::compiler IMPORTED) ++ set_target_properties(thrift::compiler PROPERTIES IMPORTED_LOCATION ++ "${THRIFT_COMPILER}") ++ return() + endif() + + function(extract_thrift_version) +diff --git a/cpp/src/parquet/size_statistics.cc b/cpp/src/parquet/size_statistics.cc +index 1ce6c937a..e45eef3f0 100644 +--- a/cpp/src/parquet/size_statistics.cc ++++ b/cpp/src/parquet/size_statistics.cc +@@ -18,9 +18,11 @@ + #include "parquet/size_statistics.h" + + #include ++#include + #include + #include + #include ++#include + + #include "arrow/util/logging.h" + #include "parquet/exception.h" diff --git a/ci/conan/all/patches/19.0.1-0002-fix-downloaded-mimalloc.patch b/ci/conan/all/patches/19.0.1-0002-fix-downloaded-mimalloc.patch new file mode 100644 index 00000000000..d49f14dcc8d --- 
/dev/null +++ b/ci/conan/all/patches/19.0.1-0002-fix-downloaded-mimalloc.patch @@ -0,0 +1,37 @@ +MIT License + +Copyright (c) 2025 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake +index abfe6d2..cc0f3c5 100644 +--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake +@@ -2259,6 +2259,10 @@ endif() + # mimalloc - Cross-platform high-performance allocator, from Microsoft + + if(ARROW_MIMALLOC) ++ find_package(mimalloc REQUIRED CONFIG) ++endif() ++ ++if(0) + if(NOT ARROW_ENABLE_THREADING) + message(FATAL_ERROR "Can't use mimalloc with ARROW_ENABLE_THREADING=OFF") + endif() diff --git a/ci/conan/all/patches/20.0.0-0001-fix-downloaded-mimalloc.patch b/ci/conan/all/patches/20.0.0-0001-fix-downloaded-mimalloc.patch new file mode 100644 index 00000000000..44b1691bdb3 --- /dev/null +++ b/ci/conan/all/patches/20.0.0-0001-fix-downloaded-mimalloc.patch @@ -0,0 +1,37 @@ +MIT License + +Copyright (c) 2019 Conan.io + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake +index 63bdd4ab76..9744f01b1e 100644 +--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake ++++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake +@@ -2268,1 +2268,1 @@ endif() + # mimalloc - Cross-platform high-performance allocator, from Microsoft + + if(ARROW_MIMALLOC) ++ find_package(mimalloc REQUIRED CONFIG) ++endif() ++ ++if(0) + if(NOT ARROW_ENABLE_THREADING) + message(FATAL_ERROR "Can't use mimalloc with ARROW_ENABLE_THREADING=OFF") + endif() diff --git a/ci/conan/all/test_package/CMakeLists.txt b/ci/conan/all/test_package/CMakeLists.txt index b25c8e889cb..d85120a6626 100644 --- a/ci/conan/all/test_package/CMakeLists.txt +++ b/ci/conan/all/test_package/CMakeLists.txt @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -cmake_minimum_required(VERSION 3.8) +cmake_minimum_required(VERSION 3.15) project(test_package LANGUAGES CXX) find_package(Arrow REQUIRED CONFIG) diff --git a/ci/conan/config.yml b/ci/conan/config.yml index cbb2fce0547..02100ba9d4c 100644 --- a/ci/conan/config.yml +++ b/ci/conan/config.yml @@ -21,6 +21,10 @@ # SOFTWARE. versions: + "20.0.0": + folder: all + "19.0.1": + folder: all "18.1.0": folder: all "18.0.0": diff --git a/ci/conan/merge_status.sh b/ci/conan/merge_status.sh index 600385c0e17..295f6a37087 100644 --- a/ci/conan/merge_status.sh +++ b/ci/conan/merge_status.sh @@ -15,4 +15,4 @@ # specific language governing permissions and limitations # under the License. 
-UPSTREAM_REVISION=a9b270f9d2052e193ce3c0a6c4e2fda0b0ac5ade +UPSTREAM_REVISION=2cf8d725f6387f65be58a13435896328b36a14b9 diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index 731b49fa462..6e23e920a40 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -31,8 +31,8 @@ gflags glog gmock>=1.10.0 google-cloud-cpp>=1.34.0 -grpc-cpp<=1.50.1 gtest>=1.10.0 +libgrpc libprotobuf libutf8proc lz4-c @@ -40,7 +40,7 @@ make meson ninja nodejs -orc +orc<2.1.0 pkg-config python rapidjson diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 9a48f26b79c..4e3fd9f2de7 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -20,7 +20,7 @@ # Not a direct dependency of s3fs, but needed for our s3fs fixture boto3 cffi -cython>=3 +cython>=3.1 cloudpickle fsspec hypothesis diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 840577fdd97..565d147bf77 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -17,6 +17,7 @@ # Requirements for building the documentation breathe +cython>3.1.1 doxygen ipython linkify-it-py @@ -25,13 +26,13 @@ linkify-it-py # linuxdoc myst-parser numpydoc -pydata-sphinx-theme=0.14 +pydata-sphinx-theme=0.16 sphinx-autobuild sphinx-design sphinx-copybutton sphinx-lint sphinxcontrib-jquery sphinxcontrib-mermaid -sphinx==6.2 +sphinx pytest-cython pandas diff --git a/ci/conda_env_unix.txt b/ci/conda_env_unix.txt index 1973238adff..4728068c4e8 100644 --- a/ci/conda_env_unix.txt +++ b/ci/conda_env_unix.txt @@ -20,4 +20,5 @@ autoconf ccache orc +patch pkg-config diff --git a/ci/docker/alpine-linux-3.18-cpp.dockerfile b/ci/docker/alpine-linux-3.18-cpp.dockerfile deleted file mode 100644 index 60815b7e55d..00000000000 --- a/ci/docker/alpine-linux-3.18-cpp.dockerfile +++ /dev/null @@ -1,104 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG arch=amd64 -FROM ${arch}/alpine:3.18 - -RUN apk add \ - apache-orc-dev \ - bash \ - benchmark-dev \ - boost-dev \ - brotli-dev \ - bzip2-dev \ - c-ares-dev \ - ccache \ - clang \ - cmake \ - curl-dev \ - g++ \ - gcc \ - gdb \ - gflags-dev \ - git \ - glog-dev \ - gmock \ - grpc-dev \ - gtest-dev \ - libxml2-dev \ - llvm16-dev \ - llvm16-static \ - lz4-dev \ - make \ - musl-locales \ - nlohmann-json \ - openssl-dev \ - perl \ - pkgconfig \ - protobuf-dev \ - py3-pip \ - py3-numpy-dev \ - python3-dev \ - rapidjson-dev \ - re2-dev \ - rsync \ - samurai \ - snappy-dev \ - sqlite-dev \ - thrift-dev \ - tzdata \ - utf8proc-dev \ - xsimd-dev \ - zlib-dev \ - zstd-dev && \ - rm -rf /var/cache/apk/* && \ - ln -s /usr/share/zoneinfo/Etc/UTC /etc/localtime && \ - echo "Etc/UTC" > /etc/timezone - -COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh latest /usr/local - -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - -ENV ARROW_ACERO=ON \ - ARROW_AZURE=OFF \ - ARROW_BUILD_TESTS=ON \ - ARROW_DATASET=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_FLIGHT=ON \ - ARROW_FLIGHT_SQL=ON \ - ARROW_GANDIVA=ON \ - ARROW_GCS=ON \ - ARROW_HOME=/usr/local \ - ARROW_ORC=ON \ - ARROW_PARQUET=ON \ - 
ARROW_S3=ON \ - ARROW_SUBSTRAIT=ON \ - ARROW_USE_CCACHE=ON \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_OPENTELEMETRY=OFF \ - ARROW_WITH_MUSL=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - AWSSDK_SOURCE=BUNDLED \ - google_cloud_cpp_storage_SOURCE=BUNDLED \ - MUSL_LOCPATH=/usr/share/i18n/locales/musl \ - PATH=/usr/lib/ccache/bin:$PATH diff --git a/ci/docker/alpine-linux-3.22-cpp.dockerfile b/ci/docker/alpine-linux-3.22-cpp.dockerfile new file mode 100644 index 00000000000..48907e61a4a --- /dev/null +++ b/ci/docker/alpine-linux-3.22-cpp.dockerfile @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG arch=amd64 +FROM ${arch}/alpine:3.22 + +RUN apk add \ + apache-orc-dev \ + bash \ + benchmark-dev \ + boost-dev \ + brotli-dev \ + bzip2-dev \ + c-ares-dev \ + ccache \ + clang \ + cmake \ + curl-dev \ + g++ \ + gcc \ + gdb \ + gflags-dev \ + git \ + glog-dev \ + gmock \ + grpc-dev \ + gtest-dev \ + libxml2-dev \ + llvm-dev \ + llvm-gtest \ + llvm-static \ + lz4-dev \ + make \ + musl-locales \ + nlohmann-json \ + openssl-dev \ + patch \ + perl \ + pkgconfig \ + protobuf-dev \ + py3-pip \ + py3-numpy-dev \ + python3-dev \ + rapidjson-dev \ + re2-dev \ + rsync \ + samurai \ + snappy-dev \ + sqlite-dev \ + thrift-dev \ + tzdata \ + utf8proc-dev \ + xsimd-dev \ + zlib-dev \ + zstd-dev && \ + rm -rf /var/cache/apk/* && \ + ln -s /usr/share/zoneinfo/Etc/UTC /etc/localtime && \ + echo "Etc/UTC" > /etc/timezone + +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + +ENV ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ + ARROW_BUILD_TESTS=ON \ + ARROW_DATASET=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ + ARROW_FLIGHT=ON \ + ARROW_FLIGHT_SQL=ON \ + ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ + ARROW_HOME=/usr/local \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ + ARROW_USE_CCACHE=ON \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_OPENTELEMETRY=OFF \ + ARROW_WITH_MUSL=ON \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ + MUSL_LOCPATH=/usr/share/i18n/locales/musl \ + PATH=/usr/lib/ccache/bin:$PATH diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index a485fd8836c..a387fb26699 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -27,6 +27,7 @@ RUN /arrow/ci/scripts/install_minio.sh latest /opt/conda 
ARG python=3.10 # install the required conda packages into the test environment +# use `mold` to work around issues with GNU `ld` (GH-47015). COPY ci/conda_env_cpp.txt \ ci/conda_env_gandiva.txt \ /arrow/ci/ @@ -36,9 +37,10 @@ RUN mamba install -q -y \ compilers \ doxygen \ libnuma \ + mold \ python=${python} \ valgrind && \ - mamba clean --all + mamba clean --all --yes # We want to install the GCS testbench using the Conda base environment's Python, # because the test environment's Python may later change. @@ -73,6 +75,7 @@ ENV ARROW_ACERO=ON \ ARROW_S3_MODULE=ON \ ARROW_SUBSTRAIT=ON \ ARROW_USE_CCACHE=ON \ + ARROW_USE_MOLD=ON \ ARROW_WITH_BROTLI=ON \ ARROW_WITH_BZ2=ON \ ARROW_WITH_LZ4=ON \ diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index 56618532981..651bf984853 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -20,10 +20,12 @@ ARG arch=amd64 FROM ${repo}:${arch}-conda-cpp ARG arch=amd64 +# We need to synchronize the following values with the values in .env +# and services.conda-integration in docker-compose.yml. 
ARG maven=3.8.7 -ARG node=16 +ARG node=20 ARG yarn=1.22 -ARG jdk=11 +ARG jdk=17 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ @@ -35,37 +37,19 @@ RUN mamba install -q -y \ "python < 3.12" \ numpy \ compilers \ + go \ maven=${maven} \ nodejs=${node} \ yarn=${yarn} \ openjdk=${jdk} \ zstd && \ - mamba clean --all --force-pkgs-dirs + mamba clean --yes --all --force-pkgs-dirs # Install Rust with only the needed components # (rustfmt is needed for tonic-build to compile the protobuf definitions) RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --profile=minimal -y && \ $HOME/.cargo/bin/rustup component add rustfmt -ENV GOROOT=/opt/go \ - GOBIN=/opt/go/bin \ - GOPATH=/go \ - PATH=/opt/go/bin:$PATH -# Use always latest go -RUN wget -nv -O - https://dl.google.com/go/go$( \ - curl \ - --fail \ - --location \ - --show-error \ - --silent \ - https://api.github.com/repos/golang/go/git/matching-refs/tags/go | \ - grep -o '"ref": "refs/tags/go.*"' | \ - tail -n 1 | \ - sed \ - -e 's,^"ref": "refs/tags/go,,g' \ - -e 's/"$//g' \ - ).linux-${arch}.tar.gz | tar -xzf - -C /opt - ENV DOTNET_ROOT=/opt/dotnet \ PATH=/opt/dotnet:$PATH RUN curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Channel 8.0 -InstallDir /opt/dotnet diff --git a/ci/docker/conda-python-cpython-debug.dockerfile b/ci/docker/conda-python-cpython-debug.dockerfile index 36ba7865a88..12717f35fe8 100644 --- a/ci/docker/conda-python-cpython-debug.dockerfile +++ b/ci/docker/conda-python-cpython-debug.dockerfile @@ -17,12 +17,12 @@ ARG repo ARG arch -ARG python=3.9 +ARG python=3.10 FROM ${repo}:${arch}-conda-python-${python} # (Docker oddity: ARG needs to be repeated after FROM) -ARG python=3.9 +ARG python=3.10 RUN mamba install -y "conda-forge/label/python_debug::python=${python}[build=*_cpython]" && \ - mamba clean --all + mamba clean --all --yes # Quick check that we do have a debug mode CPython RUN python -c "import sys; 
sys.gettotalrefcount()" diff --git a/ci/docker/conda-python-dask.dockerfile b/ci/docker/conda-python-dask.dockerfile index 2c063b2e643..5317a64e01c 100644 --- a/ci/docker/conda-python-dask.dockerfile +++ b/ci/docker/conda-python-dask.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch=amd64 -ARG python=3.9 +ARG python=3.10 FROM ${repo}:${arch}-conda-python-${python} ARG dask=latest diff --git a/ci/docker/conda-python-hdfs.dockerfile b/ci/docker/conda-python-hdfs.dockerfile index 4cf35f4b37a..25d98e21053 100644 --- a/ci/docker/conda-python-hdfs.dockerfile +++ b/ci/docker/conda-python-hdfs.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch=amd64 -ARG python=3.9 +ARG python=3.10 FROM ${repo}:${arch}-conda-python-${python} ARG jdk=11 @@ -26,7 +26,7 @@ RUN mamba install -q -y \ maven=${maven} \ openjdk=${jdk} \ pandas && \ - mamba clean --all + mamba clean --all --yes # installing libhdfs (JNI) ARG hdfs=3.2.1 diff --git a/ci/docker/conda-python-jpype.dockerfile b/ci/docker/conda-python-jpype.dockerfile index c28400f0262..60be53d5bfb 100644 --- a/ci/docker/conda-python-jpype.dockerfile +++ b/ci/docker/conda-python-jpype.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch=amd64 -ARG python=3.9 +ARG python=3.10 FROM ${repo}:${arch}-conda-python-${python} ARG jdk=11 @@ -26,4 +26,4 @@ RUN mamba install -q -y \ maven=${maven} \ openjdk=${jdk} \ jpype1 && \ - mamba clean --all + mamba clean --all --yes diff --git a/ci/docker/conda-python-pandas.dockerfile b/ci/docker/conda-python-pandas.dockerfile index 4a52ffa8e12..5a8ec7c6a8f 100644 --- a/ci/docker/conda-python-pandas.dockerfile +++ b/ci/docker/conda-python-pandas.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch=amd64 -ARG python=3.9 +ARG python=3.10 FROM ${repo}:${arch}-conda-python-${python} ARG pandas=latest @@ -29,7 +29,7 @@ COPY ci/conda_env_sphinx.txt /arrow/ci/ RUN mamba install -q -y --file arrow/ci/conda_env_sphinx.txt && \ # We can't install linuxdoc by mamba. We install linuxdoc by pip here. 
pip install linuxdoc && \ - mamba clean --all + mamba clean --all --yes COPY ci/scripts/install_pandas.sh /arrow/ci/scripts/ RUN mamba uninstall -q -y numpy && \ diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index a8e8250797f..0c64c88bc59 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -17,7 +17,7 @@ ARG repo ARG arch=amd64 -ARG python=3.9 +ARG python=3.10 FROM ${repo}:${arch}-conda-python-${python} ARG jdk=11 @@ -30,7 +30,7 @@ RUN mamba install -q -y \ openjdk=${jdk} \ maven=${maven} \ pandas && \ - mamba clean --all && \ + mamba clean --all --yes && \ mamba uninstall -q -y numpy && \ /arrow/ci/scripts/install_numpy.sh ${numpy} diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index c08b69e6ef8..3127ee6edd0 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -20,7 +20,7 @@ ARG arch FROM ${repo}:${arch}-conda-cpp # install python specific packages -ARG python=3.9 +ARG python=3.10 COPY ci/conda_env_python.txt \ /arrow/ci/ # If the Python version being tested is the same as the Python used by the system gdb, @@ -30,7 +30,7 @@ RUN mamba install -q -y \ $([ "$python" == $(gdb --batch --eval-command 'python import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') ] && echo "gdb") \ "python=${python}.*=*_cp*" \ nomkl && \ - mamba clean --all + mamba clean --all --yes ENV ARROW_ACERO=ON \ ARROW_BUILD_STATIC=OFF \ diff --git a/ci/docker/conda.dockerfile b/ci/docker/conda.dockerfile index f97be554982..4b497b47036 100644 --- a/ci/docker/conda.dockerfile +++ b/ci/docker/conda.dockerfile @@ -38,7 +38,7 @@ ENV PATH=/opt/conda/bin:$PATH # create a conda environment ADD ci/conda_env_unix.txt /arrow/ci/ RUN mamba create -n arrow --file arrow/ci/conda_env_unix.txt git && \ - mamba clean --all + mamba clean --all --yes # activate the created environment by default RUN echo "conda activate arrow" 
>> ~/.profile diff --git a/ci/docker/cpp-jni.dockerfile b/ci/docker/cpp-jni.dockerfile new file mode 100644 index 00000000000..b21ec762d67 --- /dev/null +++ b/ci/docker/cpp-jni.dockerfile @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG base +FROM ${base} + +ARG arch +ARG arch_short + +SHELL ["/bin/bash", "-i", "-c"] +ENTRYPOINT ["/bin/bash", "-i", "-c"] + +# Install basic dependencies +RUN dnf install -y \ + autoconf \ + curl \ + flex \ + gdb \ + git \ + perl-IPC-Cmd \ + wget \ + zip + +# A system Python is required for Ninja and vcpkg in this Dockerfile. +# On manylinux_2_28 base images, no system Python is installed. +# We therefore override the PATH with Python 3.10 in /opt/python +# so that we have a consistent Python version across base images. 
+ENV CPYTHON_VERSION=cp310 +ENV PATH=/opt/python/${CPYTHON_VERSION}-${CPYTHON_VERSION}/bin:${PATH} + +# Install CMake +ARG cmake=3.29.2 +COPY ci/scripts/install_cmake.sh arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_cmake.sh ${cmake} /usr/local + +# Install Ninja +ARG ninja=1.10.2 +COPY ci/scripts/install_ninja.sh arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_ninja.sh ${ninja} /usr/local + +# Install ccache +ARG ccache=4.1 +COPY ci/scripts/install_ccache.sh arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_ccache.sh ${ccache} /usr/local + +# Install vcpkg +ARG vcpkg +COPY ci/vcpkg/*.patch \ + ci/vcpkg/*linux*.cmake \ + ci/vcpkg/vcpkg.json \ + arrow/ci/vcpkg/ +COPY ci/scripts/install_vcpkg.sh \ + arrow/ci/scripts/ +ENV VCPKG_ROOT=/opt/vcpkg +ARG build_type=release +ENV CMAKE_BUILD_TYPE=${build_type} \ + PATH="${PATH}:${VCPKG_ROOT}" \ + VCPKG_DEFAULT_TRIPLET=${arch_short}-linux-static-${build_type} \ + VCPKG_FEATURE_FLAGS="manifests" \ + VCPKG_FORCE_SYSTEM_BINARIES=1 \ + VCPKG_OVERLAY_TRIPLETS=/arrow/ci/vcpkg +# For --mount=type=secret: The GITHUB_TOKEN is the only real secret but we use +# --mount=type=secret for GITHUB_REPOSITORY_OWNER and +# VCPKG_BINARY_SOURCES too because we don't want to store them +# into the built image in order to easily reuse the built image cache. 
+# +# For vcpkg install: cannot use the S3 feature here because while +# aws-sdk-cpp=1.9.160 contains ssl related fixes as well as we can +# patch the vcpkg portfile to support arm machines it hits ARROW-15141 +# where we would need to fall back to 1.8.186 but we cannot patch +# those portfiles since vcpkg-tool handles the checkout of previous +# versions => use bundled S3 build +RUN --mount=type=secret,id=github_repository_owner \ + --mount=type=secret,id=github_token \ + --mount=type=secret,id=vcpkg_binary_sources \ + export GITHUB_REPOSITORY_OWNER=$(cat /run/secrets/github_repository_owner); \ + export GITHUB_TOKEN=$(cat /run/secrets/github_token); \ + export VCPKG_BINARY_SOURCES=$(cat /run/secrets/vcpkg_binary_sources); \ + arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg} && \ + vcpkg install \ + --clean-after-build \ + --x-install-root=${VCPKG_ROOT}/installed \ + --x-manifest-root=/arrow/ci/vcpkg \ + --x-feature=azure \ + --x-feature=dev \ + --x-feature=flight \ + --x-feature=gandiva \ + --x-feature=gcs \ + --x-feature=json \ + --x-feature=orc \ + --x-feature=parquet \ + --x-feature=s3 && \ + rm -rf ~/.config/NuGet/ + +ENV ARROW_BUILD_TESTS=ON \ + ARROW_CMAKE_ARGS="-DARROW_BUILD_TESTS=ON" \ + CMAKE_PRESET=ninja-${CMAKE_BUILD_TYPE}-jni-linux diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index 15716151fce..44c845bb17e 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -85,6 +85,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler-grpc \ python3-dev \ diff --git a/ci/docker/debian-12-js.dockerfile b/ci/docker/debian-12-js.dockerfile deleted file mode 100644 index e0935676d93..00000000000 --- a/ci/docker/debian-12-js.dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG arch=amd64 -ARG node=18 -FROM ${arch}/node:${node} - -ENV NODE_NO_WARNINGS=1 - -# install rsync for copying the generated documentation -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends rsync && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# TODO(kszucs): -# 1. add the files required to install the dependencies to .dockerignore -# 2. copy these files to their appropriate path -# 3. download and compile the dependencies diff --git a/ci/docker/debian-13-cpp.dockerfile b/ci/docker/debian-13-cpp.dockerfile new file mode 100644 index 00000000000..ca96b4177ff --- /dev/null +++ b/ci/docker/debian-13-cpp.dockerfile @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG arch=amd64 +FROM ${arch}/debian:13 +ARG arch + +ENV DEBIAN_FRONTEND noninteractive + +ARG llvm +RUN apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + lsb-release \ + wget && \ + if [ ${llvm} -ge 20 ]; then \ + wget -O /usr/share/keyrings/llvm-snapshot.asc \ + https://apt.llvm.org/llvm-snapshot.gpg.key && \ + (echo "Types: deb"; \ + echo "URIs: https://apt.llvm.org/$(lsb_release --codename --short)/"; \ + echo "Suites: llvm-toolchain-$(lsb_release --codename --short)-${llvm}"; \ + echo "Components: main"; \ + echo "Signed-By: /usr/share/keyrings/llvm-snapshot.asc") | \ + tee /etc/apt/sources.list.d/llvm.sources; \ + fi && \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + autoconf \ + ccache \ + clang-${llvm} \ + cmake \ + curl \ + g++ \ + gcc \ + gdb \ + git \ + libbenchmark-dev \ + libboost-filesystem-dev \ + libboost-system-dev \ + libbrotli-dev \ + libbz2-dev \ + libc-ares-dev \ + libcurl4-openssl-dev \ + libgflags-dev \ + libgmock-dev \ + libgoogle-glog-dev \ + libgrpc++-dev \ + libidn2-dev \ + libkrb5-dev \ + libldap-dev \ + liblz4-dev \ + libnghttp2-dev \ + libprotobuf-dev \ + libprotoc-dev \ + libpsl-dev \ + libre2-dev \ + librtmp-dev \ + libsnappy-dev \ + libsqlite3-dev \ + libssh-dev \ + libssh2-1-dev \ + libssl-dev \ + libthrift-dev \ + libutf8proc-dev \ + libxml2-dev \ + libxsimd-dev \ + libzstd-dev \ + llvm-${llvm}-dev \ + make \ + ninja-build \ + nlohmann-json3-dev \ + npm \ + opentelemetry-cpp-dev \ + patch \ + pkg-config \ + 
protobuf-compiler-grpc \ + python3-dev \ + python3-pip \ + python3-venv \ + rapidjson-dev \ + rsync \ + tzdata \ + zlib1g-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + +COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_azurite.sh + +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + +# Prioritize system packages and local installation. +ENV ARROW_ACERO=ON \ + ARROW_AZURE=ON \ + ARROW_BUILD_TESTS=ON \ + ARROW_DATASET=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ + ARROW_DATASET=ON \ + ARROW_FLIGHT=ON \ + ARROW_FLIGHT_SQL=ON \ + ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ + ARROW_HOME=/usr/local \ + ARROW_JEMALLOC=ON \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ + ARROW_USE_CCACHE=ON \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_OPENTELEMETRY=ON \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ + Azure_SOURCE=BUNDLED \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ + ORC_SOURCE=BUNDLED \ + PATH=/usr/lib/ccache/:$PATH \ + PYTHON=python3 diff --git a/ci/docker/debian-experimental-cpp.dockerfile b/ci/docker/debian-experimental-cpp.dockerfile index 2721b1d5f20..743f5ddd3be 100644 --- a/ci/docker/debian-experimental-cpp.dockerfile +++ b/ci/docker/debian-experimental-cpp.dockerfile @@ -79,6 +79,7 @@ RUN if [ -n "${gcc}" ]; then \ nlohmann-json3-dev \ npm \ opentelemetry-cpp-dev \ + patch \ pkg-config \ protobuf-compiler-grpc \ python3-dev \ diff --git a/ci/docker/fedora-39-cpp.dockerfile b/ci/docker/fedora-39-cpp.dockerfile deleted file mode 100644 index 6c5edd444e2..00000000000 --- 
a/ci/docker/fedora-39-cpp.dockerfile +++ /dev/null @@ -1,111 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG arch -FROM ${arch}/fedora:39 -ARG arch - -# install dependencies -RUN dnf update -y && \ - dnf install -y \ - autoconf \ - boost-devel \ - brotli-devel \ - bzip2-devel \ - c-ares-devel \ - ccache \ - clang-devel \ - cmake \ - curl \ - curl-devel \ - gcc \ - gcc-c++ \ - gdb \ - gflags-devel \ - git \ - glog-devel \ - gmock-devel \ - google-benchmark-devel \ - grpc-devel \ - grpc-plugins \ - gtest-devel \ - java-latest-openjdk-devel \ - java-latest-openjdk-headless \ - json-devel \ - liborc-devel \ - libzstd-devel \ - llvm-devel \ - lz4-devel \ - make \ - ninja-build \ - openssl-devel \ - protobuf-devel \ - python \ - python-devel \ - python-pip \ - rapidjson-devel \ - re2-devel \ - snappy-devel \ - thrift-devel \ - utf8proc-devel \ - wget \ - which \ - xsimd-devel \ - zlib-devel - -COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh latest /usr/local - -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - -COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_sccache.sh 
unknown-linux-musl /usr/local/bin - -# PYARROW_TEST_GANDIVA=OFF: GH-39695: We need to make LLVM symbols visible in -# Python process explicitly if we use LLVM 17 or later. -ENV ARROW_ACERO=ON \ - ARROW_AZURE=OFF \ - ARROW_BUILD_TESTS=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_DATASET=ON \ - ARROW_FLIGHT=ON \ - ARROW_GANDIVA=ON \ - ARROW_GCS=ON \ - ARROW_HOME=/usr/local \ - ARROW_JEMALLOC=ON \ - ARROW_ORC=ON \ - ARROW_PARQUET=ON \ - ARROW_S3=ON \ - ARROW_SUBSTRAIT=ON \ - ARROW_USE_CCACHE=ON \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_OPENTELEMETRY=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - AWSSDK_SOURCE=BUNDLED \ - CC=gcc \ - CXX=g++ \ - google_cloud_cpp_storage_SOURCE=BUNDLED \ - opentelemetry_cpp_SOURCE=BUNDLED \ - PARQUET_BUILD_EXAMPLES=ON \ - PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH \ - PYARROW_TEST_GANDIVA=OFF diff --git a/ci/docker/fedora-42-cpp.dockerfile b/ci/docker/fedora-42-cpp.dockerfile new file mode 100644 index 00000000000..cabb066fec3 --- /dev/null +++ b/ci/docker/fedora-42-cpp.dockerfile @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG arch +FROM ${arch}/fedora:42 +ARG arch + +# install dependencies +RUN dnf update -y && \ + dnf install -y \ + autoconf \ + boost-devel \ + brotli-devel \ + bzip2-devel \ + c-ares-devel \ + ccache \ + clang-devel \ + cmake \ + curl \ + curl-devel \ + gcc \ + gcc-c++ \ + gdb \ + gflags-devel \ + git \ + glog-devel \ + gmock-devel \ + google-benchmark-devel \ + grpc-devel \ + grpc-plugins \ + gtest-devel \ + java-latest-openjdk-devel \ + java-latest-openjdk-headless \ + json-devel \ + liborc-devel \ + libzstd-devel \ + llvm-devel \ + lz4-devel \ + make \ + ninja-build \ + openssl-devel \ + patch \ + protobuf-devel \ + python \ + python-devel \ + python-pip \ + rapidjson-devel \ + re2-devel \ + snappy-devel \ + thrift-devel \ + utf8proc-devel \ + wget \ + which \ + xsimd-devel \ + zlib-devel + +COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + +COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin + +# PYARROW_TEST_GANDIVA=OFF: GH-39695: We need to make LLVM symbols visible in +# Python process explicitly if we use LLVM 17 or later. 
+ENV ARROW_ACERO=ON \ + ARROW_AZURE=OFF \ + ARROW_BUILD_TESTS=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ + ARROW_DATASET=ON \ + ARROW_FLIGHT=ON \ + ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ + ARROW_HOME=/usr/local \ + ARROW_JEMALLOC=ON \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_S3=ON \ + ARROW_SUBSTRAIT=ON \ + ARROW_USE_CCACHE=ON \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_OPENTELEMETRY=ON \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ + CC=gcc \ + CXX=g++ \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ + opentelemetry_cpp_SOURCE=BUNDLED \ + PARQUET_BUILD_EXAMPLES=ON \ + PARQUET_BUILD_EXECUTABLES=ON \ + PATH=/usr/lib/ccache/:$PATH \ + PYARROW_TEST_GANDIVA=OFF diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 31435d49891..b9f7c716e52 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -18,7 +18,7 @@ ARG base FROM ${base} -ARG r=4.4 +ARG r=4.5 ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium diff --git a/ci/docker/linux-apt-lint.dockerfile b/ci/docker/linux-apt-lint.dockerfile deleted file mode 100644 index b73cc585ea7..00000000000 --- a/ci/docker/linux-apt-lint.dockerfile +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base -FROM hadolint/hadolint:v1.17.2 AS hadolint -FROM ${base} - -ARG clang_tools -RUN apt-get update && \ - apt-get install -y -q \ - clang-${clang_tools} \ - clang-format-${clang_tools} \ - clang-tidy-${clang_tools} \ - clang-tools-${clang_tools} \ - cmake \ - curl \ - libclang-${clang_tools}-dev \ - llvm-${clang_tools}-dev \ - openjdk-11-jdk-headless \ - python3 \ - python3-dev \ - python3-pip \ - ruby \ - apt-transport-https \ - software-properties-common \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -ARG r=4.4 -RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ - tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ - # NOTE: Only R >= 4.0 is available in this repo - add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran40/' && \ - apt-get install -y \ - r-base=${r}* \ - r-recommended=${r}* \ - libxml2-dev - -# Ensure parallel R package installation, set CRAN repo mirror, -# and use pre-built binaries where possible -COPY ci/etc/rprofile /arrow/ci/etc/ -RUN cat /arrow/ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site -# Also ensure parallel compilation of C/C++ code -RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Renviron.site -# We don't need arrow's dependencies, only lintr (and its dependencies) -RUN R -e "install.packages('lintr')" -RUN R -e "install.packages('cyclocomp')" - -# Docker linter -COPY --from=hadolint /bin/hadolint /usr/bin/hadolint - -# IWYU -COPY ci/scripts/install_iwyu.sh /arrow/ci/scripts/ -RUN arrow/ci/scripts/install_iwyu.sh /tmp/iwyu /usr/local ${clang_tools} - -# Use python3 by default in scripts -RUN ln -s /usr/bin/python3 /usr/local/bin/python - -ENV LC_ALL=C.UTF-8 \ - LANG=C.UTF-8 diff --git a/ci/docker/linux-apt-python-3.dockerfile b/ci/docker/linux-apt-python-3.dockerfile index e215976d448..d68bed26288 
100644 --- a/ci/docker/linux-apt-python-3.dockerfile +++ b/ci/docker/linux-apt-python-3.dockerfile @@ -32,9 +32,10 @@ RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ -r arrow/python/requirements-test.txt ARG numba +ARG numba_cuda COPY ci/scripts/install_numba.sh /arrow/ci/scripts/ RUN if [ "${numba}" != "" ]; then \ - /arrow/ci/scripts/install_numba.sh ${numba} \ + /arrow/ci/scripts/install_numba.sh ${numba} ${numba_cuda} \ ; fi ENV ARROW_ACERO=ON \ diff --git a/ci/docker/linux-apt-python-313-freethreading.dockerfile b/ci/docker/linux-apt-python-313-freethreading.dockerfile index f5505e67f00..ceed5bac7e7 100644 --- a/ci/docker/linux-apt-python-313-freethreading.dockerfile +++ b/ci/docker/linux-apt-python-313-freethreading.dockerfile @@ -27,7 +27,7 @@ RUN apt-get update -y -q && \ rm -rf /var/lib/apt/lists* COPY python/requirements-build.txt \ - python/requirements-test.txt \ + python/requirements-test-3.13t.txt \ /arrow/python/ ENV ARROW_PYTHON_VENV /arrow-dev @@ -38,7 +38,7 @@ RUN ${ARROW_PYTHON_VENV}/bin/python -m pip install \ --prefer-binary \ --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" \ -r arrow/python/requirements-build.txt \ - -r arrow/python/requirements-test.txt + -r arrow/python/requirements-test-3.13t.txt # We want to run the PyArrow test suite with the GIL disabled, but cffi # (more precisely, the `_cffi_backend` module) currently doesn't declare diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index 48c7154ef0e..83a7b8b9baa 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -35,7 +35,7 @@ ENV LANG=C.UTF-8 # Build R # [1] https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-18-04 # [2] https://linuxize.com/post/how-to-install-r-on-ubuntu-18-04/#installing-r-packages-from-cran -ARG r=4.4 +ARG r=4.5 RUN apt-get update -y && \ apt-get install -y \ dirmngr \ diff --git a/ci/docker/linux-r.dockerfile 
b/ci/docker/linux-r.dockerfile index da378eac430..c0d5a69a94e 100644 --- a/ci/docker/linux-r.dockerfile +++ b/ci/docker/linux-r.dockerfile @@ -33,6 +33,9 @@ ENV R_PRUNE_DEPS=${r_prune_deps} ARG r_custom_ccache=false ENV R_CUSTOM_CCACHE=${r_custom_ccache} +ARG r_update_clang=false +ENV R_UPDATE_CLANG=${r_update_clang} + ARG tz="UTC" ENV TZ=${tz} diff --git a/ci/docker/python-free-threaded-wheel-manylinux-test-imports.dockerfile b/ci/docker/python-free-threaded-wheel-manylinux-test-imports.dockerfile index c884611ca39..e4149821de3 100644 --- a/ci/docker/python-free-threaded-wheel-manylinux-test-imports.dockerfile +++ b/ci/docker/python-free-threaded-wheel-manylinux-test-imports.dockerfile @@ -18,18 +18,20 @@ ARG base FROM ${base} +ARG python_version=3.13 + ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y -q && \ apt install -y -q --no-install-recommends software-properties-common gpg-agent && \ add-apt-repository -y ppa:deadsnakes/ppa && \ apt-get update -y -q && \ - apt install -y -q --no-install-recommends python3.13-dev python3.13-nogil python3.13-venv && \ + apt install -y -q --no-install-recommends python${python_version}-dev python${python_version}-nogil python${python_version}-venv && \ apt-get clean && \ rm -rf /var/lib/apt/lists* ENV ARROW_PYTHON_VENV /arrow-dev -RUN python3.13t -m venv ${ARROW_PYTHON_VENV} +RUN python${python_version}t -m venv ${ARROW_PYTHON_VENV} ENV PYTHON_GIL 0 ENV PATH "${ARROW_PYTHON_VENV}/bin:${PATH}" diff --git a/ci/docker/python-free-threaded-wheel-manylinux-test-unittests.dockerfile b/ci/docker/python-free-threaded-wheel-manylinux-test-unittests.dockerfile index 13b3bc140a9..566f0c0402a 100644 --- a/ci/docker/python-free-threaded-wheel-manylinux-test-unittests.dockerfile +++ b/ci/docker/python-free-threaded-wheel-manylinux-test-unittests.dockerfile @@ -18,6 +18,8 @@ ARG base FROM ${base} +ARG python_version=3.13 + ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y -q && \ @@ -27,25 +29,23 @@ RUN apt-get update -y -q 
&& \ apt install -y -q --no-install-recommends \ build-essential \ libffi-dev \ - python3.13-dev \ - python3.13-nogil \ - python3.13-venv && \ + python${python_version}-dev \ + python${python_version}-nogil \ + python${python_version}-venv && \ apt-get clean && \ rm -rf /var/lib/apt/lists* ENV ARROW_PYTHON_VENV /arrow-dev -RUN python3.13t -m venv ${ARROW_PYTHON_VENV} +RUN python${python_version}t -m venv ${ARROW_PYTHON_VENV} ENV PYTHON_GIL 0 ENV PATH "${ARROW_PYTHON_VENV}/bin:${PATH}" # pandas doesn't provide wheels for aarch64 yet, so we have to install nightly Cython # along with the rest of pandas' build dependencies and disable build isolation -COPY python/requirements-wheel-test.txt /arrow/python/ RUN python -m pip install \ --pre \ --prefer-binary \ --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" \ Cython numpy RUN python -m pip install "meson-python==0.13.1" "meson==1.2.1" wheel "versioneer[toml]" ninja -RUN python -m pip install --no-build-isolation -r /arrow/python/requirements-wheel-test.txt diff --git a/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile b/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile index 526f345416b..e79facb4904 100644 --- a/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile +++ b/ci/docker/python-free-threaded-wheel-musllinux-test-imports.dockerfile @@ -18,6 +18,9 @@ ARG base FROM ${base} +ARG python_version=3.13 +ARG python_patch_version=3.13.7 + RUN apk add --no-cache \ bash \ build-base \ @@ -34,19 +37,19 @@ RUN apk add --no-cache \ tzdata \ zlib-dev -# Install Python3.13.2 without GIL -RUN wget https://github.com/python/cpython/archive/refs/tags/v3.13.2.tar.gz && \ - tar -xzf v3.13.2.tar.gz && \ - rm v3.13.2.tar.gz && \ - cd cpython-3.13.2/ && \ +# Install Python without GIL +RUN wget https://github.com/python/cpython/archive/refs/tags/v${python_patch_version}.tar.gz && \ + tar -xzf v${python_patch_version}.tar.gz && \ + rm 
v${python_patch_version}.tar.gz && \ + cd cpython-${python_patch_version}/ && \ ./configure --disable-gil --with-ensurepip && \ make -j && \ make install && \ cd ../ && \ - rm -rf cpython-3.13.2/ + rm -rf cpython-${python_patch_version}/ ENV ARROW_PYTHON_VENV /arrow-dev -RUN python3.13t -m venv ${ARROW_PYTHON_VENV} +RUN python${python_version}t -m venv ${ARROW_PYTHON_VENV} ENV PYTHON_GIL 0 ENV PATH "${ARROW_PYTHON_VENV}/bin:${PATH}" diff --git a/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile b/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile index 5acf4d92642..3b170087ba8 100644 --- a/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile +++ b/ci/docker/python-free-threaded-wheel-musllinux-test-unittests.dockerfile @@ -18,6 +18,9 @@ ARG base FROM ${base} +ARG python_version=3.13 +ARG python_patch_version=3.13.7 + RUN apk add --no-cache \ bash \ build-base \ @@ -34,19 +37,19 @@ RUN apk add --no-cache \ tzdata \ zlib-dev -# Install Python3.13.2 without GIL -RUN wget https://github.com/python/cpython/archive/refs/tags/v3.13.2.tar.gz && \ - tar -xzf v3.13.2.tar.gz && \ - rm v3.13.2.tar.gz && \ - cd cpython-3.13.2/ && \ +# Install Python without GIL +RUN wget https://github.com/python/cpython/archive/refs/tags/v${python_patch_version}.tar.gz && \ + tar -xzf v${python_patch_version}.tar.gz && \ + rm v${python_patch_version}.tar.gz && \ + cd cpython-${python_patch_version}/ && \ ./configure --disable-gil --with-ensurepip && \ make -j && \ make install && \ cd ../ && \ - rm -rf cpython-3.13.2/ + rm -rf cpython-${python_patch_version}/ ENV ARROW_PYTHON_VENV /arrow-dev -RUN python3.13t -m venv ${ARROW_PYTHON_VENV} +RUN python${python_version}t -m venv ${ARROW_PYTHON_VENV} ENV PYTHON_GIL 0 ENV PATH "${ARROW_PYTHON_VENV}/bin:${PATH}" @@ -56,11 +59,9 @@ RUN cp /usr/share/zoneinfo/Etc/UTC /etc/localtime # pandas doesn't provide wheels for aarch64 yet, so we have to install nightly Cython # along with the rest 
of pandas' build dependencies and disable build isolation -COPY python/requirements-wheel-test.txt /arrow/python/ RUN python -m pip install \ --pre \ --prefer-binary \ --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" \ Cython numpy RUN python -m pip install "meson-python==0.13.1" "meson==1.2.1" wheel "versioneer[toml]" ninja -RUN python -m pip install --no-build-isolation -r /arrow/python/requirements-wheel-test.txt diff --git a/ci/docker/python-free-threaded-wheel-windows-test-vs2022.dockerfile b/ci/docker/python-free-threaded-wheel-windows-test-vs2022.dockerfile index 4b972999b04..ab257b271e5 100644 --- a/ci/docker/python-free-threaded-wheel-windows-test-vs2022.dockerfile +++ b/ci/docker/python-free-threaded-wheel-windows-test-vs2022.dockerfile @@ -26,27 +26,34 @@ FROM ${base} ARG python=3.13 +# hadolint ignore=SC1072 +RUN (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.1") & \ + (if "%python%"=="3.14" setx PYTHON_VERSION "3.14.0") + SHELL ["powershell", "-NoProfile", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] -RUN $filename = 'python-3.13.1-amd64.exe'; \ - $url = 'https://www.python.org/ftp/python/3.13.1/' + $filename; \ +RUN $version = $env:PYTHON_VERSION; \ + $filename = 'python-' + $version + '-amd64.exe'; \ + $url = 'https://www.python.org/ftp/python/' + $version + '/' + $filename; \ Invoke-WebRequest -Uri $url -OutFile $filename; \ Start-Process -FilePath $filename -ArgumentList '/quiet', 'Include_freethreaded=1' -Wait ENV PYTHON_CMD="py -${python}t" SHELL ["cmd", "/S", "/C"] -RUN %PYTHON_CMD% -m pip install -U pip setuptools - -COPY python/requirements-wheel-test.txt C:/arrow/python/ -# Cython and Pandas wheels for 3.13 free-threaded are not released yet +RUN %PYTHON_CMD% -m pip install -U pip setuptools & \ + if "%python%"=="3.13" ( \ + setx REQUIREMENTS_FILE "requirements-wheel-test-3.13t.txt" \ + ) else ( \ + setx REQUIREMENTS_FILE "requirements-wheel-test.txt" \ + ) + 
+COPY python/requirements-wheel-test-3.13t.txt python/requirements-wheel-test.txt C:/arrow/python/ +# Cython and Pandas wheels for free-threaded are not released yet RUN %PYTHON_CMD% -m pip install \ --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ --pre \ --prefer-binary \ - -r C:/arrow/python/requirements-wheel-test.txt -# cffi-based tests would crash when importing cffi. -# hadolint ignore=DL3059 -RUN %PYTHON_CMD% -m pip uninstall -y cffi + -r C:/arrow/python/%REQUIREMENTS_FILE% ENV PYTHON="${python}t" ENV PYTHON_GIL=0 diff --git a/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile b/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile index adbdccde71d..77a64fd5c24 100644 --- a/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile +++ b/ci/docker/python-free-threaded-wheel-windows-vs2022.dockerfile @@ -26,9 +26,13 @@ FROM ${base} ARG python=3.13 +RUN (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.1") & \ + (if "%python%"=="3.14" setx PYTHON_VERSION "3.14.0") + SHELL ["powershell", "-NoProfile", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] -RUN $filename = 'python-3.13.1-amd64.exe'; \ - $url = 'https://www.python.org/ftp/python/3.13.1/' + $filename; \ +RUN $version = $env:PYTHON_VERSION; \ + $filename = 'python-' + $version + '-amd64.exe'; \ + $url = 'https://www.python.org/ftp/python/' + $version + '/' + $filename; \ Invoke-WebRequest -Uri $url -OutFile $filename; \ Start-Process -FilePath $filename -ArgumentList '/quiet', 'Include_freethreaded=1' -Wait diff --git a/ci/docker/python-sdist.dockerfile b/ci/docker/python-sdist.dockerfile index 853b532ab5e..efa1b56f9f8 100644 --- a/ci/docker/python-sdist.dockerfile +++ b/ci/docker/python-sdist.dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-FROM amd64/ubuntu:20.04 +FROM amd64/ubuntu:24.04 SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -25,12 +25,19 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \ RUN apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ git \ - python3-pip && \ + python3-pip \ + python3-venv && \ apt-get clean && \ rm -rf /var/lib/apt/lists* COPY python/requirements-build.txt \ /arrow/python/requirements-build.txt -RUN pip3 install --requirement /arrow/python/requirements-build.txt -ENV PYTHON=/usr/bin/python3 +ENV ARROW_PYTHON_VENV /arrow-dev + +RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ + . ${ARROW_PYTHON_VENV}/bin/activate && \ + pip install \ + -r arrow/python/requirements-build.txt + +ENV PYTHON=${ARROW_PYTHON_VENV}/bin/python diff --git a/ci/docker/python-wheel-manylinux-test.dockerfile b/ci/docker/python-wheel-manylinux-test.dockerfile index 09883f9780a..4b84117a1cb 100644 --- a/ci/docker/python-wheel-manylinux-test.dockerfile +++ b/ci/docker/python-wheel-manylinux-test.dockerfile @@ -28,7 +28,8 @@ RUN pip install -r /arrow/python/requirements-wheel-test.txt RUN apt-get update -y -q && \ apt-get install -y -q \ build-essential \ - python3-dev && \ + python3-dev \ + tzdata-legacy && \ apt-get clean && \ rm -rf /var/lib/apt/lists* diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 789f1531dd7..ffdd0d44f5f 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -25,30 +25,14 @@ ARG manylinux ENV LINUX_WHEEL_KIND='manylinux' ENV LINUX_WHEEL_VERSION=${manylinux} -# Ensure dnf is installed, especially for the manylinux2014 base -RUN if [ "${LINUX_WHEEL_VERSION}" = "2014" ]; then \ - sed -i \ - -e 's/^mirrorlist/#mirrorlist/' \ - -e 's/^#baseurl/baseurl/' \ - -e 's/mirror\.centos\.org/vault.centos.org/' \ - /etc/yum.repos.d/*.repo; \ - if [ "${arch}" != "amd64" ]; then \ - sed -i \ - -e 
's,vault\.centos\.org/centos,vault.centos.org/altarch,' \ - /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo; \ - fi; \ - fi -RUN yum install -y dnf - # Install basic dependencies RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget -# A system Python is required for ninja and vcpkg in this Dockerfile. -# On manylinux2014 base images, system Python is 2.7.5, while -# on manylinux_2_28, no system python is installed. -# We therefore override the PATH with Python 3.8 in /opt/python +# A system Python is required for Ninja and vcpkg in this Dockerfile. +# On manylinux_2_28 base images, no system Python is installed. +# We therefore override the PATH with Python 3.10 in /opt/python # so that we have a consistent Python version across base images. -ENV CPYTHON_VERSION=cp39 +ENV CPYTHON_VERSION=cp310 ENV PATH=/opt/python/${CPYTHON_VERSION}-${CPYTHON_VERSION}/bin:${PATH} # Install CMake @@ -114,11 +98,13 @@ RUN --mount=type=secret,id=github_repository_owner \ rm -rf ~/.config/NuGet/ # Make sure auditwheel is up-to-date -RUN pipx upgrade auditwheel +# Force upgrade version to 6.4.0 or later to ensure platform tags order is correct +# See https://github.com/apache/arrow/pull/46705 +RUN pipx upgrade auditwheel>=6.4.0 # Configure Python for applications running in the bash shell of this Dockerfile -ARG python=3.9 -ARG python_abi_tag=cp39 +ARG python=3.10 +ARG python_abi_tag=cp310 ENV PYTHON_VERSION=${python} ENV PYTHON_ABI_TAG=${python_abi_tag} RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-${PYTHON_ABI_TAG}) && \ diff --git a/ci/docker/python-wheel-musllinux.dockerfile b/ci/docker/python-wheel-musllinux.dockerfile index ab8e3f6ec9d..d00d44bd093 100644 --- a/ci/docker/python-wheel-musllinux.dockerfile +++ b/ci/docker/python-wheel-musllinux.dockerfile @@ -37,15 +37,16 @@ RUN apk add --no-cache \ unzip \ wget \ zip -# Add mono from testing repo because it's not in the main repo -RUN apk add --no-cache 
--repository=https://dl-cdn.alpinelinux.org/alpine/edge/testing mono +# Add mono from community repo because it's not in the main repo. +# We will be able to use the main repo once we move to alpine 3.22 or later. +RUN apk add --no-cache --repository=https://dl-cdn.alpinelinux.org/alpine/edge/community mono # A system Python is required for ninja and vcpkg in this Dockerfile. # On musllinux_1_2 a system python is installed (3.12) but pip is not -# We therefore override the PATH with Python 3.9 in /opt/python +# We therefore override the PATH with Python 3.10 in /opt/python # so that we have a consistent Python version across base images # as well as pip. -ENV CPYTHON_VERSION=cp39 +ENV CPYTHON_VERSION=cp310 ENV PATH=/opt/python/${CPYTHON_VERSION}-${CPYTHON_VERSION}/bin:${PATH} # Install vcpkg @@ -81,6 +82,7 @@ RUN --mount=type=secret,id=github_repository_owner \ export GITHUB_REPOSITORY_OWNER=$(cat /run/secrets/github_repository_owner); \ export GITHUB_TOKEN=$(cat /run/secrets/github_token); \ export VCPKG_BINARY_SOURCES=$(cat /run/secrets/vcpkg_binary_sources); \ + export CMAKE_POLICY_VERSION_MINIMUM=3.5; \ arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg} && \ vcpkg install \ --clean-after-build \ @@ -99,8 +101,8 @@ RUN --mount=type=secret,id=github_repository_owner \ RUN pipx upgrade auditwheel # Configure Python for applications running in the bash shell of this Dockerfile -ARG python=3.9 -ARG python_abi_tag=cp39 +ARG python=3.10 +ARG python_abi_tag=cp310 ENV PYTHON_VERSION=${python} ENV PYTHON_ABI_TAG=${python_abi_tag} RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-${PYTHON_ABI_TAG}) && \ @@ -109,10 +111,5 @@ RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-${PYTHON_ABI_TAG SHELL ["/bin/bash", "-i", "-c", "-l"] ENTRYPOINT ["/bin/bash", "-i", "-c", "-l"] -# Remove once there are released Cython wheels for 3.13 free-threaded available -RUN if [ "${python_abi_tag}" = "cp313t" ]; then \ - pip install cython --pre 
--extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --prefer-binary ; \ - fi - COPY python/requirements-wheel-build.txt /arrow/python/ RUN pip install -r /arrow/python/requirements-wheel-build.txt diff --git a/ci/docker/python-wheel-windows-test-vs2022-base.dockerfile b/ci/docker/python-wheel-windows-test-vs2022-base.dockerfile index 1d1602c03a2..bd1da7b14b3 100644 --- a/ci/docker/python-wheel-windows-test-vs2022-base.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2022-base.dockerfile @@ -35,7 +35,7 @@ RUN ` --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" ` --add Microsoft.VisualStudio.Component.VC.CoreBuildTools ` --add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ` - --add Microsoft.VisualStudio.Component.Windows10SDK.20348 ` + --add Microsoft.VisualStudio.Component.Windows11SDK.26100 ` --add Microsoft.VisualStudio.Component.VC.CMake.Project ` || IF "%ERRORLEVEL%"=="3010" EXIT 0) ` && del /q vs_buildtools.exe @@ -51,7 +51,7 @@ SHELL ["cmd", "/S", "/C"] # Install git, wget, minio RUN choco install --no-progress -r -y git wget -RUN curl https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z ` +RUN curl https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2025-01-20T14-49-07Z ` --output "C:\Windows\Minio.exe" # Install the GCS testbench using a well-known Python version. 
diff --git a/ci/docker/python-wheel-windows-test-vs2022.dockerfile b/ci/docker/python-wheel-windows-test-vs2022.dockerfile index f80e7ba0fe0..1bed37eb001 100644 --- a/ci/docker/python-wheel-windows-test-vs2022.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2022.dockerfile @@ -26,13 +26,13 @@ FROM ${base} # hadolint shell=cmd.exe -# Define the full version number otherwise choco falls back to patch number 0 (3.9 => 3.9.0) -ARG python=3.9 -RUN (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PYTHON_CMD "C:\Python39\python") & \ - (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PYTHON_CMD "py -3.10") & \ +# Define the full version number otherwise choco falls back to patch number 0 (3.10 => 3.10.0) +ARG python=3.10 +RUN (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PYTHON_CMD "py -3.10") & \ (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PYTHON_CMD "py -3.11") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.8" && setx PYTHON_CMD "py -3.12") & \ - (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.1" && setx PYTHON_CMD "py -3.13") + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.10" && setx PYTHON_CMD "py -3.12") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.7" && setx PYTHON_CMD "py -3.13") & \ + (if "%python%"=="3.14" setx PYTHON_VERSION "3.14.0" && setx PYTHON_CMD "py -3.14") # hadolint ignore=DL3059 RUN choco install -r -y --pre --no-progress --force python --version=%PYTHON_VERSION% diff --git a/ci/docker/python-wheel-windows-vs2022-base.dockerfile b/ci/docker/python-wheel-windows-vs2022-base.dockerfile index 7f683487a8c..99dd27b987a 100644 --- a/ci/docker/python-wheel-windows-vs2022-base.dockerfile +++ b/ci/docker/python-wheel-windows-vs2022-base.dockerfile @@ -67,7 +67,7 @@ RUN ` --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" ` --add Microsoft.VisualStudio.Component.VC.CoreBuildTools ` --add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ` - --add 
Microsoft.VisualStudio.Component.Windows10SDK.20348 ` + --add Microsoft.VisualStudio.Component.Windows11SDK.26100 ` --add Microsoft.VisualStudio.Component.VC.CMake.Project ` || IF "%ERRORLEVEL%"=="3010" EXIT 0) ` && del /q vs_buildtools.exe diff --git a/ci/docker/python-wheel-windows-vs2022.dockerfile b/ci/docker/python-wheel-windows-vs2022.dockerfile index 50e942fd6bd..04750ff44c4 100644 --- a/ci/docker/python-wheel-windows-vs2022.dockerfile +++ b/ci/docker/python-wheel-windows-vs2022.dockerfile @@ -21,14 +21,13 @@ ARG base FROM ${base} -# Define the full version number otherwise choco falls back to patch number 0 (3.9 => 3.9.0) -# Note that Python 3.9 does not come with the "py" launcher -ARG python=3.9 -RUN (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PYTHON_CMD "C:\Python39\python") & \ - (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PYTHON_CMD "py -3.10") & \ +# Define the full version number otherwise choco falls back to patch number 0 (3.10 => 3.10.0) +ARG python=3.10 +RUN (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PYTHON_CMD "py -3.10") & \ (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PYTHON_CMD "py -3.11") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.8" && setx PYTHON_CMD "py -3.12") & \ - (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.1" && setx PYTHON_CMD "py -3.13") + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.10" && setx PYTHON_CMD "py -3.12") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.7" && setx PYTHON_CMD "py -3.13") & \ + (if "%python%"=="3.14" setx PYTHON_VERSION "3.14.0" && setx PYTHON_CMD "py -3.14") RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% RUN %PYTHON_CMD% -m pip install -U pip setuptools diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index 2a90a5637d4..d38dd418e29 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ 
b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -33,6 +33,7 @@ RUN apt-get update -y -q && \ git \ libssl-dev \ libcurl4-openssl-dev \ + patch \ python3-pip \ python3-venv \ tzdata \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 846a910903d..88a27efe335 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -107,6 +107,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler \ protobuf-compiler-grpc \ @@ -173,9 +174,6 @@ RUN /arrow/ci/scripts/install_minio.sh latest /usr/local COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_gcs_testbench.sh default -COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_azurite.sh - COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_ceph.sh diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile index a1fd178a2c7..5e114d5dcd9 100644 --- a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile @@ -33,6 +33,7 @@ RUN apt-get update -y -q && \ git \ libssl-dev \ libcurl4-openssl-dev \ + patch \ python3-pip \ python3-venv \ tzdata \ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 6bc49a4c842..0347d452d7b 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -108,6 +108,7 @@ RUN apt-get update -y -q && \ ninja-build \ nlohmann-json3-dev \ npm \ + patch \ pkg-config \ protobuf-compiler \ protobuf-compiler-grpc \ diff --git a/ci/docker/ubuntu-swift.dockerfile b/ci/docker/ubuntu-swift.dockerfile deleted file mode 100644 index d90d2d87b90..00000000000 --- a/ci/docker/ubuntu-swift.dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor 
license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM swift:5.10-noble - -# Go is needed for generating test data -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - golang-go \ - unzip \ - wget && \ - apt-get clean - -ARG swift_lint=0.53.0 -RUN wget https://github.com/realm/SwiftLint/releases/download/${swift_lint}/swiftlint_linux.zip && \ - unzip swiftlint_linux.zip && \ - mv swiftlint /usr/local/bin/ && \ - mkdir -p /usr/local/share/doc/swiftlint/ && \ - mv LICENSE /usr/local/share/doc/swiftlint/ && \ - rm -rf swiftlint_linux.zip diff --git a/ci/rtools/README.md b/ci/rtools/README.md deleted file mode 100644 index 08b5ea7f513..00000000000 --- a/ci/rtools/README.md +++ /dev/null @@ -1,35 +0,0 @@ - - -# rtools40 patches for AWS SDK and related libs - -The patches in this directory are solely for the purpose of building Arrow C++ -under [Rtools40](https://cran.r-project.org/bin/windows/Rtools/rtools40.html) -and not used elsewhere. Once we've dropped support for Rtools40, we can consider -removing these patches. - -The larger reason these patches are needed is that Rtools provides their own -packages and their versions of the AWS libraries weren't compatible with CMake -3.25. 
Our solution was to bundle the AWS libs instead and these patches were -required to get them building under the Rtools40 environment. - -The patches were added while upgrading the minimum required CMake version to -3.25 in [GH-44950](https://github.com/apache/arrow/issues/44950). Please see the -associated PR, [GH-44989](https://github.com/apache/arrow/pull/44989), for more -context. diff --git a/ci/rtools/aws_c_common_ep.patch b/ci/rtools/aws_c_common_ep.patch deleted file mode 100644 index 94c84d0fe1b..00000000000 --- a/ci/rtools/aws_c_common_ep.patch +++ /dev/null @@ -1,39 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -diff --git a/include/aws/common/byte_order.inl b/include/aws/common/byte_order.inl -index 1204be0..0abd9cb 100644 ---- a/include/aws/common/byte_order.inl -+++ b/include/aws/common/byte_order.inl -@@ -13,7 +13,7 @@ - # include - #else - # include --#endif /* _MSC_VER */ -+#endif /* _WIN32 */ - - AWS_EXTERN_C_BEGIN - -@@ -39,7 +39,7 @@ AWS_STATIC_IMPL uint64_t aws_hton64(uint64_t x) { - uint64_t v; - __asm__("bswap %q0" : "=r"(v) : "0"(x)); - return v; --#elif defined(_MSC_VER) -+#elif defined(_WIN32) - return _byteswap_uint64(x); - #else - uint32_t low = x & UINT32_MAX; diff --git a/ci/rtools/aws_c_io_ep.patch b/ci/rtools/aws_c_io_ep.patch deleted file mode 100644 index a15d706ba12..00000000000 --- a/ci/rtools/aws_c_io_ep.patch +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -diff --git a/source/windows/secure_channel_tls_handler.c b/source/windows/secure_channel_tls_handler.c -index 50caf02..29fe850 100644 ---- a/source/windows/secure_channel_tls_handler.c -+++ b/source/windows/secure_channel_tls_handler.c -@@ -35,6 +36,25 @@ - # pragma warning(disable : 4306) /* Identifier is type cast to a larger pointer. 
*/ - #endif - -+#ifndef SP_PROT_TLS1_0_SERVER -+#define SP_PROT_TLS1_0_SERVER SP_PROT_TLS1_SERVER -+#endif -+#ifndef SP_PROT_TLS1_0_CLIENT -+#define SP_PROT_TLS1_0_CLIENT SP_PROT_TLS1_CLIENT -+#endif -+#ifndef SP_PROT_TLS1_1_SERVER -+#define SP_PROT_TLS1_1_SERVER 0x00000100 -+#endif -+#ifndef SP_PROT_TLS1_1_CLIENT -+#define SP_PROT_TLS1_1_CLIENT 0x00000200 -+#endif -+#ifndef SCH_USE_STRONG_CRYPTO -+#define SCH_USE_STRONG_CRYPTO 0x00400000 -+#endif -+#ifndef SECBUFFER_ALERT -+#define SECBUFFER_ALERT 0x11 -+#endif -+ - #define KB_1 1024 - #define READ_OUT_SIZE (16 * KB_1) - #define READ_IN_SIZE READ_OUT_SIZE -@@ -456,7 +476,7 @@ static int s_fillin_alpn_data( - - *extension_length += sizeof(uint32_t) + sizeof(uint16_t); - -- *extension_name = SecApplicationProtocolNegotiationExt_ALPN; -+ *extension_name = 2; - /*now add the protocols*/ - for (size_t i = 0; i < protocols_count; ++i) { - struct aws_byte_cursor *protocol_ptr = NULL; diff --git a/ci/rtools/awssdk_ep.patch b/ci/rtools/awssdk_ep.patch deleted file mode 100644 index bd26f853290..00000000000 --- a/ci/rtools/awssdk_ep.patch +++ /dev/null @@ -1,181 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -diff --git a/aws-cpp-sdk-core/include/aws/core/utils/Array.h b/aws-cpp-sdk-core/include/aws/core/utils/Array.h -index 2b5bbc566..7cb93bdf0 100644 ---- a/aws-cpp-sdk-core/include/aws/core/utils/Array.h -+++ b/aws-cpp-sdk-core/include/aws/core/utils/Array.h -@@ -54,7 +54,7 @@ namespace Aws - { - m_data.reset(Aws::NewArray(m_size, ARRAY_ALLOCATION_TAG)); - --#ifdef _WIN32 -+#ifdef _MSC_VER - std::copy(arrayToCopy, arrayToCopy + arraySize, stdext::checked_array_iterator< T * >(m_data.get(), m_size)); - #else - std::copy(arrayToCopy, arrayToCopy + arraySize, m_data.get()); -@@ -82,7 +82,7 @@ namespace Aws - if(arr->m_size > 0 && arr->m_data) - { - size_t arraySize = arr->m_size; --#ifdef _WIN32 -+#ifdef _MSC_VER - std::copy(arr->m_data.get(), arr->m_data.get() + arraySize, stdext::checked_array_iterator< T * >(m_data.get() + location, m_size)); - #else - std::copy(arr->m_data.get(), arr->m_data.get() + arraySize, m_data.get() + location); -@@ -101,7 +101,7 @@ namespace Aws - { - m_data.reset(Aws::NewArray(m_size, ARRAY_ALLOCATION_TAG)); - --#ifdef _WIN32 -+#ifdef _MSC_VER - std::copy(other.m_data.get(), other.m_data.get() + other.m_size, stdext::checked_array_iterator< T * >(m_data.get(), m_size)); - #else - std::copy(other.m_data.get(), other.m_data.get() + other.m_size, m_data.get()); -@@ -134,7 +134,7 @@ namespace Aws - { - m_data.reset(Aws::NewArray(m_size, ARRAY_ALLOCATION_TAG)); - --#ifdef _WIN32 -+#ifdef _MSC_VER - std::copy(other.m_data.get(), other.m_data.get() + other.m_size, stdext::checked_array_iterator< T * >(m_data.get(), m_size)); - #else - std::copy(other.m_data.get(), other.m_data.get() + other.m_size, m_data.get()); -diff --git a/aws-cpp-sdk-core/source/http/windows/WinHttpSyncHttpClient.cpp b/aws-cpp-sdk-core/source/http/windows/WinHttpSyncHttpClient.cpp -index 4dade6489..a0456cf8e 100644 ---- a/aws-cpp-sdk-core/source/http/windows/WinHttpSyncHttpClient.cpp -+++ b/aws-cpp-sdk-core/source/http/windows/WinHttpSyncHttpClient.cpp -@@ -22,6 +22,16 @@ - 
#include - #include - -+#ifndef WINHTTP_OPTION_WEB_SOCKET_KEEPALIVE_INTERVAL -+#define WINHTTP_OPTION_WEB_SOCKET_KEEPALIVE_INTERVAL 116 -+#endif -+#ifndef WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_1 -+#define WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_1 0x00000200 -+#endif -+#ifndef WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_2 -+#define WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_2 0x00000800 -+#endif -+ - using namespace Aws::Client; - using namespace Aws::Http; - using namespace Aws::Http::Standard; -@@ -272,7 +282,7 @@ bool WinHttpSyncHttpClient::DoQueryHeaders(void* hHttpRequest, std::shared_ptr(dwSize / sizeof(wchar_t))); - - WinHttpQueryHeaders(hHttpRequest, WINHTTP_QUERY_CONTENT_TYPE, nullptr, &contentTypeStr, &dwSize, 0); -- if (contentTypeStr[0] != NULL) -+ if (contentTypeStr[0]) - { - Aws::String contentStr = StringUtils::FromWString(contentTypeStr); - response->SetContentType(contentStr); -@@ -303,7 +313,7 @@ bool WinHttpSyncHttpClient::DoQueryHeaders(void* hHttpRequest, std::shared_ptrSetContentType(contentTypeStr); - AWS_LOGSTREAM_DEBUG(GetLogTag(), "Received content type " << contentTypeStr); -diff --git a/aws-cpp-sdk-core/source/http/windows/WinSyncHttpClient.cpp b/aws-cpp-sdk-core/source/http/windows/WinSyncHttpClient.cpp -index d7513cc3c..e390a8d4e 100644 ---- a/aws-cpp-sdk-core/source/http/windows/WinSyncHttpClient.cpp -+++ b/aws-cpp-sdk-core/source/http/windows/WinSyncHttpClient.cpp -@@ -349,7 +349,7 @@ std::shared_ptr WinSyncHttpClient::MakeRequest(const std::shared_p - } - } - -- if (!success && !IsRequestProcessingEnabled() || !ContinueRequest(*request)) -+ if ((!success && !IsRequestProcessingEnabled()) || !ContinueRequest(*request)) - { - response->SetClientErrorType(CoreErrors::USER_CANCELLED); - response->SetClientErrorMessage("Request processing disabled or continuation cancelled by user's continuation handler."); -diff --git a/aws-cpp-sdk-core/source/platform/windows/FileSystem.cpp b/aws-cpp-sdk-core/source/platform/windows/FileSystem.cpp -index 2ea82de6f..bc423441e 100644 
---- a/aws-cpp-sdk-core/source/platform/windows/FileSystem.cpp -+++ b/aws-cpp-sdk-core/source/platform/windows/FileSystem.cpp -@@ -11,7 +11,9 @@ - #include - #include - -+#ifdef _MSC_VER - #pragma warning( disable : 4996) -+#endif - - using namespace Aws::Utils; - namespace Aws -@@ -304,6 +306,9 @@ Aws::String CreateTempFilePath() - { - #ifdef _MSC_VER - #pragma warning(disable: 4996) // _CRT_SECURE_NO_WARNINGS -+#elif !defined(L_tmpnam_s) -+ // Definition from the MSVC stdio.h -+ #define L_tmpnam_s (sizeof("\\") + 16) - #endif - char s_tempName[L_tmpnam_s+1]; - -diff --git a/aws-cpp-sdk-core/source/platform/windows/OSVersionInfo.cpp b/aws-cpp-sdk-core/source/platform/windows/OSVersionInfo.cpp -index 0180f7fbf..3adbab313 100644 ---- a/aws-cpp-sdk-core/source/platform/windows/OSVersionInfo.cpp -+++ b/aws-cpp-sdk-core/source/platform/windows/OSVersionInfo.cpp -@@ -9,7 +9,9 @@ - - #include - -+#ifdef _MSC_VER - #pragma warning(disable: 4996) -+#endif - #include - #include - namespace Aws -diff --git a/aws-cpp-sdk-core/source/utils/crypto/factory/Factories.cpp b/aws-cpp-sdk-core/source/utils/crypto/factory/Factories.cpp -index 2ee517b48..3b0dce665 100644 ---- a/aws-cpp-sdk-core/source/utils/crypto/factory/Factories.cpp -+++ b/aws-cpp-sdk-core/source/utils/crypto/factory/Factories.cpp -@@ -939,7 +939,7 @@ std::shared_ptr Aws::Utils::Crypto::CreateSha256HMACIm - return GetSha256HMACFactory()->CreateImplementation(); - } - --#ifdef _WIN32 -+#ifdef _MSC_VER - #pragma warning( push ) - #pragma warning( disable : 4702 ) - #endif -@@ -1032,7 +1032,7 @@ std::shared_ptr Aws::Utils::Crypto::CreateAES_KeyWrapImplementa - return GetAES_KeyWrapFactory()->CreateImplementation(key); - } - --#ifdef _WIN32 -+#ifdef _MSC_VER - #pragma warning(pop) - #endif diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 9eac3ef5cb9..9b53ba99704 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} 
pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=19.0.1.9000 +pkgver=22.0.0 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/ccache_setup.sh b/ci/scripts/ccache_setup.sh index 6afcdda7d0a..df00efe702f 100755 --- a/ci/scripts/ccache_setup.sh +++ b/ci/scripts/ccache_setup.sh @@ -19,8 +19,10 @@ set -eux -echo "ARROW_USE_CCACHE=ON" >> $GITHUB_ENV -echo "CCACHE_COMPILERCHECK=content" >> $GITHUB_ENV -echo "CCACHE_COMPRESS=1" >> $GITHUB_ENV -echo "CCACHE_COMPRESSLEVEL=6" >> $GITHUB_ENV -echo "CCACHE_MAXSIZE=1G" >> $GITHUB_ENV +{ + echo "ARROW_USE_CCACHE=ON" + echo "CCACHE_COMPILERCHECK=content" + echo "CCACHE_COMPRESS=1" + echo "CCACHE_COMPRESSLEVEL=6" + echo "CCACHE_MAXSIZE=1G" +} >> "$GITHUB_ENV" diff --git a/ci/scripts/conan_build.sh b/ci/scripts/conan_build.sh index 03e5cab8426..15c73d9d25a 100755 --- a/ci/scripts/conan_build.sh +++ b/ci/scripts/conan_build.sh @@ -29,53 +29,56 @@ export ARROW_HOME=${source_dir} conan_args=() conan_args+=(--build=missing) if [ -n "${ARROW_CONAN_PARQUET:-}" ]; then - conan_args+=(--options arrow/*:parquet=${ARROW_CONAN_PARQUET}) - conan_args+=(--options arrow/*:with_thrift=${ARROW_CONAN_PARQUET}) - conan_args+=(--options arrow/*:with_boost=${ARROW_CONAN_PARQUET}) + conan_args+=(--options "arrow/*:parquet=${ARROW_CONAN_PARQUET}") + conan_args+=(--options "arrow/*:with_boost=${ARROW_CONAN_PARQUET}") + conan_args+=(--options "arrow/*:with_json=${ARROW_CONAN_PARQUET}") + conan_args+=(--options "arrow/*:with_thrift=${ARROW_CONAN_PARQUET}") +else + conan_args+=(--options "arrow/*:parquet=False") fi if [ -n "${ARROW_CONAN_WITH_BROTLI:-}" ]; then - conan_args+=(--options arrow/*:with_brotli=${ARROW_CONAN_WITH_BROTLI}) + conan_args+=(--options "arrow/*:with_brotli=${ARROW_CONAN_WITH_BROTLI}") fi if [ -n "${ARROW_CONAN_WITH_BZ2:-}" ]; then - conan_args+=(--options arrow/*:with_bz2=${ARROW_CONAN_WITH_BZ2}) + conan_args+=(--options 
"arrow/*:with_bz2=${ARROW_CONAN_WITH_BZ2}") fi if [ -n "${ARROW_CONAN_WITH_FLIGHT_RPC:-}" ]; then - conan_args+=(--options arrow/*:with_flight_rpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) - conan_args+=(--options arrow/*:with_grpc=${ARROW_CONAN_WITH_FLIGHT_RPC}) - conan_args+=(--options arrow/*:with_protobuf=${ARROW_CONAN_WITH_FLIGHT_RPC}) - conan_args+=(--options arrow/*:with_re2=${ARROW_CONAN_WITH_FLIGHT_RPC}) + conan_args+=(--options "arrow/*:with_flight_rpc=${ARROW_CONAN_WITH_FLIGHT_RPC}") + conan_args+=(--options "arrow/*:with_grpc=${ARROW_CONAN_WITH_FLIGHT_RPC}") + conan_args+=(--options "arrow/*:with_protobuf=${ARROW_CONAN_WITH_FLIGHT_RPC}") + conan_args+=(--options "arrow/*:with_re2=${ARROW_CONAN_WITH_FLIGHT_RPC}") fi if [ -n "${ARROW_CONAN_WITH_GLOG:-}" ]; then - conan_args+=(--options arrow/*:with_glog=${ARROW_CONAN_WITH_GLOG}) + conan_args+=(--options "arrow/*:with_glog=${ARROW_CONAN_WITH_GLOG}") fi if [ -n "${ARROW_CONAN_WITH_JEMALLOC:-}" ]; then - conan_args+=(--options arrow/*:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}) + conan_args+=(--options "arrow/*:with_jemalloc=${ARROW_CONAN_WITH_JEMALLOC}") fi if [ -n "${ARROW_CONAN_WITH_JSON:-}" ]; then - conan_args+=(--options arrow/*:with_json=${ARROW_CONAN_WITH_JSON}) + conan_args+=(--options "arrow/*:with_json=${ARROW_CONAN_WITH_JSON}") fi if [ -n "${ARROW_CONAN_WITH_LZ4:-}" ]; then - conan_args+=(--options arrow/*:with_lz4=${ARROW_CONAN_WITH_LZ4}) + conan_args+=(--options "arrow/*:with_lz4=${ARROW_CONAN_WITH_LZ4}") fi if [ -n "${ARROW_CONAN_WITH_SNAPPY:-}" ]; then - conan_args+=(--options arrow/*:with_snappy=${ARROW_CONAN_WITH_SNAPPY}) + conan_args+=(--options "arrow/*:with_snappy=${ARROW_CONAN_WITH_SNAPPY}") fi if [ -n "${ARROW_CONAN_WITH_ZSTD:-}" ]; then - conan_args+=(--options arrow/*:with_zstd=${ARROW_CONAN_WITH_ZSTD}) + conan_args+=(--options "arrow/*:with_zstd=${ARROW_CONAN_WITH_ZSTD}") fi -version=$(grep '^set(ARROW_VERSION ' ${ARROW_HOME}/cpp/CMakeLists.txt | \ +version=$(grep '^set(ARROW_VERSION ' 
"${ARROW_HOME}/cpp/CMakeLists.txt" | \ grep -E -o '([0-9.]*)') -conan_args+=(--version ${version}) +conan_args+=(--version "${version}") rm -rf ~/.conan/data/arrow/ -rm -rf ${build_dir}/conan || sudo rm -rf ${build_dir}/conan -mkdir -p ${build_dir}/conan || sudo mkdir -p ${build_dir}/conan -if [ -w ${build_dir} ]; then - cp -a ${source_dir}/ci/conan/* ${build_dir}/conan/ +rm -rf "${build_dir}/conan" || sudo rm -rf "${build_dir}/conan" +mkdir -p "${build_dir}/conan" || sudo mkdir -p "${build_dir}/conan" +if [ -w "${build_dir}" ]; then + cp -a "${source_dir}"/ci/conan/* "${build_dir}/conan/" else - sudo cp -a ${source_dir}/ci/conan/* ${build_dir}/conan/ - sudo chown -R $(id -u):$(id -g) ${build_dir}/conan/ + sudo cp -a "${source_dir}"/ci/conan/* "${build_dir}/conan/" + sudo chown -R "$(id -u):$(id -g)" "${build_dir}/conan/" fi -cd ${build_dir}/conan/all +cd "${build_dir}/conan/all" conan create . "${conan_args[@]}" "$@" diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 0ed229517f9..fd550d8fb08 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -118,12 +118,40 @@ if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then fi } + ORIGINAL_CC="${CC}" + if [ -n "${CC}" ]; then + if [ "${ARROW_USE_CCACHE}" = "ON" ]; then + CC="ccache ${CC}" + else + if command -v sccache; then + CC="sccache ${CC}" + fi + fi + fi + + ORIGINAL_CXX="${CXX}" + if [ -n "${CXX}" ]; then + if [ "${ARROW_USE_CCACHE}" = "ON" ]; then + CXX="ccache ${CXX}" + else + if command -v sccache; then + CXX="sccache ${CXX}" + fi + fi + fi meson setup \ --prefix=${MESON_PREFIX:-${ARROW_HOME}} \ --buildtype=${ARROW_BUILD_TYPE:-debug} \ - -Dtests=$(meson_boolean ${ARROW_BUILD_TESTS:-OFF}) \ + --pkg-config-path="${CONDA_PREFIX}/lib/pkgconfig/" \ + -Dauto_features=enabled \ + -Dfuzzing=disabled \ + -Dgcs=disabled \ + -Ds3=disabled \ . 
\ ${source_dir} + + CC="${ORIGINAL_CC}" + CXX="${ORIGINAL_CXX}" elif [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then if [ "${UBUNTU}" = "20.04" ]; then echo "arrow emscripten build is not supported on Ubuntu 20.04, run with UBUNTU=22.04" @@ -146,6 +174,11 @@ elif [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ ${ARROW_CMAKE_ARGS} \ ${source_dir} +elif [ -n "${CMAKE_PRESET}" ]; then + cmake \ + --preset="${CMAKE_PRESET}" \ + ${ARROW_CMAKE_ARGS} \ + ${source_dir} else cmake \ -Dabsl_SOURCE=${absl_SOURCE:-} \ @@ -194,7 +227,6 @@ else -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ -DARROW_S3=${ARROW_S3:-OFF} \ -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL:-DEFAULT} \ - -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ -DARROW_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} \ -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ @@ -204,7 +236,6 @@ else -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ -DARROW_USE_LLD=${ARROW_USE_LLD:-OFF} \ -DARROW_USE_MOLD=${ARROW_USE_MOLD:-OFF} \ - -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \ -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \ @@ -257,10 +288,18 @@ else ${source_dir} fi +: ${ARROW_BUILD_PARALLEL:=$[${n_jobs} + 1]} if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then - time meson install + time meson compile -j ${ARROW_BUILD_PARALLEL} + meson install + # Remove all added files in cpp/subprojects/ because they may have + # unreadable permissions on Docker host. + pushd "${source_dir}" + meson subprojects purge --confirm --include-cache + popd else - export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:-$[${n_jobs} + 1]} + : ${CMAKE_BUILD_PARALLEL_LEVEL:=${ARROW_BUILD_PARALLEL}} + export CMAKE_BUILD_PARALLEL_LEVEL time cmake --build . 
--target install fi @@ -274,10 +313,14 @@ fi popd if [ -x "$(command -v ldconfig)" ]; then - if [ -x "$(command -v sudo)" ]; then - SUDO=sudo - else + if [ "$(id --user)" -eq 0 ]; then SUDO= + else + if [ -x "$(command -v sudo)" ]; then + SUDO=sudo + else + SUDO= + fi fi ${SUDO} ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} fi diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 36e09e8936f..4243e78bca7 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -19,7 +19,7 @@ set -ex -if [[ $# < 2 ]]; then +if [[ $# -lt 2 ]]; then echo "Usage: $0 [ctest args ...]" exit 1 fi @@ -42,7 +42,17 @@ if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then export ARROW_DEBUG_MEMORY_POOL=trap fi +exclude_tests=() ctest_options=() +if ! type azurite >/dev/null 2>&1; then + exclude_tests+=("arrow-azurefs-test") +fi +if ! type storage-testbench >/dev/null 2>&1; then + exclude_tests+=("arrow-gcsfs-test") +fi +if ! type minio >/dev/null 2>&1; then + exclude_tests+=("arrow-s3fs-test") +fi case "$(uname)" in Linux) n_jobs=$(nproc) @@ -50,41 +60,44 @@ case "$(uname)" in Darwin) n_jobs=$(sysctl -n hw.ncpu) # TODO: https://github.com/apache/arrow/issues/40410 - exclude_tests="arrow-s3fs-test" - ctest_options+=(--exclude-regex "${exclude_tests}") + exclude_tests+=("arrow-s3fs-test") ;; MINGW*) n_jobs=${NUMBER_OF_PROCESSORS:-1} # TODO: Enable these crashed tests. 
# https://issues.apache.org/jira/browse/ARROW-9072 - exclude_tests="gandiva-binary-test" - exclude_tests="${exclude_tests}|gandiva-boolean-expr-test" - exclude_tests="${exclude_tests}|gandiva-date-time-test" - exclude_tests="${exclude_tests}|gandiva-decimal-single-test" - exclude_tests="${exclude_tests}|gandiva-decimal-test" - exclude_tests="${exclude_tests}|gandiva-filter-project-test" - exclude_tests="${exclude_tests}|gandiva-filter-test" - exclude_tests="${exclude_tests}|gandiva-hash-test" - exclude_tests="${exclude_tests}|gandiva-if-expr-test" - exclude_tests="${exclude_tests}|gandiva-in-expr-test" - exclude_tests="${exclude_tests}|gandiva-internals-test" - exclude_tests="${exclude_tests}|gandiva-literal-test" - exclude_tests="${exclude_tests}|gandiva-null-validity-test" - exclude_tests="${exclude_tests}|gandiva-precompiled-test" - exclude_tests="${exclude_tests}|gandiva-projector-test" - exclude_tests="${exclude_tests}|gandiva-utf8-test" - ctest_options+=(--exclude-regex "${exclude_tests}") + exclude_tests+=("gandiva-binary-test") + exclude_tests+=("gandiva-boolean-expr-test") + exclude_tests+=("gandiva-date-time-test") + exclude_tests+=("gandiva-decimal-single-test") + exclude_tests+=("gandiva-decimal-test") + exclude_tests+=("gandiva-filter-project-test") + exclude_tests+=("gandiva-filter-test") + exclude_tests+=("gandiva-hash-test") + exclude_tests+=("gandiva-if-expr-test") + exclude_tests+=("gandiva-in-expr-test") + exclude_tests+=("gandiva-internals-test") + exclude_tests+=("gandiva-literal-test") + exclude_tests+=("gandiva-null-validity-test") + exclude_tests+=("gandiva-precompiled-test") + exclude_tests+=("gandiva-projector-test") + exclude_tests+=("gandiva-utf8-test") ;; *) n_jobs=${NPROC:-1} ;; esac +if [ "${#exclude_tests[@]}" -gt 0 ]; then + IFS="|" + ctest_options+=(--exclude-regex "${exclude_tests[*]}") + unset IFS +fi if [ "${ARROW_EMSCRIPTEN:-OFF}" = "ON" ]; then n_jobs=1 # avoid spurious fails on emscripten due to loading too many big 
executables fi -pushd ${build_dir} +pushd "${build_dir}" if [ -z "${PYTHON}" ] && ! which python > /dev/null 2>&1; then export PYTHON="${PYTHON:-python3}" @@ -92,21 +105,58 @@ fi if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then ARROW_BUILD_EXAMPLES=OFF # TODO: Remove this meson test \ + --no-rebuild \ --print-errorlogs \ + --suite arrow \ "$@" else ctest \ --label-regex unittest \ --output-on-failure \ - --parallel ${n_jobs} \ + --parallel "${n_jobs}" \ --repeat until-pass:3 \ - --timeout ${ARROW_CTEST_TIMEOUT:-300} \ + --timeout "${ARROW_CTEST_TIMEOUT:-300}" \ "${ctest_options[@]}" \ "$@" fi +# This is for testing find_package(Arrow). +# +# Note that this is not a perfect solution. We should improve this +# later. +# +# * This is ad-hoc +# * This doesn't test other CMake packages such as ArrowDataset +if [ "${ARROW_USE_MESON:-OFF}" = "OFF" ] && \ + [ "${ARROW_EMSCRIPTEN:-OFF}" = "OFF" ] && \ + [ "${ARROW_USE_ASAN:-OFF}" = "OFF" ]; then + CMAKE_PREFIX_PATH="${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}}" + case "$(uname)" in + MINGW*) + # /lib/cmake/ isn't searched on Windows. + # + # See also: + # https://cmake.org/cmake/help/latest/command/find_package.html#config-mode-search-procedure + CMAKE_PREFIX_PATH+="/lib/cmake/" + ;; + esac + if [ -n "${VCPKG_ROOT}" ] && [ -n "${VCPKG_DEFAULT_TRIPLET}" ]; then + CMAKE_PREFIX_PATH+=";${VCPKG_ROOT}/installed/${VCPKG_DEFAULT_TRIPLET}" + fi + cmake \ + -S "${source_dir}/examples/minimal_build" \ + -B "${build_dir}/examples/minimal_build" \ + -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" + cmake --build "${build_dir}/examples/minimal_build" + pushd "${source_dir}/examples/minimal_build" + # PATH= is for Windows. 
+ PATH="${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}}/bin:${PATH}" \ + "${build_dir}/examples/minimal_build/arrow-example" + popd +fi + if [ "${ARROW_BUILD_EXAMPLES}" == "ON" ]; then - examples=$(find ${binary_output_dir} -executable -name "*example") + examples=$(find "${binary_output_dir}" -executable -name "*example") if [ "${examples}" == "" ]; then echo "==================" echo "No examples found!" @@ -124,12 +174,15 @@ fi if [ "${ARROW_FUZZING}" == "ON" ]; then # Fuzzing regression tests - ${binary_output_dir}/arrow-ipc-stream-fuzz ${ARROW_TEST_DATA}/arrow-ipc-stream/crash-* - ${binary_output_dir}/arrow-ipc-stream-fuzz ${ARROW_TEST_DATA}/arrow-ipc-stream/*-testcase-* - ${binary_output_dir}/arrow-ipc-file-fuzz ${ARROW_TEST_DATA}/arrow-ipc-file/*-testcase-* - ${binary_output_dir}/arrow-ipc-tensor-stream-fuzz ${ARROW_TEST_DATA}/arrow-ipc-tensor-stream/*-testcase-* + # Some fuzz regression files may trigger huge memory allocations, + # let the allocator return null instead of aborting. + export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1" + "${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/crash-* + "${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/*-testcase-* + "${binary_output_dir}/arrow-ipc-file-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-file/*-testcase-* + "${binary_output_dir}/arrow-ipc-tensor-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-tensor-stream/*-testcase-* if [ "${ARROW_PARQUET}" == "ON" ]; then - ${binary_output_dir}/parquet-arrow-fuzz ${ARROW_TEST_DATA}/parquet/fuzzing/*-testcase-* + "${binary_output_dir}/parquet-arrow-fuzz" "${ARROW_TEST_DATA}"/parquet/fuzzing/*-testcase-* fi fi diff --git a/ci/scripts/csharp_build.sh b/ci/scripts/csharp_build.sh deleted file mode 100755 index 5a397679487..00000000000 --- a/ci/scripts/csharp_build.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license 
agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -source_dir=${1}/csharp - -pushd ${source_dir} -dotnet build -popd diff --git a/ci/scripts/csharp_pack.sh b/ci/scripts/csharp_pack.sh deleted file mode 100755 index e9dfc664ec5..00000000000 --- a/ci/scripts/csharp_pack.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -eux - -source_dir=${1}/csharp - -pushd ${source_dir} -dotnet pack -c Release -popd diff --git a/ci/scripts/csharp_test.sh b/ci/scripts/csharp_test.sh deleted file mode 100755 index a435a835251..00000000000 --- a/ci/scripts/csharp_test.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -source_dir=${1}/csharp - -# Python and PyArrow are required for C Data Interface tests. -if [ -z "${PYTHON}" ]; then - if type python3 > /dev/null 2>&1; then - export PYTHON=python3 - else - export PYTHON=python - fi -fi -${PYTHON} -m pip install pyarrow find-libpython -export PYTHONNET_PYDLL=$(${PYTHON} -m find_libpython) - -pushd ${source_dir} -dotnet test -popd diff --git a/ci/scripts/generate_dataset.py b/ci/scripts/generate_dataset.py deleted file mode 100644 index 42ee0763a1b..00000000000 --- a/ci/scripts/generate_dataset.py +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -import os -import shutil -import random - -import pandas as pd - -if __name__ == "__main__": - # generate the test dataframe - data = { - "total_amount": list(), - "fare_amount": list() - } - for i in range(0, 500): - data['total_amount'].append(random.randint(1,11)*5) - data['fare_amount'].append(random.randint(1,11)*3) - df = pd.DataFrame(data) - - # dump the dataframe to a parquet file - df.to_parquet("skyhook_test_data.parquet") - - # create the dataset by copying the parquet files - shutil.rmtree("nyc", ignore_errors=True) - payment_type = ["1", "2", "3", "4"] - vendor_id = ["1", "2"] - for p in payment_type: - for v in vendor_id: - path = f"nyc/payment_type={p}/VendorID={v}" - os.makedirs(path, exist_ok=True) - shutil.copyfile("skyhook_test_data.parquet", os.path.join(path, f"{p}.{v}.parquet")) diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh index b8b1618bed3..e911af6f0ff 100755 --- a/ci/scripts/install_azurite.sh +++ b/ci/scripts/install_azurite.sh @@ -20,18 +20,9 @@ set -e node_version="$(node --version)" -echo "node version = ${node_version}" - -case "${node_version}" in - v12*) - # Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 - azurite_version=v3.29.0 - ;; - *) - azurite_version=latest - ;; -esac +echo "Node.js version = ${node_version}" +azurite_version=latest case "$(uname)" in Darwin) npm install -g 
azurite@${azurite_version} @@ -46,5 +37,4 @@ case "$(uname)" in which azurite ;; esac - -echo "azurite version = $(azurite --version)" +echo "Azurite version = $(azurite --version)" diff --git a/ci/scripts/install_ccache.sh b/ci/scripts/install_ccache.sh index 7d39e18ebe5..75ca81076d6 100755 --- a/ci/scripts/install_ccache.sh +++ b/ci/scripts/install_ccache.sh @@ -32,23 +32,23 @@ case $(uname) in MINGW64*) url="https://github.com/ccache/ccache/releases/download/v${version}/ccache-${version}-windows-x86_64.zip" pushd /tmp/ccache - curl --fail --location --remote-name ${url} - unzip -j ccache-${version}-windows-x86_64.zip + curl --fail --location --remote-name "${url}" + unzip -j "ccache-${version}-windows-x86_64.zip" chmod +x ccache.exe - mv ccache.exe ${prefix}/bin/ + mv ccache.exe "${prefix}/bin/" popd ;; *) url="https://github.com/ccache/ccache/archive/v${version}.tar.gz" - wget -q ${url} -O - | tar -xzf - --directory /tmp/ccache --strip-components=1 + wget -q "${url}" -O - | tar -xzf - --directory /tmp/ccache --strip-components=1 mkdir /tmp/ccache/build pushd /tmp/ccache/build cmake \ -GNinja \ -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=${prefix} \ + -DCMAKE_INSTALL_PREFIX="${prefix}" \ -DZSTD_FROM_INTERNET=ON \ .. 
ninja install diff --git a/ci/scripts/install_chromedriver.sh b/ci/scripts/install_chromedriver.sh index 9097a20bfc5..9167ae70e8d 100755 --- a/ci/scripts/install_chromedriver.sh +++ b/ci/scripts/install_chromedriver.sh @@ -23,7 +23,7 @@ set -e chrome_version=$1 -if [ $chrome_version = "latest" ]; then +if [ "$chrome_version" = "latest" ]; then latest_release_path=LATEST_RELEASE_STABLE else latest_release_path=LATEST_RELEASE_${chrome_version} diff --git a/ci/scripts/install_cmake.sh b/ci/scripts/install_cmake.sh index d01a7a744dc..a6916b255e2 100755 --- a/ci/scripts/install_cmake.sh +++ b/ci/scripts/install_cmake.sh @@ -30,7 +30,7 @@ archs=([x86_64]=x86_64 [aarch64]=aarch64) arch=$(uname -m) -if [ -z ${archs[$arch]} ]; then +if [ -z "${archs[$arch]}" ]; then echo "Unsupported architecture: ${arch}" exit 0 fi @@ -56,25 +56,25 @@ case ${platform} in ;; esac -mkdir -p ${prefix} +mkdir -p "${prefix}" url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-" case ${platform} in macos) url+="universal.tar.gz" - curl -L ${url} | tar -xzf - --directory ${prefix} --strip-components=1 - ln -s CMake.app/Contents/bin ${prefix}/bin + curl -L "${url}" | tar -xzf - --directory "${prefix}" --strip-components=1 + ln -s CMake.app/Contents/bin "${prefix}/bin" ;; windows) url+="${arch}.zip" - archive_name=$(basename ${url}) - curl -L -o ${archive_name} ${url} - unzip ${archive_name} - base_name=$(basename ${archive_name} .zip) - mv ${base_name}/* ${prefix} - rm -rf ${base_name} ${archive_name} + archive_name=$(basename "${url}") + curl -L -o "${archive_name}" "${url}" + unzip "${archive_name}" + base_name=$(basename "${archive_name}" .zip) + mv "${base_name}"/* "${prefix}" + rm -rf "${base_name}" "${archive_name}" ;; *) url+="${arch}.tar.gz" - curl -L ${url} | tar -xzf - --directory ${prefix} --strip-components=1 + curl -L "${url}" | tar -xzf - --directory "${prefix}" --strip-components=1 ;; esac diff --git a/ci/scripts/install_conda.sh 
b/ci/scripts/install_conda.sh index 8539a0b2bbf..c74b318cfa3 100755 --- a/ci/scripts/install_conda.sh +++ b/ci/scripts/install_conda.sh @@ -30,16 +30,16 @@ installer=$1 version=$2 prefix=$3 -download_url=https://github.com/conda-forge/miniforge/releases/latest/download/${installer^}-${platform}-${arch}.sh +download_url=https://github.com/conda-forge/miniforge/releases/${version}/download/${installer^}-${platform}-${arch}.sh echo "Downloading Miniconda installer from ${download_url} ..." -wget -nv ${download_url} -O /tmp/installer.sh -bash /tmp/installer.sh -b -p ${prefix} +wget -nv "${download_url}" -O /tmp/installer.sh +bash /tmp/installer.sh -b -p "${prefix}" rm /tmp/installer.sh # Like "conda init", but for POSIX sh rather than bash -ln -s ${prefix}/etc/profile.d/conda.sh /etc/profile.d/conda.sh +ln -s "${prefix}/etc/profile.d/conda.sh" /etc/profile.d/conda.sh export PATH=/opt/conda/bin:$PATH diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index b89e43cfb31..8967e2681d9 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -30,9 +30,9 @@ if [ "${dask}" = "upstream_devel" ]; then pip install "dask[dataframe] @ git+https://github.com/dask/dask.git" pip install -U git+https://github.com/dask-contrib/dask-expr.git elif [ "${dask}" = "latest" ]; then - pip install dask[dataframe] + pip install "dask[dataframe]" else - pip install dask[dataframe]==${dask} + pip install "dask[dataframe]==${dask}" fi # additional dependencies needed for dask's s3 tests diff --git a/ci/scripts/install_emscripten.sh b/ci/scripts/install_emscripten.sh index 4bad7238a6c..6cf3f023ba3 100755 --- a/ci/scripts/install_emscripten.sh +++ b/ci/scripts/install_emscripten.sh @@ -24,13 +24,13 @@ set -e target_path=$1 pyodide_path=$2 -emscripten_version=$(${pyodide_path}/python -c "import sys;print(*sys._emscripten_info.emscripten_version,sep='.')") +emscripten_version=$("${pyodide_path}/python" -c "import 
sys;print(*sys._emscripten_info.emscripten_version,sep='.')") -cd ${target_path} +cd "${target_path}" if [ ! -d emsdk ]; then git clone https://github.com/emscripten-core/emsdk.git fi cd emsdk -./emsdk install ${emscripten_version} -./emsdk activate ${emscripten_version} -echo "Installed emsdk to: ${target_path}" \ No newline at end of file +./emsdk install "${emscripten_version}" +./emsdk activate "${emscripten_version}" +echo "Installed emsdk to: ${target_path}" diff --git a/ci/scripts/install_gcs_testbench.bat b/ci/scripts/install_gcs_testbench.bat index f54f98db7ca..d0ceb7be2b6 100644 --- a/ci/scripts/install_gcs_testbench.bat +++ b/ci/scripts/install_gcs_testbench.bat @@ -17,7 +17,7 @@ @echo on -set GCS_TESTBENCH_VERSION="v0.40.0" +set GCS_TESTBENCH_VERSION="v0.55.0" set PIPX_FLAGS=--verbose if NOT "%PIPX_PYTHON%"=="" ( diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 48a5858a358..e962e6f1657 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -35,19 +35,19 @@ case "$(uname -m)" in esac version=$1 -if [[ "${version}" -eq "default" ]]; then - version="v0.39.0" +if [[ "${version}" = "default" ]]; then + version="v0.55.0" fi # The Python to install pipx with -: ${PIPX_BASE_PYTHON:=$(which python3)} +: "${PIPX_BASE_PYTHON:=$(which python3)}" # The Python to install the GCS testbench with -: ${PIPX_PYTHON:=${PIPX_BASE_PYTHON:-$(which python3)}} +: "${PIPX_PYTHON:=${PIPX_BASE_PYTHON:-$(which python3)}}" export PIP_BREAK_SYSTEM_PACKAGES=1 ${PIPX_BASE_PYTHON} -m pip install -U pipx -pipx_flags=(--verbose --python ${PIPX_PYTHON}) +pipx_flags=(--verbose --python "${PIPX_PYTHON}") if [[ $(id -un) == "root" ]]; then # Install globally as /root/.local/bin is typically not in $PATH pipx_flags+=(--global) @@ -55,5 +55,5 @@ fi if [[ -n "${PIPX_PIP_ARGS}" ]]; then pipx_flags+=(--pip-args "'${PIPX_PIP_ARGS}'") fi -${PIPX_BASE_PYTHON} -m pipx install ${pipx_flags[@]} \ +${PIPX_BASE_PYTHON} -m 
pipx install "${pipx_flags[@]}" \ "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" diff --git a/ci/scripts/install_iwyu.sh b/ci/scripts/install_iwyu.sh deleted file mode 100755 index 3cd2cbc95fe..00000000000 --- a/ci/scripts/install_iwyu.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -set -eu - -source_dir=${1:-/tmp/iwyu} -install_prefix=${2:-/usr/local} -clang_tools_version=${3:-8} - -iwyu_branch_name="clang_${clang_tools_version}" -if [ ${clang_tools_version} -lt 10 ]; then - iwyu_branch_name="${iwyu_branch_name}.0" -fi - -git clone --single-branch --branch ${iwyu_branch_name} \ - https://github.com/include-what-you-use/include-what-you-use.git ${source_dir} - -mkdir -p ${source_dir}/build -pushd ${source_dir}/build - -# Build IWYU for current Clang -export CC=clang-${clang_tools_version} -export CXX=clang++-${clang_tools_version} - -cmake -DCMAKE_PREFIX_PATH=/usr/lib/llvm-${clang_tools_version} \ - -DCMAKE_INSTALL_PREFIX=${install_prefix} \ - ${source_dir} -make -j4 -make install - -popd - -rm -rf ${source_dir} diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 8685ced0bd1..5efa03e82e2 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -34,7 +34,7 @@ archs=([x86_64]=amd64 [s390x]=s390x) arch=$(uname -m) -if [ -z ${archs[$arch]} ]; then +if [ -z "${archs[$arch]}" ]; then echo "Unsupported architecture: ${arch}" exit 0 fi @@ -71,23 +71,23 @@ download() local output=$1 local url=$2 - mkdir -p $(dirname ${output}) + mkdir -p "$(dirname "${output}")" if type wget > /dev/null 2>&1; then - wget -nv --output-document ${output} ${url} + wget -nv --output-document "${output}" "${url}" else - curl --fail --location --output ${output} ${url} + curl --fail --location --output "${output}" "${url}" fi } if [[ ! -x ${prefix}/bin/minio ]]; then url="https://dl.min.io/server/minio/release/${platform}-${arch}/archive/${minio_version}" echo "Fetching ${url}..." - download ${prefix}/bin/minio ${url} - chmod +x ${prefix}/bin/minio + download "${prefix}/bin/minio" "${url}" + chmod +x "${prefix}/bin/minio" fi if [[ ! -x ${prefix}/bin/mc ]]; then url="https://dl.min.io/client/mc/release/${platform}-${arch}/archive/${mc_version}" echo "Fetching ${url}..." 
- download ${prefix}/bin/mc ${url} - chmod +x ${prefix}/bin/mc + download "${prefix}/bin/mc" "${url}" + chmod +x "${prefix}/bin/mc" fi diff --git a/ci/scripts/install_ninja.sh b/ci/scripts/install_ninja.sh index 0440d563fb1..5cfa0cbea2d 100755 --- a/ci/scripts/install_ninja.sh +++ b/ci/scripts/install_ninja.sh @@ -30,11 +30,11 @@ prefix=$2 url="https://github.com/ninja-build/ninja/archive/v${version}.tar.gz" mkdir /tmp/ninja -wget -q ${url} -O - | tar -xzf - --directory /tmp/ninja --strip-components=1 +wget -q "${url}" -O - | tar -xzf - --directory /tmp/ninja --strip-components=1 pushd /tmp/ninja ./configure.py --bootstrap -mv ninja ${prefix}/bin +mv ninja "${prefix}/bin" popd rm -rf /tmp/ninja diff --git a/ci/scripts/install_numba.sh b/ci/scripts/install_numba.sh index fba9f50b79c..22e0df2b3c6 100755 --- a/ci/scripts/install_numba.sh +++ b/ci/scripts/install_numba.sh @@ -19,14 +19,18 @@ set -e -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " +if [ "$#" -ne 1 ] && [ "$#" -ne 2 ]; then + echo "Usage: $0 [numba-cuda version]" exit 1 fi numba=$1 if [ -n "${ARROW_PYTHON_VENV:-}" ]; then + # We don't need to follow this external file. + # See also: https://www.shellcheck.net/wiki/SC1091 + # + # shellcheck source=/dev/null . 
"${ARROW_PYTHON_VENV}/bin/activate" fi @@ -35,5 +39,19 @@ if [ "${numba}" = "master" ]; then elif [ "${numba}" = "latest" ]; then pip install numba else - pip install numba==${numba} + pip install "numba==${numba}" +fi + +if [ "$#" -eq 1 ]; then + exit 0 +fi + +numba_cuda=$2 + +if [ "${numba_cuda}" = "master" ]; then + pip install https://github.com/NVIDIA/numba-cuda/archive/main.tar.gz#egg=numba-cuda +elif [ "${numba_cuda}" = "latest" ]; then + pip install numba-cuda +else + pip install "numba-cuda==${numba_cuda}" fi diff --git a/ci/scripts/install_numpy.sh b/ci/scripts/install_numpy.sh index f04fe81b669..8addc7c023e 100755 --- a/ci/scripts/install_numpy.sh +++ b/ci/scripts/install_numpy.sh @@ -29,5 +29,5 @@ numpy=${1:-"latest"} if [ "${numpy}" = "latest" ]; then pip install numpy else - pip install numpy==${numpy} + pip install numpy=="${numpy}" fi diff --git a/ci/scripts/install_pandas.sh b/ci/scripts/install_pandas.sh index 6a506a86514..03009b3824f 100755 --- a/ci/scripts/install_pandas.sh +++ b/ci/scripts/install_pandas.sh @@ -32,7 +32,7 @@ if [ "${numpy}" = "nightly" ]; then elif [ "${numpy}" = "latest" ]; then pip install numpy else - pip install numpy==${numpy} + pip install numpy=="${numpy}" fi if [ "${pandas}" = "upstream_devel" ]; then @@ -42,5 +42,5 @@ elif [ "${pandas}" = "nightly" ]; then elif [ "${pandas}" = "latest" ]; then pip install pandas else - pip install pandas==${pandas} + pip install pandas=="${pandas}" fi diff --git a/ci/scripts/install_python.sh b/ci/scripts/install_python.sh index a84d136c0c2..fe0c4bcb931 100755 --- a/ci/scripts/install_python.sh +++ b/ci/scripts/install_python.sh @@ -25,12 +25,13 @@ platforms=([windows]=Windows [linux]=Linux) declare -A versions -versions=([3.9]=3.9.13 - [3.10]=3.10.11 +versions=([3.10]=3.10.11 [3.11]=3.11.9 - [3.12]=3.12.9 - [3.13]=3.13.2 - [3.13t]=3.13.2) + [3.12]=3.12.10 + [3.13]=3.13.7 + [3.13t]=3.13.7 + [3.14]=3.14.0 + [3.14t]=3.14.0) if [ "$#" -ne 2 ]; then echo "Usage: $0 " @@ -44,19 +45,16 @@ 
platform=${platforms[$1]} version=$2 full_version=${versions[$2]} -if [ $platform = "macOS" ]; then +if [ "$platform" = "macOS" ]; then echo "Downloading Python installer..." - if [ "$(uname -m)" = "x86_64" ] && [ "$version" = "3.9" ]; - then - fname="python-${full_version}-macosx10.9.pkg" - else - fname="python-${full_version}-macos11.pkg" - fi + fname="python-${full_version}-macos11.pkg" wget "https://www.python.org/ftp/python/${full_version}/${fname}" echo "Installing Python..." - if [[ $2 == "3.13t" ]]; then + if [[ $2 == "3.13t" ]] || [[ $2 == "3.14t" ]]; then + # Extract the base version without 't' suffix + base_version="${version%t}" # See https://github.com/python/cpython/issues/120098#issuecomment-2151122033 for more info on this. cat > ./choicechanges.plist < @@ -69,21 +67,22 @@ if [ $platform = "macOS" ]; then choiceAttribute selected choiceIdentifier - org.python.Python.PythonTFramework-3.13 + org.python.Python.PythonTFramework-${base_version} EOF - installer -pkg $fname -applyChoiceChangesXML ./choicechanges.plist -target / + installer -pkg "$fname" -applyChoiceChangesXML ./choicechanges.plist -target / rm ./choicechanges.plist else - installer -pkg $fname -target / + installer -pkg "$fname" -target / fi - rm $fname + rm "$fname" python="/Library/Frameworks/Python.framework/Versions/${version}/bin/python${version}" - if [[ $2 == "3.13t" ]]; then - python="/Library/Frameworks/PythonT.framework/Versions/3.13/bin/python3.13t" + if [[ $2 == "3.13t" ]] || [[ $2 == "3.14t" ]]; then + base_version="${version%t}" + python="/Library/Frameworks/PythonT.framework/Versions/${base_version}/bin/python${base_version}t" fi echo "Installing Pip..." 
diff --git a/ci/scripts/install_sccache.sh b/ci/scripts/install_sccache.sh index 136f39b3ae2..c571625a3b3 100755 --- a/ci/scripts/install_sccache.sh +++ b/ci/scripts/install_sccache.sh @@ -19,7 +19,7 @@ set -e -if [ "$#" -lt 1 -o "$#" -gt 3 ]; then +if [ "$#" -lt 2 ] || [ "$#" -gt 3 ]; then echo "Usage: $0 " echo "Will default to version=0.3.0 " exit 1 @@ -39,30 +39,30 @@ SCCACHE_URL="https://github.com/mozilla/sccache/releases/download/v$VERSION/scca SCCACHE_ARCHIVE=sccache.tar.gz # Download archive and checksum -curl -L $SCCACHE_URL --output $SCCACHE_ARCHIVE -curl -L $SCCACHE_URL.sha256 --output $SCCACHE_ARCHIVE.sha256 +curl -L "$SCCACHE_URL" --output $SCCACHE_ARCHIVE +curl -L "${SCCACHE_URL}.sha256" --output $SCCACHE_ARCHIVE.sha256 echo " $SCCACHE_ARCHIVE" >> $SCCACHE_ARCHIVE.sha256 -SHA_ARGS="--check --status" +SHA_ARGS=(--check --status) # Busybox sha256sum uses different flags if sha256sum --version 2>&1 | grep -q BusyBox; then - SHA_ARGS="-sc" + SHA_ARGS=(-sc) fi -sha256sum $SHA_ARGS $SCCACHE_ARCHIVE.sha256 +sha256sum "${SHA_ARGS[@]}" $SCCACHE_ARCHIVE.sha256 -if [ ! -d $PREFIX ]; then - mkdir -p $PREFIX +if [ ! -d "$PREFIX" ]; then + mkdir -p "$PREFIX" fi # Extract only the sccache binary into $PREFIX and ignore README and LICENSE. # --wildcards doesn't work on busybox. -tar -xzvf $SCCACHE_ARCHIVE --strip-component=1 --directory $PREFIX --exclude="sccache*/*E*E*" -chmod a+x $PREFIX/sccache +tar -xzvf $SCCACHE_ARCHIVE --strip-component=1 --directory "$PREFIX" --exclude="sccache*/*E*E*" +chmod a+x "${PREFIX}/sccache" if [ -n "${GITHUB_PATH}" ]; then - echo "$PREFIX" >> $GITHUB_PATH + echo "$PREFIX" >> "$GITHUB_PATH" # Add executable for windows as mingw workaround. 
- echo "SCCACHE_PATH=$PREFIX/sccache.exe" >> $GITHUB_ENV + echo "SCCACHE_PATH=$PREFIX/sccache.exe" >> "$GITHUB_ENV" fi diff --git a/ci/scripts/install_vcpkg.sh b/ci/scripts/install_vcpkg.sh index 4183a606512..401b6b8d99b 100755 --- a/ci/scripts/install_vcpkg.sh +++ b/ci/scripts/install_vcpkg.sh @@ -26,6 +26,7 @@ fi arrow_dir=$(cd -- "$(dirname -- "$0")/../.." && pwd -P) default_vcpkg_ports_patch="${arrow_dir}/ci/vcpkg/ports.patch" +vcpkg_patch="${arrow_dir}/ci/vcpkg/vcpkg.patch" vcpkg_destination=$1 vcpkg_version=${2:-} @@ -36,12 +37,15 @@ if [ -z "${vcpkg_version}" ]; then fi # reduce the fetched data using a shallow clone -git clone --shallow-since=2021-04-01 https://github.com/microsoft/vcpkg ${vcpkg_destination} +git clone --shallow-since=2021-04-01 https://github.com/microsoft/vcpkg "${vcpkg_destination}" -pushd ${vcpkg_destination} +pushd "${vcpkg_destination}" git checkout "${vcpkg_version}" +git apply --verbose --ignore-whitespace "${vcpkg_patch}" +echo "Patch successfully applied to the VCPKG files!" + if [[ "${OSTYPE:-}" == "msys" ]]; then ./bootstrap-vcpkg.bat -disableMetrics else @@ -49,7 +53,7 @@ else fi if [ -f "${vcpkg_ports_patch}" ]; then - git apply --verbose --ignore-whitespace ${vcpkg_ports_patch} + git apply --verbose --ignore-whitespace "${vcpkg_ports_patch}" echo "Patch successfully applied to the VCPKG port files!" fi diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index 275ef431c7d..ac8bb841c0e 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -24,22 +24,20 @@ build_dir=${2} gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration -: ${ARROW_INTEGRATION_CPP:=ON} -: ${ARROW_INTEGRATION_CSHARP:=ON} -: ${ARROW_INTEGRATION_JS:=ON} +: "${ARROW_INTEGRATION_CPP:=ON}" -: ${ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS:=cpp,csharp,js} +: "${ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS:=cpp}" export ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS -. ${arrow_dir}/ci/scripts/util_log.sh +. 
"${arrow_dir}/ci/scripts/util_log.sh" github_actions_group_begin "Integration: Prepare: Archery" -pip install -e $arrow_dir/dev/archery[integration] +pip install -e "$arrow_dir/dev/archery[integration]" github_actions_group_end github_actions_group_begin "Integration: Prepare: Dependencies" # For C Data Interface testing -if [ "${ARROW_INTEGRATION_CSHARP}" == "ON" ]; then +if [ "${ARCHERY_INTEGRATION_WITH_DOTNET}" -gt "0" ]; then pip install pythonnet fi if [ "${ARCHERY_INTEGRATION_WITH_JAVA}" -gt "0" ]; then @@ -48,6 +46,7 @@ fi github_actions_group_end export ARROW_BUILD_ROOT=${build_dir} +export ARROW_JS_ROOT=${build_dir}/js # Get more detailed context on crashes export PYTHONFAULTHANDLER=1 @@ -58,17 +57,17 @@ export PYTHONFAULTHANDLER=1 export GOMEMLIMIT=200MiB export GODEBUG=gctrace=1,clobberfree=1 +ARCHERY_WITH_CPP=$([ "$ARROW_INTEGRATION_CPP" == "ON" ] && echo "1" || echo "0") + # Rust can be enabled by exporting ARCHERY_INTEGRATION_WITH_RUST=1 time archery integration \ --run-c-data \ --run-ipc \ --run-flight \ - --with-cpp=$([ "$ARROW_INTEGRATION_CPP" == "ON" ] && echo "1" || echo "0") \ - --with-csharp=$([ "$ARROW_INTEGRATION_CSHARP" == "ON" ] && echo "1" || echo "0") \ - --with-js=$([ "$ARROW_INTEGRATION_JS" == "ON" ] && echo "1" || echo "0") \ - --gold-dirs=$gold_dir/0.14.1 \ - --gold-dirs=$gold_dir/0.17.1 \ - --gold-dirs=$gold_dir/1.0.0-bigendian \ - --gold-dirs=$gold_dir/1.0.0-littleendian \ - --gold-dirs=$gold_dir/2.0.0-compression \ - --gold-dirs=$gold_dir/4.0.0-shareddict \ + --with-cpp="${ARCHERY_WITH_CPP}" \ + --gold-dirs="$gold_dir/0.14.1" \ + --gold-dirs="$gold_dir/0.17.1" \ + --gold-dirs="$gold_dir/1.0.0-bigendian" \ + --gold-dirs="$gold_dir/1.0.0-littleendian" \ + --gold-dirs="$gold_dir/2.0.0-compression" \ + --gold-dirs="$gold_dir/4.0.0-shareddict" \ diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index 1c7e65cf27f..61ad0ea59e4 100755 --- a/ci/scripts/integration_arrow_build.sh +++ 
b/ci/scripts/integration_arrow_build.sh @@ -22,35 +22,34 @@ set -e arrow_dir=${1} build_dir=${2} -: ${ARROW_INTEGRATION_CPP:=ON} -: ${ARROW_INTEGRATION_CSHARP:=ON} -: ${ARROW_INTEGRATION_JS:=ON} +: "${ARROW_INTEGRATION_CPP:=ON}" -. ${arrow_dir}/ci/scripts/util_log.sh +. "${arrow_dir}/ci/scripts/util_log.sh" github_actions_group_begin "Integration: Build: Rust" -${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir} +"${arrow_dir}/ci/scripts/rust_build.sh" "${arrow_dir}" "${build_dir}" github_actions_group_end github_actions_group_begin "Integration: Build: nanoarrow" -${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir} +"${arrow_dir}/ci/scripts/nanoarrow_build.sh" "${arrow_dir}" "${build_dir}" github_actions_group_end github_actions_group_begin "Integration: Build: Go" if [ "${ARCHERY_INTEGRATION_WITH_GO}" -gt "0" ]; then - ${arrow_dir}/go/ci/scripts/build.sh ${arrow_dir}/go + "${arrow_dir}/go/ci/scripts/build.sh" "${arrow_dir}/go" fi github_actions_group_end github_actions_group_begin "Integration: Build: C++" if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then - ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir} + "${arrow_dir}/ci/scripts/cpp_build.sh" "${arrow_dir}" "${build_dir}" fi github_actions_group_end -github_actions_group_begin "Integration: Build: C#" -if [ "${ARROW_INTEGRATION_CSHARP}" == "ON" ]; then - ${arrow_dir}/ci/scripts/csharp_build.sh ${arrow_dir} ${build_dir} +github_actions_group_begin "Integration: Build: .NET" +if [ "${ARCHERY_INTEGRATION_WITH_DOTNET}" -gt "0" ]; then + "${arrow_dir}/dotnet/ci/scripts/build.sh" "${arrow_dir}/dotnet" + cp -a "${arrow_dir}/dotnet" "${build_dir}/dotnet" fi github_actions_group_end @@ -59,13 +58,14 @@ if [ "${ARCHERY_INTEGRATION_WITH_JAVA}" -gt "0" ]; then export ARROW_JAVA_CDATA="ON" export JAVA_JNI_CMAKE_ARGS="-DARROW_JAVA_JNI_ENABLE_DEFAULT=OFF -DARROW_JAVA_JNI_ENABLE_C=ON" - ${arrow_dir}/java/ci/scripts/jni_build.sh "${arrow_dir}/java" "${ARROW_HOME}" "${build_dir}/java/" 
/tmp/dist/java - ${arrow_dir}/java/ci/scripts/build.sh "${arrow_dir}/java" "${build_dir}/java" /tmp/dist/java + "${arrow_dir}/java/ci/scripts/jni_build.sh" "${arrow_dir}/java" "${ARROW_HOME}" "${build_dir}/java/" /tmp/dist/java + "${arrow_dir}/java/ci/scripts/build.sh" "${arrow_dir}/java" "${build_dir}/java" /tmp/dist/java fi github_actions_group_end github_actions_group_begin "Integration: Build: JavaScript" -if [ "${ARROW_INTEGRATION_JS}" == "ON" ]; then - ${arrow_dir}/ci/scripts/js_build.sh ${arrow_dir} ${build_dir} +if [ "${ARCHERY_INTEGRATION_WITH_JS}" -gt "0" ]; then + "${arrow_dir}/js/ci/scripts/build.sh" "${arrow_dir}/js" + cp -a "${arrow_dir}/js" "${build_dir}/js" fi github_actions_group_end diff --git a/ci/scripts/integration_hdfs.sh b/ci/scripts/integration_hdfs.sh index d0444ccb74f..bb3a9b51401 100755 --- a/ci/scripts/integration_hdfs.sh +++ b/ci/scripts/integration_hdfs.sh @@ -19,10 +19,12 @@ set -e +# shellcheck disable=SC2034 source_dir=${1}/cpp build_dir=${2}/cpp -export CLASSPATH=$($HADOOP_HOME/bin/hadoop classpath --glob) +HADOOP_CLASSPATH=$("$HADOOP_HOME/bin/hadoop" classpath --glob) +export CLASSPATH="${HADOOP_CLASSPATH}" export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop export LIBHDFS3_CONF=$HADOOP_CONF_DIR/hdfs-site.xml export ARROW_LIBHDFS3_DIR=$CONDA_PREFIX/lib @@ -42,7 +44,7 @@ function use_libhdfs_dir() { # execute cpp tests export ARROW_HDFS_TEST_LIBHDFS_REQUIRE=ON -pushd ${build_dir} +pushd "${build_dir}" debug/arrow-io-hdfs-test debug/arrow-hdfs-test diff --git a/ci/scripts/integration_skyhook.sh b/ci/scripts/integration_skyhook.sh deleted file mode 100755 index 6c3011f9c63..00000000000 --- a/ci/scripts/integration_skyhook.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This script spawns a single-node Ceph cluster, creates a CephFS mount, -# generates a Parquet dataset, and runs the SkyhookDM integration tests. -# Taken from https://github.com/ceph/go-ceph/blob/master/micro-osd.sh - -set -e -set -x -set -u - -if [ "${ARROW_SKYHOOK:-OFF}" != "ON" ]; then - exit 0 -fi - -ARROW_BUILD_DIR=${1}/cpp -DIR=/tmp/integration_skyhook - -# set environment variables -pkill ceph || true -rm -rf ${DIR}/* -LOG_DIR=${DIR}/log -MON_DATA=${DIR}/mon -MDS_DATA=${DIR}/mds -MOUNTPT=${MDS_DATA}/mnt -OSD_DATA=${DIR}/osd -mkdir -p ${LOG_DIR} ${MON_DATA} ${OSD_DATA} ${MDS_DATA} ${MOUNTPT} -MDS_NAME="Z" -MON_NAME="a" -MGR_NAME="x" -MIRROR_ID="m" - -# cluster wide parameters -cat >> ${DIR}/ceph.conf < ${MDS_DATA}/keyring -ceph osd pool create cephfs_data 8 -ceph osd pool create cephfs_metadata 8 -ceph fs new cephfs cephfs_metadata cephfs_data -ceph fs ls -ceph-mds -i ${MDS_NAME} -ceph status -while [[ ! 
$(ceph mds stat | grep "up:active") ]]; do sleep 1; done - -# start a manager -ceph-mgr --id ${MGR_NAME} - -# test the setup -ceph --version -ceph status - -apt update -apt install -y python3-pip - -pushd ${ARROW_BUILD_DIR} - # create the rados-classes, if not there already - mkdir -p /usr/lib/x86_64-linux-gnu/rados-classes/ - cp debug/libcls_skyhook* /usr/lib/x86_64-linux-gnu/rados-classes/ - - # mount a ceph filesystem to /mnt/cephfs in the user-space using ceph-fuse - mkdir -p /mnt/cephfs - ceph-fuse /mnt/cephfs - sleep 5 - - # download an example dataset and copy into the mounted dir - pip3 install pyarrow pandas - python3 /arrow/ci/scripts/generate_dataset.py - cp -r nyc /mnt/cephfs/ - sleep 10 - - # run the tests - SKYHOOK_CLS_TEST=debug/skyhook-cls-test - if [ -f "$SKYHOOK_CLS_TEST" ]; then - debug/skyhook-cls-test - fi - - SKYHOOK_PROTOCOL_TEST=debug/skyhook-protocol-test - if [ -f "$SKYHOOK_PROTOCOL_TEST" ]; then - debug/skyhook-protocol-test - fi -popd diff --git a/ci/scripts/integration_spark.sh b/ci/scripts/integration_spark.sh index f7ef87a8b8f..b6b45a796ab 100755 --- a/ci/scripts/integration_spark.sh +++ b/ci/scripts/integration_spark.sh @@ -18,16 +18,19 @@ # exit on any error set -eu -source_dir=${1} -spark_dir=${2} +if [ "$#" -lt 2 ]; then + echo "Usage: $0 " + exit 1 +fi # Spark branch to checkout -spark_version=${SPARK_VERSION:-master} +spark_version=${1} +spark_dir=${2} # Use old behavior that always dropped timezones. export PYARROW_IGNORE_TIMEZONE=1 -if [ "${SPARK_VERSION:1:2}" == "2." ]; then +if [ "${spark_version:1:2}" == "2." 
]; then # https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x export ARROW_PRE_0_15_IPC_FORMAT=1 fi @@ -35,8 +38,8 @@ fi export MAVEN_OPTS="-Xss256m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=warn" export MAVEN_OPTS="${MAVEN_OPTS} -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" -pushd ${spark_dir} - echo "Building Spark ${SPARK_VERSION}" +pushd "${spark_dir}" + echo "Building Spark ${spark_version}" # Build Spark only build/mvn -B -DskipTests package @@ -50,7 +53,7 @@ pushd ${spark_dir} "pyspark.sql.tests.arrow.test_arrow_map" "pyspark.sql.tests.arrow.test_arrow_python_udf") - case "${SPARK_VERSION}" in + case "${spark_version}" in v1.*|v2.*|v3.0.*|v3.1.*|v3.2.*|v3.3.*) old_test_modules=true ;; diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh deleted file mode 100755 index 212ec6eb114..00000000000 --- a/ci/scripts/java_build.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -ex - -if [[ "${ARROW_JAVA_BUILD:-ON}" != "ON" ]]; then - exit -fi - -arrow_dir=${1} -source_dir=${1}/java -build_dir=${2} -java_jni_dist_dir=${3} - -: ${BUILD_DOCS_JAVA:=OFF} - -if [[ "$(uname -s)" == "Linux" ]] && [[ "$(uname -m)" == "s390x" ]]; then - # Since some files for s390_64 are not available at maven central, - # download pre-build files from Artifactory and install them explicitly - mvn_install="mvn clean install:install-file" - wget="wget" - artifactory_base_url="https://apache.jfrog.io/artifactory/arrow" - - artifactory_dir="protoc-binary" - group="com.google.protobuf" - artifact="protoc" - ver="21.2" - classifier="linux-s390_64" - extension="exe" - # target=${artifact}-${ver}-${classifier}.${extension} - target=${artifact} - ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/${target} - ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} - # protoc requires libprotoc.so.* libprotobuf.so.* - libver="32" - ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/libprotoc.so.${libver} - ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/libprotobuf.so.${libver} - mkdir -p ${ARROW_HOME}/lib - cp lib*.so.${libver} ${ARROW_HOME}/lib - export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${ARROW_HOME}/lib - - artifactory_dir="protoc-gen-grpc-java-binary" - group="io.grpc" - artifact="protoc-gen-grpc-java" - ver="1.47.0" - classifier="linux-s390_64" - extension="exe" - # target=${artifact}-${ver}-${classifier}.${extension} - target=${artifact} - ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/${target} - ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} -fi - -mvn="mvn -B -DskipTests -Drat.skip=true -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" - -if [ $ARROW_JAVA_SKIP_GIT_PLUGIN ]; then - 
mvn="${mvn} -Dmaven.gitcommitid.skip=true" -fi - -# https://github.com/apache/arrow/issues/41429 -# TODO: We want to out-of-source build. This is a workaround. We copy -# all needed files to the build directory from the source directory -# and build in the build directory. -mkdir -p ${build_dir} -rm -rf ${build_dir}/format -cp -aL ${arrow_dir}/format ${build_dir}/ -rm -rf ${build_dir}/java -cp -aL ${source_dir} ${build_dir}/ -pushd ${build_dir}/java - -if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" == "ON" ]; then - mvn="${mvn} -Pshade-flatbuffers" -fi - -if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then - mvn="${mvn} -Darrow.c.jni.dist.dir=${java_jni_dist_dir} -Parrow-c-data" -fi - -if [ "${ARROW_JAVA_JNI}" = "ON" ]; then - mvn="${mvn} -Darrow.cpp.build.dir=${java_jni_dist_dir} -Parrow-jni" -fi - -# Use `2 * ncores` threads -${mvn} -T 2C clean install - -if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then - # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 - # GH-43378: Maven site plugins not compatible with multithreading - mkdir -p ${build_dir}/docs/java/reference - ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site - rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference -fi - -popd diff --git a/ci/scripts/java_cdata_integration.sh b/ci/scripts/java_cdata_integration.sh deleted file mode 100755 index 0ee5d3026aa..00000000000 --- a/ci/scripts/java_cdata_integration.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -arrow_dir=${1} -build_dir=${2} - -pushd ${build_dir}/java/c/src/test/python - -python integration_tests.py - -popd diff --git a/ci/scripts/java_full_build.sh b/ci/scripts/java_full_build.sh deleted file mode 100755 index 4beade50b45..00000000000 --- a/ci/scripts/java_full_build.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -e - -arrow_dir=${1} -dist_dir=${2} - -export ARROW_TEST_DATA=${arrow_dir}/testing/data - -pushd ${arrow_dir}/java - -# Ensure that there is no old jar -# inside the maven repository -maven_repo=~/.m2/repository/org/apache/arrow -if [ -d $maven_repo ]; then - find $maven_repo \ - "(" -name "*.jar" -o -name "*.zip" -o -name "*.pom" ")" \ - -exec echo {} ";" \ - -exec rm -rf {} ";" -fi - -# generate dummy GPG key for -Papache-release. -# -Papache-release generates signs (*.asc) of artifacts. -# We don't use these signs in our release process. -(echo "Key-Type: RSA"; \ - echo "Key-Length: 4096"; \ - echo "Name-Real: Build"; \ - echo "Name-Email: build@example.com"; \ - echo "%no-protection") | \ - gpg --full-generate-key --batch - -# build the entire project -mvn clean \ - install \ - -Papache-release \ - -Parrow-c-data \ - -Parrow-jni \ - -Darrow.cpp.build.dir=$dist_dir \ - -Darrow.c.jni.dist.dir=$dist_dir \ - --no-transfer-progress - -# copy all jar, zip and pom files to the distribution folder -find ~/.m2/repository/org/apache/arrow \ - "(" \ - -name "*.jar" -o \ - -name "*.json" -o \ - -name "*.pom" -o \ - -name "*.xml" -o \ - -name "*.zip" \ - ")" \ - -exec echo {} ";" \ - -exec cp {} $dist_dir ";" - -popd diff --git a/ci/scripts/java_jni_build.sh b/ci/scripts/java_jni_build.sh deleted file mode 100755 index d989351ab7e..00000000000 --- a/ci/scripts/java_jni_build.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -arrow_dir=${1} -arrow_install_dir=${2} -build_dir=${3}/java_jni -# The directory where the final binaries will be stored when scripts finish -dist_dir=${4} -prefix_dir="${build_dir}/java-jni" - -echo "=== Clear output directories and leftovers ===" -# Clear output directories and leftovers -rm -rf ${build_dir} - -echo "=== Building Arrow Java C Data Interface native library ===" -mkdir -p "${build_dir}" -pushd "${build_dir}" - -case "$(uname)" in - Linux) - n_jobs=$(nproc) - ;; - Darwin) - n_jobs=$(sysctl -n hw.ncpu) - ;; - *) - n_jobs=${NPROC:-1} - ;; -esac - -: ${ARROW_JAVA_BUILD_TESTS:=${ARROW_BUILD_TESTS:-OFF}} -: ${CMAKE_BUILD_TYPE:=release} -cmake \ - -DARROW_JAVA_JNI_ENABLE_DATASET=${ARROW_DATASET:-OFF} \ - -DARROW_JAVA_JNI_ENABLE_GANDIVA=${ARROW_GANDIVA:-OFF} \ - -DARROW_JAVA_JNI_ENABLE_ORC=${ARROW_ORC:-OFF} \ - -DBUILD_TESTING=${ARROW_JAVA_BUILD_TESTS} \ - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ - -DCMAKE_PREFIX_PATH=${arrow_install_dir} \ - -DCMAKE_INSTALL_PREFIX=${prefix_dir} \ - -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ - -DProtobuf_USE_STATIC_LIBS=ON \ - -GNinja \ - ${JAVA_JNI_CMAKE_ARGS:-} \ - ${arrow_dir}/java -export CMAKE_BUILD_PARALLEL_LEVEL=${n_jobs} -cmake --build . --config ${CMAKE_BUILD_TYPE} -if [ "${ARROW_JAVA_BUILD_TESTS}" = "ON" ]; then - ctest \ - --output-on-failure \ - --parallel ${n_jobs} \ - --timeout 300 -fi -cmake --build . --config ${CMAKE_BUILD_TYPE} --target install -popd - -mkdir -p ${dist_dir} -# For Windows. *.dll are installed into bin/ on Windows. 
-if [ -d "${prefix_dir}/bin" ]; then - mv ${prefix_dir}/bin/* ${dist_dir}/ -else - mv ${prefix_dir}/lib/* ${dist_dir}/ -fi diff --git a/ci/scripts/java_jni_macos_build.sh b/ci/scripts/java_jni_macos_build.sh deleted file mode 100755 index 4ecc029bdd3..00000000000 --- a/ci/scripts/java_jni_macos_build.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -ex - -arrow_dir=${1} -build_dir=${2} -normalized_arch=$(arch) -case ${normalized_arch} in - arm64) - normalized_arch=aarch_64 - ;; - i386) - normalized_arch=x86_64 - ;; -esac -# The directory where the final binaries will be stored when scripts finish -dist_dir=${3} - -echo "=== Clear output directories and leftovers ===" -# Clear output directories and leftovers -rm -rf ${build_dir} - -echo "=== Building Arrow C++ libraries ===" -install_dir=${build_dir}/cpp-install -: ${ARROW_ACERO:=ON} -export ARROW_ACERO -: ${ARROW_BUILD_TESTS:=ON} -: ${ARROW_DATASET:=ON} -export ARROW_DATASET -: ${ARROW_GANDIVA:=ON} -export ARROW_GANDIVA -: ${ARROW_ORC:=ON} -export ARROW_ORC -: ${ARROW_PARQUET:=ON} -: ${ARROW_S3:=ON} -: ${ARROW_USE_CCACHE:=OFF} -: ${CMAKE_BUILD_TYPE:=Release} -: ${CMAKE_UNITY_BUILD:=ON} - -if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo "=== ccache statistics before build ===" - ccache -sv 2>/dev/null || ccache -s -fi - -export ARROW_TEST_DATA="${arrow_dir}/testing/data" -export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" -export AWS_EC2_METADATA_DISABLED=TRUE - -mkdir -p "${build_dir}/cpp" -pushd "${build_dir}/cpp" - -cmake \ - -DARROW_ACERO=${ARROW_ACERO} \ - -DARROW_BUILD_SHARED=OFF \ - -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS} \ - -DARROW_CSV=${ARROW_DATASET} \ - -DARROW_DATASET=${ARROW_DATASET} \ - -DARROW_SUBSTRAIT=${ARROW_DATASET} \ - -DARROW_DEPENDENCY_USE_SHARED=OFF \ - -DARROW_GANDIVA=${ARROW_GANDIVA} \ - -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ - -DARROW_JSON=${ARROW_DATASET} \ - -DARROW_ORC=${ARROW_ORC} \ - -DARROW_PARQUET=${ARROW_PARQUET} \ - -DARROW_S3=${ARROW_S3} \ - -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \ - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ - -DCMAKE_INSTALL_PREFIX=${install_dir} \ - -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DGTest_SOURCE=BUNDLED \ - -DPARQUET_BUILD_EXAMPLES=OFF \ - -DPARQUET_BUILD_EXECUTABLES=OFF \ - -DPARQUET_REQUIRE_ENCRYPTION=OFF \ - -Dre2_SOURCE=BUNDLED \ - -GNinja \ - 
${arrow_dir}/cpp -cmake --build . --target install - -if [ "${ARROW_BUILD_TESTS}" == "ON" ]; then - # MinIO is required - exclude_tests="arrow-s3fs-test" - # unstable - exclude_tests="${exclude_tests}|arrow-acero-asof-join-node-test" - exclude_tests="${exclude_tests}|arrow-acero-hash-join-node-test" - ctest \ - --exclude-regex "${exclude_tests}" \ - --label-regex unittest \ - --output-on-failure \ - --parallel $(sysctl -n hw.ncpu) \ - --timeout 300 -fi - -popd - -export JAVA_JNI_CMAKE_ARGS="-DProtobuf_ROOT=${build_dir}/cpp/protobuf_ep-install" -${arrow_dir}/ci/scripts/java_jni_build.sh \ - ${arrow_dir} \ - ${install_dir} \ - ${build_dir} \ - ${dist_dir} - -if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo "=== ccache statistics after build ===" - ccache -sv 2>/dev/null || ccache -s -fi - - -echo "=== Checking shared dependencies for libraries ===" -pushd ${dist_dir} -archery linking check-dependencies \ - --allow CoreFoundation \ - --allow Security \ - --allow libSystem \ - --allow libarrow_cdata_jni \ - --allow libarrow_dataset_jni \ - --allow libarrow_orc_jni \ - --allow libc++ \ - --allow libcurl \ - --allow libgandiva_jni \ - --allow libncurses \ - --allow libobjc \ - --allow libz \ - arrow_cdata_jni/${normalized_arch}/libarrow_cdata_jni.dylib \ - arrow_dataset_jni/${normalized_arch}/libarrow_dataset_jni.dylib \ - arrow_orc_jni/${normalized_arch}/libarrow_orc_jni.dylib \ - gandiva_jni/${normalized_arch}/libgandiva_jni.dylib -popd diff --git a/ci/scripts/java_jni_manylinux_build.sh b/ci/scripts/java_jni_manylinux_build.sh deleted file mode 100755 index 6f3769751af..00000000000 --- a/ci/scripts/java_jni_manylinux_build.sh +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -arrow_dir=${1} -build_dir=${2} -normalized_arch=$(arch) -case ${normalized_arch} in - aarch64) - normalized_arch=aarch_64 - ;; -esac -# The directory where the final binaries will be stored when scripts finish -dist_dir=${3} - -echo "=== Clear output directories and leftovers ===" -# Clear output directories and leftovers -rm -rf ${build_dir} - -echo "=== Building Arrow C++ libraries ===" -devtoolset_version=$(rpm -qa "devtoolset-*-gcc" --queryformat %{VERSION} | \ - grep -o "^[0-9]*") -devtoolset_include_cpp="/opt/rh/devtoolset-${devtoolset_version}/root/usr/include/c++/${devtoolset_version}" -: ${ARROW_ACERO:=ON} -export ARROW_ACERO -: ${ARROW_BUILD_TESTS:=ON} -: ${ARROW_DATASET:=ON} -export ARROW_DATASET -: ${ARROW_GANDIVA:=ON} -export ARROW_GANDIVA -: ${ARROW_GCS:=ON} -: ${ARROW_JEMALLOC:=ON} -: ${ARROW_RPATH_ORIGIN:=ON} -: ${ARROW_ORC:=ON} -export ARROW_ORC -: ${ARROW_PARQUET:=ON} -: ${ARROW_S3:=ON} -: ${ARROW_USE_CCACHE:=OFF} -: ${CMAKE_BUILD_TYPE:=release} -: ${CMAKE_UNITY_BUILD:=ON} -: ${VCPKG_ROOT:=/opt/vcpkg} -: ${VCPKG_FEATURE_FLAGS:=-manifests} -: ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}} -: ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-lpthread} - -if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo "=== ccache statistics before build ===" - ccache -sv 
2>/dev/null || ccache -s -fi - -export ARROW_TEST_DATA="${arrow_dir}/testing/data" -export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" -export AWS_EC2_METADATA_DISABLED=TRUE - -mkdir -p "${build_dir}/cpp" -pushd "${build_dir}/cpp" - -cmake \ - -DARROW_ACERO=${ARROW_ACERO} \ - -DARROW_BUILD_SHARED=OFF \ - -DARROW_BUILD_TESTS=ON \ - -DARROW_CSV=${ARROW_DATASET} \ - -DARROW_DATASET=${ARROW_DATASET} \ - -DARROW_SUBSTRAIT=${ARROW_DATASET} \ - -DARROW_DEPENDENCY_SOURCE="VCPKG" \ - -DARROW_DEPENDENCY_USE_SHARED=OFF \ - -DARROW_GANDIVA_PC_CXX_FLAGS=${GANDIVA_CXX_FLAGS} \ - -DARROW_GANDIVA=${ARROW_GANDIVA} \ - -DARROW_GCS=${ARROW_GCS} \ - -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ - -DARROW_ORC=${ARROW_ORC} \ - -DARROW_PARQUET=${ARROW_PARQUET} \ - -DARROW_RPATH_ORIGIN=${ARROW_RPATH_ORIGIN} \ - -DARROW_S3=${ARROW_S3} \ - -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \ - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ - -DCMAKE_INSTALL_PREFIX=${ARROW_HOME} \ - -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DGTest_SOURCE=BUNDLED \ - -DORC_SOURCE=BUNDLED \ - -DORC_PROTOBUF_EXECUTABLE=${VCPKG_ROOT}/installed/${VCPKG_TARGET_TRIPLET}/tools/protobuf/protoc \ - -DPARQUET_BUILD_EXAMPLES=OFF \ - -DPARQUET_BUILD_EXECUTABLES=OFF \ - -DPARQUET_REQUIRE_ENCRYPTION=OFF \ - -DVCPKG_MANIFEST_MODE=OFF \ - -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ - -GNinja \ - ${arrow_dir}/cpp -ninja install - -if [ "${ARROW_BUILD_TESTS}" = "ON" ]; then - # MinIO is required - exclude_tests="arrow-s3fs-test" - case $(arch) in - aarch64) - # GCS testbench is crashed on aarch64: - # ImportError: ../grpc/_cython/cygrpc.cpython-38-aarch64-linux-gnu.so: - # undefined symbol: vtable for std::__cxx11::basic_ostringstream< - # char, std::char_traits, std::allocator > - exclude_tests="${exclude_tests}|arrow-gcsfs-test" - ;; - esac - # unstable - exclude_tests="${exclude_tests}|arrow-acero-asof-join-node-test" - exclude_tests="${exclude_tests}|arrow-acero-hash-join-node-test" - # strptime - 
exclude_tests="${exclude_tests}|arrow-utility-test" - ctest \ - --exclude-regex "${exclude_tests}" \ - --label-regex unittest \ - --output-on-failure \ - --parallel $(nproc) \ - --timeout 300 -fi - -popd - - -JAVA_JNI_CMAKE_ARGS="" -JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" -JAVA_JNI_CMAKE_ARGS="${JAVA_JNI_CMAKE_ARGS} -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET}" -export JAVA_JNI_CMAKE_ARGS -${arrow_dir}/ci/scripts/java_jni_build.sh \ - ${arrow_dir} \ - ${ARROW_HOME} \ - ${build_dir} \ - ${dist_dir} - -if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo "=== ccache statistics after build ===" - ccache -sv 2>/dev/null || ccache -s -fi - - -echo "=== Checking shared dependencies for libraries ===" -pushd ${dist_dir} -archery linking check-dependencies \ - --allow ld-linux-aarch64 \ - --allow ld-linux-x86-64 \ - --allow libc \ - --allow libdl \ - --allow libgcc_s \ - --allow libm \ - --allow libpthread \ - --allow librt \ - --allow libstdc++ \ - --allow libz \ - --allow linux-vdso \ - arrow_cdata_jni/${normalized_arch}/libarrow_cdata_jni.so \ - arrow_dataset_jni/${normalized_arch}/libarrow_dataset_jni.so \ - arrow_orc_jni/${normalized_arch}/libarrow_orc_jni.so \ - gandiva_jni/${normalized_arch}/libgandiva_jni.so -popd diff --git a/ci/scripts/java_jni_windows_build.sh b/ci/scripts/java_jni_windows_build.sh deleted file mode 100755 index 39288f4a9d0..00000000000 --- a/ci/scripts/java_jni_windows_build.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -arrow_dir=${1} -build_dir=${2} -# The directory where the final binaries will be stored when scripts finish -dist_dir=${3} - -echo "=== Clear output directories and leftovers ===" -# Clear output directories and leftovers -rm -rf ${build_dir} - -echo "=== Building Arrow C++ libraries ===" -install_dir=${build_dir}/cpp-install -: ${ARROW_ACERO:=ON} -export ARROW_ACERO -: ${ARROW_BUILD_TESTS:=ON} -: ${ARROW_DATASET:=ON} -export ARROW_DATASET -: ${ARROW_ORC:=ON} -export ARROW_ORC -: ${ARROW_PARQUET:=ON} -: ${ARROW_S3:=ON} -: ${ARROW_USE_CCACHE:=OFF} -: ${CMAKE_BUILD_TYPE:=release} -: ${CMAKE_UNITY_BUILD:=ON} - -if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo "=== ccache statistics before build ===" - ccache -sv 2>/dev/null || ccache -s -fi - -export ARROW_TEST_DATA="${arrow_dir}/testing/data" -export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" -export AWS_EC2_METADATA_DISABLED=TRUE - -mkdir -p "${build_dir}/cpp" -pushd "${build_dir}/cpp" - -cmake \ - -DARROW_ACERO=${ARROW_ACERO} \ - -DARROW_BUILD_SHARED=OFF \ - -DARROW_BUILD_TESTS=ON \ - -DARROW_CSV=${ARROW_DATASET} \ - -DARROW_DATASET=${ARROW_DATASET} \ - -DARROW_SUBSTRAIT=${ARROW_DATASET} \ - -DARROW_DEPENDENCY_USE_SHARED=OFF \ - -DARROW_ORC=${ARROW_ORC} \ - -DARROW_PARQUET=${ARROW_PARQUET} \ - -DARROW_S3=${ARROW_S3} \ - -DARROW_USE_CCACHE=${ARROW_USE_CCACHE} \ - -DARROW_WITH_BROTLI=ON \ - -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_ZSTD=ON \ - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ - -DCMAKE_INSTALL_PREFIX=${install_dir} \ - 
-DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -GNinja \ - ${arrow_dir}/cpp -ninja install - -if [ "${ARROW_BUILD_TESTS}" = "ON" ]; then - # MinIO is required - exclude_tests="arrow-s3fs-test" - # unstable - exclude_tests="${exclude_tests}|arrow-compute-hash-join-node-test" - exclude_tests="${exclude_tests}|arrow-dataset-scanner-test" - # strptime - exclude_tests="${exclude_tests}|arrow-utility-test" - ctest \ - --exclude-regex "${exclude_tests}" \ - --label-regex unittest \ - --output-on-failure \ - --parallel $(nproc) \ - --timeout 300 -fi - -popd - - -${arrow_dir}/ci/scripts/java_jni_build.sh \ - ${arrow_dir} \ - ${install_dir} \ - ${build_dir} \ - ${dist_dir} - -if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo "=== ccache statistics after build ===" - ccache -sv 2>/dev/null || ccache -s -fi - - -echo "=== Checking shared dependencies for libraries ===" -pushd ${dist_dir} -# TODO -# archery linking check-dependencies \ -# --allow libm \ -# --allow librt \ -# --allow libz \ -# libarrow_cdata_jni.dll \ -# libarrow_dataset_jni.dll \ -popd diff --git a/ci/scripts/java_test.sh b/ci/scripts/java_test.sh deleted file mode 100755 index 5efda4318f1..00000000000 --- a/ci/scripts/java_test.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -if [[ "${ARROW_JAVA_TEST:-ON}" != "ON" ]]; then - exit -fi - -arrow_dir=${1} -source_dir=${1}/java -java_jni_dist_dir=${3} - -# For JNI -export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} - -mvn="mvn -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" -# Use `2 * ncores` threads -mvn="${mvn} -T 2C" - -pushd ${source_dir} - -${mvn} clean test - -projects=() -if [ "${ARROW_JAVA_JNI}" = "ON" ]; then - projects+=(adapter/orc) - projects+=(dataset) - projects+=(gandiva) -fi -if [ "${#projects[@]}" -gt 0 ]; then - ${mvn} clean test \ - -Parrow-jni \ - -pl $(IFS=,; echo "${projects[*]}") \ - -Darrow.cpp.build.dir=${java_jni_dist_dir} -fi - -if [ "${ARROW_JAVA_CDATA}" = "ON" ]; then - ${mvn} clean test -Parrow-c-data -pl c -Darrow.c.jni.dist.dir=${java_jni_dist_dir} -fi - -popd diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh deleted file mode 100755 index 196539ee0f1..00000000000 --- a/ci/scripts/js_build.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -ex - -arrow_dir=${1} -source_dir=${arrow_dir}/js -build_dir=${2} - -: ${BUILD_DOCS_JS:=OFF} - -# https://github.com/apache/arrow/issues/41429 -# TODO: We want to out-of-source build. This is a workaround. We copy -# all needed files to the build directory from the source directory -# and build in the build directory. -rm -rf ${build_dir}/js -mkdir -p ${build_dir} -cp -aL ${arrow_dir}/LICENSE.txt ${build_dir}/ -cp -aL ${arrow_dir}/NOTICE.txt ${build_dir}/ -cp -aL ${source_dir} ${build_dir}/js -pushd ${build_dir}/js - -yarn --immutable -yarn lint:ci -yarn build - -if [ "${BUILD_DOCS_JS}" == "ON" ]; then - # If apache or upstream are defined use those as remote. - # Otherwise use origin which could be a fork on PRs. - if [ "$(git -C ${arrow_dir} config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then - yarn doc --gitRemote apache - elif [[ "$(git -C ${arrow_dir}config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then - yarn doc --gitRemote upstream - elif [[ "$(basename -s .git $(git -C ${arrow_dir} config --get remote.origin.url))" == "arrow" ]]; then - yarn doc - else - echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." - exit 0 - fi - mkdir -p ${build_dir}/docs/js - rsync -a doc/ ${build_dir}/docs/js -fi - -popd diff --git a/ci/scripts/js_test.sh b/ci/scripts/js_test.sh deleted file mode 100755 index 863b1c3d346..00000000000 --- a/ci/scripts/js_test.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -source_dir=${1}/js -build_dir=${2}/js - -pushd ${build_dir} - -yarn lint -yarn test -yarn test:bundle - -popd diff --git a/ci/scripts/matlab_build.sh b/ci/scripts/matlab_build.sh index d3f86adbb8a..0548cd08726 100755 --- a/ci/scripts/matlab_build.sh +++ b/ci/scripts/matlab_build.sh @@ -26,9 +26,9 @@ build_dir=${base_dir}/matlab/build install_dir=${base_dir}/matlab/install cmake \ - -S ${source_dir} \ - -B ${build_dir} \ + -S "${source_dir}" \ + -B "${build_dir}" \ -G Ninja \ - -D CMAKE_INSTALL_PREFIX=${install_dir} \ + -D CMAKE_INSTALL_PREFIX="${install_dir}" \ -D MATLAB_ADD_INSTALL_DIR_TO_SEARCH_PATH=OFF -cmake --build ${build_dir} --config Release --target install +cmake --build "${build_dir}" --config Release --target install diff --git a/ci/scripts/msys2_setup.sh b/ci/scripts/msys2_setup.sh index e5b08424022..b4634070a87 100755 --- a/ci/scripts/msys2_setup.sh +++ b/ci/scripts/msys2_setup.sh @@ -24,33 +24,34 @@ target=$1 packages=() case "${target}" in cpp|c_glib|ruby) - packages+=(${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp) - packages+=(${MINGW_PACKAGE_PREFIX}-boost) - packages+=(${MINGW_PACKAGE_PREFIX}-brotli) - packages+=(${MINGW_PACKAGE_PREFIX}-bzip2) - packages+=(${MINGW_PACKAGE_PREFIX}-c-ares) - packages+=(${MINGW_PACKAGE_PREFIX}-cc) - packages+=(${MINGW_PACKAGE_PREFIX}-ccache) - packages+=(${MINGW_PACKAGE_PREFIX}-clang) - packages+=(${MINGW_PACKAGE_PREFIX}-cmake) - 
packages+=(${MINGW_PACKAGE_PREFIX}-double-conversion) - packages+=(${MINGW_PACKAGE_PREFIX}-gflags) - packages+=(${MINGW_PACKAGE_PREFIX}-grpc) - packages+=(${MINGW_PACKAGE_PREFIX}-gtest) - packages+=(${MINGW_PACKAGE_PREFIX}-libutf8proc) - packages+=(${MINGW_PACKAGE_PREFIX}-libxml2) - packages+=(${MINGW_PACKAGE_PREFIX}-lz4) - packages+=(${MINGW_PACKAGE_PREFIX}-ninja) - packages+=(${MINGW_PACKAGE_PREFIX}-nlohmann-json) - packages+=(${MINGW_PACKAGE_PREFIX}-protobuf) - packages+=(${MINGW_PACKAGE_PREFIX}-rapidjson) - packages+=(${MINGW_PACKAGE_PREFIX}-re2) - packages+=(${MINGW_PACKAGE_PREFIX}-snappy) - packages+=(${MINGW_PACKAGE_PREFIX}-sqlite3) - packages+=(${MINGW_PACKAGE_PREFIX}-thrift) - packages+=(${MINGW_PACKAGE_PREFIX}-xsimd) - packages+=(${MINGW_PACKAGE_PREFIX}-uriparser) - packages+=(${MINGW_PACKAGE_PREFIX}-zstd) + packages+=("${MINGW_PACKAGE_PREFIX}-aws-sdk-cpp") + packages+=("${MINGW_PACKAGE_PREFIX}-boost") + packages+=("${MINGW_PACKAGE_PREFIX}-brotli") + packages+=("${MINGW_PACKAGE_PREFIX}-bzip2") + packages+=("${MINGW_PACKAGE_PREFIX}-c-ares") + packages+=("${MINGW_PACKAGE_PREFIX}-ccache") + packages+=("${MINGW_PACKAGE_PREFIX}-clang") + packages+=("${MINGW_PACKAGE_PREFIX}-cmake") + packages+=("${MINGW_PACKAGE_PREFIX}-double-conversion") + packages+=("${MINGW_PACKAGE_PREFIX}-gflags") + packages+=("${MINGW_PACKAGE_PREFIX}-grpc") + packages+=("${MINGW_PACKAGE_PREFIX}-gtest") + packages+=("${MINGW_PACKAGE_PREFIX}-libutf8proc") + packages+=("${MINGW_PACKAGE_PREFIX}-libxml2") + packages+=("${MINGW_PACKAGE_PREFIX}-llvm") + packages+=("${MINGW_PACKAGE_PREFIX}-lz4") + packages+=("${MINGW_PACKAGE_PREFIX}-ninja") + packages+=("${MINGW_PACKAGE_PREFIX}-nlohmann-json") + packages+=("${MINGW_PACKAGE_PREFIX}-protobuf") + packages+=("${MINGW_PACKAGE_PREFIX}-rapidjson") + packages+=("${MINGW_PACKAGE_PREFIX}-re2") + packages+=("${MINGW_PACKAGE_PREFIX}-snappy") + packages+=("${MINGW_PACKAGE_PREFIX}-sqlite3") + packages+=("${MINGW_PACKAGE_PREFIX}-thrift") + 
packages+=("${MINGW_PACKAGE_PREFIX}-xsimd") + packages+=("${MINGW_PACKAGE_PREFIX}-uriparser") + packages+=("${MINGW_PACKAGE_PREFIX}-zstd") + packages+=("patch") if [ "${target}" != "ruby" ]; then # We don't update the exiting packages for Ruby because @@ -58,26 +59,17 @@ case "${target}" in # OpenSSL and zlib separately. They should be ABI compatible # with packages installed by MSYS2. If we specify packages # explicitly here, the existing packages may be updated. - packages+=(${MINGW_PACKAGE_PREFIX}-openssl) - packages+=(${MINGW_PACKAGE_PREFIX}-zlib) + packages+=("${MINGW_PACKAGE_PREFIX}-openssl") + packages+=("${MINGW_PACKAGE_PREFIX}-zlib") fi ;; esac case "${target}" in c_glib|ruby) - packages+=(${MINGW_PACKAGE_PREFIX}-gobject-introspection) - packages+=(${MINGW_PACKAGE_PREFIX}-meson) - packages+=(${MINGW_PACKAGE_PREFIX}-vala) - ;; -esac - -case "${target}" in - cgo) - packages+=(${MINGW_PACKAGE_PREFIX}-arrow) - packages+=(${MINGW_PACKAGE_PREFIX}-gcc) - packages+=(${MINGW_PACKAGE_PREFIX}-toolchain) - packages+=(base-devel) + packages+=("${MINGW_PACKAGE_PREFIX}-gobject-introspection") + packages+=("${MINGW_PACKAGE_PREFIX}-meson") + packages+=("${MINGW_PACKAGE_PREFIX}-vala") ;; esac @@ -87,6 +79,7 @@ pacman \ --sync \ "${packages[@]}" -"$(dirname $0)/ccache_setup.sh" -echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >> $GITHUB_ENV -echo "PIP_CACHE_DIR=$(pip cache dir)" >> $GITHUB_ENV +ccache_dir=$(dirname "$0") +"${ccache_dir}/ccache_setup.sh" +echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >> "$GITHUB_ENV" +echo "PIP_CACHE_DIR=$(pip cache dir)" >> "$GITHUB_ENV" diff --git a/ci/scripts/msys2_system_clean.sh b/ci/scripts/msys2_system_clean.sh index c4de8a6b265..35b28e6a2ce 100755 --- a/ci/scripts/msys2_system_clean.sh +++ b/ci/scripts/msys2_system_clean.sh @@ -30,11 +30,11 @@ case "${MINGW_PACKAGE_PREFIX}" in --nosave \ --recursive \ --remove \ - ${MINGW_PACKAGE_PREFIX}-clang-tools-extra \ - ${MINGW_PACKAGE_PREFIX}-gcc-ada \ - 
${MINGW_PACKAGE_PREFIX}-gcc-fortran \ - ${MINGW_PACKAGE_PREFIX}-gcc-libgfortran \ - ${MINGW_PACKAGE_PREFIX}-gcc-objc \ - ${MINGW_PACKAGE_PREFIX}-libgccjit + "${MINGW_PACKAGE_PREFIX}-clang-tools-extra" \ + "${MINGW_PACKAGE_PREFIX}-gcc-ada" \ + "${MINGW_PACKAGE_PREFIX}-gcc-fortran" \ + "${MINGW_PACKAGE_PREFIX}-gcc-libgfortran" \ + "${MINGW_PACKAGE_PREFIX}-gcc-objc" \ + "${MINGW_PACKAGE_PREFIX}-libgccjit" ;; esac diff --git a/ci/scripts/nanoarrow_build.sh b/ci/scripts/nanoarrow_build.sh index 6f7c82b099b..8627c0984f0 100755 --- a/ci/scripts/nanoarrow_build.sh +++ b/ci/scripts/nanoarrow_build.sh @@ -20,7 +20,7 @@ set -e arrow_dir=${1} -source_dir=${1}/nanoarrow +source_dir=${arrow_dir}/nanoarrow build_dir=${2}/nanoarrow # This file is used to build the nanoarrow binaries needed for the archery @@ -43,10 +43,10 @@ fi set -x -mkdir -p ${build_dir} -pushd ${build_dir} +mkdir -p "${build_dir}" +pushd "${build_dir}" -cmake ${source_dir} \ +cmake "${source_dir}" \ -DNANOARROW_IPC=ON \ -DNANOARROW_IPC_WITH_ZSTD=ON \ -DNANOARROW_BUILD_INTEGRATION_TESTS=ON diff --git a/ci/scripts/python_build.bat b/ci/scripts/python_build.bat new file mode 100644 index 00000000000..417cc0d5dd0 --- /dev/null +++ b/ci/scripts/python_build.bat @@ -0,0 +1,139 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. 
See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + +@echo on + +set SOURCE_DIR=%1 +set CMAKE_INSTALL_PREFIX=%2 +set CPP_SOURCE_DIR=%SOURCE_DIR%\cpp +set CPP_BUILD_DIR=%SOURCE_DIR%\build +echo C++ source dir is %CPP_SOURCE_DIR% + +echo Building for Windows ... + +@REM List installed Pythons +py -0p + +%PYTHON_CMD% -m sysconfig || exit /B 1 + +@REM Setup MSVC environment + +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 +@echo on + +echo "=== CCACHE Stats before build ===" +ccache -sv + +echo "=== Building Arrow C++ libraries ===" +set ARROW_ACERO=ON +set ARROW_DATASET=ON +set ARROW_FLIGHT=OFF +set ARROW_GANDIVA=OFF +set ARROW_GCS=OFF +set ARROW_HDFS=ON +set ARROW_MIMALLOC=ON +set ARROW_ORC=OFF +set ARROW_PARQUET=ON +set PARQUET_REQUIRE_ENCRYPTION=ON +set ARROW_SUBSTRAIT=ON +set ARROW_S3=ON +set ARROW_TENSORFLOW=ON +set ARROW_WITH_BROTLI=ON +set ARROW_WITH_BZ2=OFF +set ARROW_WITH_LZ4=ON +set ARROW_WITH_SNAPPY=ON +set ARROW_WITH_ZLIB=ON +set ARROW_WITH_ZSTD=ON +set CMAKE_BUILD_TYPE=Release +set CMAKE_GENERATOR=Ninja +set CMAKE_UNITY_BUILD=ON + +mkdir %CPP_BUILD_DIR% +pushd %CPP_BUILD_DIR% + +cmake ^ + -DARROW_ACERO=%ARROW_ACERO% ^ + -DARROW_BUILD_SHARED=ON ^ + -DARROW_BUILD_STATIC=OFF ^ + -DARROW_BUILD_TESTS=OFF ^ + -DARROW_COMPUTE=ON ^ + -DARROW_CSV=ON ^ + -DARROW_CXXFLAGS="/MP" ^ + -DARROW_DATASET=%ARROW_DATASET% ^ + -DARROW_DEPENDENCY_USE_SHARED=OFF ^ + -DARROW_FILESYSTEM=ON ^ + -DARROW_FLIGHT=%ARROW_FLIGHT% ^ + -DARROW_GANDIVA=%ARROW_GANDIVA% ^ + -DARROW_GCS=%ARROW_GCS% ^ + -DARROW_HDFS=%ARROW_HDFS% ^ + -DARROW_JSON=ON ^ + -DARROW_MIMALLOC=%ARROW_MIMALLOC% ^ + -DARROW_ORC=%ARROW_ORC% ^ + -DARROW_PARQUET=%ARROW_PARQUET% ^ + -DARROW_S3=%ARROW_S3% ^ + -DARROW_SUBSTRAIT=%ARROW_SUBSTRAIT% ^ + -DARROW_TENSORFLOW=%ARROW_TENSORFLOW% ^ + -DARROW_USE_CCACHE=ON ^ + -DARROW_WITH_BROTLI=%ARROW_WITH_BROTLI% ^ + -DARROW_WITH_BZ2=%ARROW_WITH_BZ2% ^ + 
-DARROW_WITH_LZ4=%ARROW_WITH_LZ4% ^ + -DARROW_WITH_SNAPPY=%ARROW_WITH_SNAPPY% ^ + -DARROW_WITH_ZLIB=%ARROW_WITH_ZLIB% ^ + -DARROW_WITH_ZSTD=%ARROW_WITH_ZSTD% ^ + -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ + -DCMAKE_INSTALL_PREFIX=%CMAKE_INSTALL_PREFIX% ^ + -DCMAKE_UNITY_BUILD=%CMAKE_UNITY_BUILD% ^ + -DMSVC_LINK_VERBOSE=ON ^ + -DPARQUET_REQUIRE_ENCRYPTION=%PARQUET_REQUIRE_ENCRYPTION% ^ + -Dxsimd_SOURCE=BUNDLED ^ + -G "%CMAKE_GENERATOR%" ^ + %CPP_SOURCE_DIR% || exit /B 1 +cmake --build . --config %CMAKE_BUILD_TYPE% --target install || exit /B 1 +popd + +echo "=== CCACHE Stats after build ===" +ccache -sv + +echo "=== Building Python ===" +set PYARROW_BUILD_TYPE=%CMAKE_BUILD_TYPE% +set PYARROW_BUILD_VERBOSE=1 +set PYARROW_BUNDLE_ARROW_CPP=ON +set PYARROW_CMAKE_GENERATOR=%CMAKE_GENERATOR% +set PYARROW_WITH_ACERO=%ARROW_ACERO% +set PYARROW_WITH_DATASET=%ARROW_DATASET% +set PYARROW_WITH_FLIGHT=%ARROW_FLIGHT% +set PYARROW_WITH_GANDIVA=%ARROW_GANDIVA% +set PYARROW_WITH_GCS=%ARROW_GCS% +set PYARROW_WITH_HDFS=%ARROW_HDFS% +set PYARROW_WITH_ORC=%ARROW_ORC% +set PYARROW_WITH_PARQUET=%ARROW_PARQUET% +set PYARROW_WITH_PARQUET_ENCRYPTION=%PARQUET_REQUIRE_ENCRYPTION% +set PYARROW_WITH_SUBSTRAIT=%ARROW_SUBSTRAIT% +set PYARROW_WITH_S3=%ARROW_S3% +set ARROW_HOME=%CMAKE_INSTALL_PREFIX% +set CMAKE_PREFIX_PATH=%CMAKE_INSTALL_PREFIX% + +pushd %SOURCE_DIR%\python + +@REM Install Python build dependencies +%PYTHON_CMD% -m pip install --upgrade pip || exit /B 1 +%PYTHON_CMD% -m pip install -r requirements-build.txt || exit /B 1 + +@REM Build PyArrow +%PYTHON_CMD% -m pip install --no-deps --no-build-isolation -vv . 
|| exit /B 1 + +popd diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index 9455baf3536..e0c64521cdd 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -25,13 +25,17 @@ build_dir=${2} source_dir=${arrow_dir}/python python_build_dir=${build_dir}/python -: ${BUILD_DOCS_PYTHON:=OFF} +: "${BUILD_DOCS_PYTHON:=OFF}" if [ -x "$(command -v git)" ]; then - git config --global --add safe.directory ${arrow_dir} + git config --global --add safe.directory "${arrow_dir}" fi if [ -n "${ARROW_PYTHON_VENV:-}" ]; then + # We don't need to follow this external file. + # See also: https://www.shellcheck.net/wiki/SC1091 + # + # shellcheck source=/dev/null . "${ARROW_PYTHON_VENV}/bin/activate" fi @@ -50,7 +54,7 @@ case "$(uname)" in ;; esac -if [ ! -z "${CONDA_PREFIX}" ]; then +if [ -n "${CONDA_PREFIX}" ]; then echo -e "===\n=== Conda environment for build\n===" conda list fi @@ -74,7 +78,7 @@ export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} export PYARROW_PARALLEL=${n_jobs} -: ${CMAKE_PREFIX_PATH:=${ARROW_HOME}} +: "${CMAKE_PREFIX_PATH:=${ARROW_HOME}}" export CMAKE_PREFIX_PATH export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} @@ -82,9 +86,9 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} # TODO: We want to out-of-source build. This is a workaround. We copy # all needed files to the build directory from the source directory # and build in the build directory. -rm -rf ${python_build_dir} -cp -aL ${source_dir} ${python_build_dir} -pushd ${python_build_dir} +rm -rf "${python_build_dir}" +cp -aL "${source_dir}" "${python_build_dir}" +pushd "${python_build_dir}" # - Cannot call setup.py as it may install in the wrong directory # on Debian/Ubuntu (ARROW-15243). 
# - Cannot use build isolation as we want to use specific dependency versions @@ -98,22 +102,21 @@ if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then # # Copy docs/source because the "autosummary_generate = True" # configuration generates files to docs/source/python/generated/. - rm -rf ${python_build_dir}/docs/source - mkdir -p ${python_build_dir}/docs - cp -a ${arrow_dir}/docs/source ${python_build_dir}/docs/ - rm -rf ${python_build_dir}/format - cp -a ${arrow_dir}/format ${python_build_dir}/ - rm -rf ${python_build_dir}/cpp/examples - mkdir -p ${python_build_dir}/cpp - cp -a ${arrow_dir}/cpp/examples ${python_build_dir}/cpp/ - rm -rf ${python_build_dir}/ci - cp -a ${arrow_dir}/ci/ ${python_build_dir}/ - ncpus=$(python -c "import os; print(os.cpu_count())") + rm -rf "${python_build_dir}/docs/source" + mkdir -p "${python_build_dir}/docs" + cp -a "${arrow_dir}/docs/source" "${python_build_dir}/docs/" + rm -rf "${python_build_dir}/format" + cp -a "${arrow_dir}/format" "${python_build_dir}/" + rm -rf "${python_build_dir}/cpp/examples" + mkdir -p "${python_build_dir}/cpp" + cp -a "${arrow_dir}/cpp/examples" "${python_build_dir}/cpp/" + rm -rf "${python_build_dir}/ci" + cp -a "${arrow_dir}/ci/" "${python_build_dir}/" export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml - pushd ${build_dir} + pushd "${build_dir}" sphinx-build \ -b html \ - ${python_build_dir}/docs/source \ - ${build_dir}/docs + "${python_build_dir}/docs/source" \ + "${build_dir}/docs" popd fi diff --git a/ci/scripts/python_build_emscripten.sh b/ci/scripts/python_build_emscripten.sh index 14e96262020..857c0e25132 100755 --- a/ci/scripts/python_build_emscripten.sh +++ b/ci/scripts/python_build_emscripten.sh @@ -22,19 +22,22 @@ set -ex arrow_dir=${1} build_dir=${2} - +# We don't need to follow this external file. 
+# See also: https://www.shellcheck.net/wiki/SC1090 +# +# shellcheck source=/dev/null source ~/emsdk/emsdk_env.sh source_dir=${arrow_dir}/python python_build_dir=${build_dir}/python -rm -rf ${python_build_dir} -cp -aL ${source_dir} ${python_build_dir} +rm -rf "${python_build_dir}" +cp -aL "${source_dir}" "${python_build_dir}" # conda sets LDFLAGS / CFLAGS etc. which break # emcmake so we unset them unset LDFLAGS CFLAGS CXXFLAGS CPPFLAGS -pushd ${python_build_dir} +pushd "${python_build_dir}" pyodide build popd diff --git a/ci/scripts/python_sdist_build.sh b/ci/scripts/python_sdist_build.sh index f9e9359b6f6..dfb99518431 100755 --- a/ci/scripts/python_sdist_build.sh +++ b/ci/scripts/python_sdist_build.sh @@ -21,7 +21,7 @@ set -eux source_dir=${1}/python -pushd ${source_dir} +pushd "${source_dir}" export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION:-} ${PYTHON:-python} setup.py sdist popd diff --git a/ci/scripts/python_sdist_test.sh b/ci/scripts/python_sdist_test.sh index 8f263ceb05e..98a938d970a 100755 --- a/ci/scripts/python_sdist_test.sh +++ b/ci/scripts/python_sdist_test.sh @@ -53,13 +53,18 @@ fi if [ -n "${PYARROW_VERSION:-}" ]; then sdist="${arrow_dir}/python/dist/pyarrow-${PYARROW_VERSION}.tar.gz" else - sdist=$(ls ${arrow_dir}/python/dist/pyarrow-*.tar.gz | sort -r | head -n1) + sdist=$(echo "${arrow_dir}"/python/dist/pyarrow-*.tar.gz | sort -r | head -n1) fi if [ -n "${ARROW_PYTHON_VENV:-}" ]; then + # We don't need to follow this external file. + # See also: https://www.shellcheck.net/wiki/SC1091 + # + # shellcheck source=/dev/null . 
"${ARROW_PYTHON_VENV}/bin/activate" fi -${PYTHON:-python} -m pip install ${sdist} +${PYTHON:-python} -m pip install "${sdist}" +# shellcheck disable=SC2086 pytest -r s ${PYTEST_ARGS:-} --pyargs pyarrow diff --git a/ci/scripts/python_test.bat b/ci/scripts/python_test.bat new file mode 100644 index 00000000000..f9088c7f3d7 --- /dev/null +++ b/ci/scripts/python_test.bat @@ -0,0 +1,41 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + +@echo on + +set SOURCE_DIR=%1 + +set ARROW_TEST_DATA=%SOURCE_DIR%\testing\data +set PARQUET_TEST_DATA=%SOURCE_DIR%\cpp\submodules\parquet-testing\data + +echo Testing on Windows ... 
+ +@REM List installed Pythons +py -0p + +%PYTHON_CMD% -m sysconfig || exit /B 1 + +pushd %SOURCE_DIR%\python + +@REM Install Python test dependencies +%PYTHON_CMD% -m pip install -r requirements-test.txt || exit /B 1 + +popd + +@REM Run Python tests +%PYTHON_CMD% -c "import pyarrow" || exit /B 1 +%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1 diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 1eaecd6bea0..8d113312927 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -54,8 +54,6 @@ FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_conf if [[ $FREE_THREADED_BUILD == "True" ]]; then pip install cython --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --prefer-binary fi -# With Python 3.9, the `--upgrade` flag is required to force full replacement of setuptools' distutils patching -pip install --upgrade --target $PIP_SITE_PACKAGES "setuptools>=58" pip install \ --only-binary=:all: \ @@ -147,6 +145,7 @@ cmake \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ + -Dxsimd_SOURCE=BUNDLED \ -G ${CMAKE_GENERATOR} \ ${source_dir}/cpp cmake --build . 
--target install diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index 3ce86b16116..2b8ee7be745 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -28,15 +28,15 @@ fi source_dir=${1} -: ${ARROW_AZURE:=ON} -: ${ARROW_FLIGHT:=ON} -: ${ARROW_GCS:=ON} -: ${ARROW_S3:=ON} -: ${ARROW_SUBSTRAIT:=ON} -: ${CHECK_IMPORTS:=ON} -: ${CHECK_WHEEL_CONTENT:=ON} -: ${CHECK_UNITTESTS:=ON} -: ${INSTALL_PYARROW:=ON} +: "${ARROW_AZURE:=ON}" +: "${ARROW_FLIGHT:=ON}" +: "${ARROW_GCS:=ON}" +: "${CHECK_IMPORTS:=ON}" +: "${ARROW_S3:=ON}" +: "${ARROW_SUBSTRAIT:=ON}" +: "${CHECK_WHEEL_CONTENT:=ON}" +: "${CHECK_UNITTESTS:=ON}" +: "${INSTALL_PYARROW:=ON}" export PYARROW_TEST_ACERO=ON export PYARROW_TEST_AZURE=${ARROW_AZURE} @@ -59,7 +59,7 @@ export PARQUET_TEST_DATA=${source_dir}/cpp/submodules/parquet-testing/data if [ "${INSTALL_PYARROW}" == "ON" ]; then # Install the built wheels - python -m pip install ${source_dir}/python/repaired_wheels/*.whl + python -m pip install "${source_dir}"/python/repaired_wheels/*.whl fi if [ "${CHECK_IMPORTS}" == "ON" ]; then @@ -96,13 +96,23 @@ if [ "${CHECK_VERSION}" == "ON" ]; then fi if [ "${CHECK_WHEEL_CONTENT}" == "ON" ]; then - python ${source_dir}/ci/scripts/python_wheel_validate_contents.py \ - --path ${source_dir}/python/repaired_wheels + python "${source_dir}/ci/scripts/python_wheel_validate_contents.py" \ + --path "${source_dir}/python/repaired_wheels" fi +is_free_threaded() { + python -c "import sysconfig; print('ON' if sysconfig.get_config_var('Py_GIL_DISABLED') else 'OFF')" +} + if [ "${CHECK_UNITTESTS}" == "ON" ]; then # Install testing dependencies - python -m pip install -U -r ${source_dir}/python/requirements-wheel-test.txt + if [ "$(is_free_threaded)" = "ON" ] && [[ "${PYTHON:-}" == *"3.13"* ]]; then + echo "Free-threaded Python 3.13 build detected" + python -m pip install -U -r "${source_dir}/python/requirements-wheel-test-3.13t.txt" + else + echo "Regular 
Python build detected" + python -m pip install -U -r "${source_dir}/python/requirements-wheel-test.txt" + fi # Execute unittest, test dependencies must be installed python -c 'import pyarrow; pyarrow.create_library_symlinks()' diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index fd6d0591661..c01f833ce6d 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -60,7 +60,7 @@ set CMAKE_GENERATOR=Visual Studio 17 2022 set CMAKE_PLATFORM=x64 set VCPKG_ROOT=C:\vcpkg set VCPKG_FEATURE_FLAGS=-manifests -set VCGPK_TARGET_TRIPLET=amd64-windows-static-md-%CMAKE_BUILD_TYPE% +set VCPKG_TARGET_TRIPLET=amd64-windows-static-md-%CMAKE_BUILD_TYPE% mkdir C:\arrow-build pushd C:\arrow-build @@ -100,7 +100,8 @@ cmake ^ -DMSVC_LINK_VERBOSE=ON ^ -DPARQUET_REQUIRE_ENCRYPTION=%PARQUET_REQUIRE_ENCRYPTION% ^ -DVCPKG_MANIFEST_MODE=OFF ^ - -DVCPKG_TARGET_TRIPLET=%VCGPK_TARGET_TRIPLET% ^ + -DVCPKG_TARGET_TRIPLET=%VCPKG_TARGET_TRIPLET% ^ + -Dxsimd_SOURCE=BUNDLED ^ -G "%CMAKE_GENERATOR%" ^ -A "%CMAKE_PLATFORM%" ^ C:\arrow\cpp || exit /B 1 diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index 9cebf1ac6b6..44a1bb919f3 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -128,6 +128,7 @@ cmake \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ + -Dxsimd_SOURCE=BUNDLED \ ${ARROW_EXTRA_CMAKE_FLAGS} \ -G ${CMAKE_GENERATOR} \ /arrow/cpp diff --git a/ci/scripts/r_build.sh b/ci/scripts/r_build.sh index f4dc5a5781c..7554a9502fd 100755 --- a/ci/scripts/r_build.sh +++ b/ci/scripts/r_build.sh @@ -18,19 +18,24 @@ set -ex -: ${R_BIN:=R} +: "${R_BIN:=R}" source_dir=${1}/r build_dir=${2} -: ${BUILD_DOCS_R:=OFF} +: "${BUILD_DOCS_R:=OFF}" + +R_INSTALL_ARGS=() +for arg in ${INSTALL_ARGS:-}; do + R_INSTALL_ARGS+=("${arg}") +done # 
https://github.com/apache/arrow/issues/41429 # TODO: We want to out-of-source build. This is a workaround. We copy # all needed files to the build directory from the source directory # and build in the build directory. -rm -rf ${build_dir}/r -cp -aL ${source_dir} ${build_dir}/r -pushd ${build_dir}/r +rm -rf "${build_dir}/r" +cp -aL "${source_dir}" "${build_dir}/r" +pushd "${build_dir}/r" # build first so that any stray compiled files in r/src are ignored ${R_BIN} CMD build . @@ -41,12 +46,12 @@ else fi ${SUDO} \ env \ - PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH} \ - ${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz + PKG_CONFIG_PATH="${ARROW_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH}" \ + "${R_BIN}" CMD INSTALL "${R_INSTALL_ARGS[@]}" arrow*.tar.gz if [ "${BUILD_DOCS_R}" == "ON" ]; then ${R_BIN} -e "pkgdown::build_site(install = FALSE)" - rsync -a docs/ ${build_dir}/docs/r + rsync -a docs/ "${build_dir}/docs/r" fi popd diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 8a962fe576c..3039291ebbd 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -41,7 +41,7 @@ elif [ "`which zypper`" ]; then PACKAGE_MANAGER=zypper else PACKAGE_MANAGER=apt-get - apt-get update + apt-get update --allow-releaseinfo-change # flag needed for when debian version changes fi # Enable ccache if requested based on http://dirk.eddelbuettel.com/blog/2017/11/27/ @@ -75,5 +75,17 @@ fi # cmake is now a listed sys req. $PACKAGE_MANAGER install -y rsync cmake curl +# Update clang version to latest available. +# This is only for rhub/clang20. If we change the base image from rhub/clang20, +# we need to update this part too. 
+if [ "$R_UPDATE_CLANG" = true ]; then + apt update -y --allow-releaseinfo-change # flag needed for when debian version changes + apt install -y gnupg + curl -fsSL https://apt.llvm.org/llvm-snapshot.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/llvm.gpg + echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" > /etc/apt/sources.list.d/llvm20.list + apt update -y --allow-releaseinfo-change # flag needed for when debian version changes + apt install -y clang-20 lld-20 +fi + # Workaround for html help install failure; see https://github.com/r-lib/devtools/issues/2084#issuecomment-530912786 Rscript -e 'x <- file.path(R.home("doc"), "html"); if (!file.exists(x)) {dir.create(x, recursive=TRUE); file.copy(system.file("html/R.css", package="stats"), x)}' diff --git a/ci/scripts/r_install_system_dependencies.sh b/ci/scripts/r_install_system_dependencies.sh index ae2a04656c5..c9795d0ec6f 100755 --- a/ci/scripts/r_install_system_dependencies.sh +++ b/ci/scripts/r_install_system_dependencies.sh @@ -54,10 +54,10 @@ if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_GCS" == "ON" ] || [ "$ARROW_R_DEV" == "T case "$PACKAGE_MANAGER" in zypper) # python3 is Python 3.6 on OpenSUSE 15.3. - # PyArrow supports Python 3.9 or later. - $PACKAGE_MANAGER install -y python39-pip - ln -s /usr/bin/python3.9 /usr/local/bin/python - ln -s /usr/bin/pip3.9 /usr/local/bin/pip + # PyArrow supports Python 3.10 or later. + $PACKAGE_MANAGER install -y python310-pip + ln -s /usr/bin/python3.10 /usr/local/bin/python + ln -s /usr/bin/pip3.10 /usr/local/bin/pip ;; *) $PACKAGE_MANAGER install -y python3-pip diff --git a/ci/scripts/r_revdepcheck.sh b/ci/scripts/r_revdepcheck.sh index f7527aed89c..9949c98d9b1 100755 --- a/ci/scripts/r_revdepcheck.sh +++ b/ci/scripts/r_revdepcheck.sh @@ -18,13 +18,13 @@ set -ex -: ${R_BIN:=R} +: "${R_BIN:=R}" # When revdep runs with > 1 worker the checks for {targets} time out for # some reason. 
-: ${ARROW_REVDEP_WORKERS:=1} +: "${ARROW_REVDEP_WORKERS:=1}" # But we do want to use all cores while building arrow to speed up the # installation so this is used to set MAKEFLAGS -: ${N_JOBS:=$(nproc)} +: "${N_JOBS:=$(nproc)}" source_dir=${1}/r # cpp building dependencies @@ -78,7 +78,7 @@ apt install -y libxml2-dev \ # We have to be in source_dir so that cpp source detection works -pushd $source_dir +pushd "$source_dir" printenv diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index fb3e9a58363..b66724fdbdc 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -36,8 +36,7 @@ ncores=$(${R_BIN} -s -e 'cat(parallel::detectCores())') echo "MAKEFLAGS=-j${ncores}" >> ${rhome}/etc/Renviron.site # build first so that any stray compiled files in r/src are ignored -${R_BIN} CMD build . -${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz +${R_BIN} CMD build --no-build-vignettes --no-manual . # But unset the env var so that it doesn't cause us to run extra dev tests unset ARROW_R_DEV @@ -45,25 +44,30 @@ unset ARROW_R_DEV # Set the testthat output to be verbose for easier debugging export ARROW_R_VERBOSE_TEST=TRUE -export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" +# We prune dependencies for these, so we need to disable forcing suggests +export _R_CHECK_FORCE_SUGGESTS_=FALSE + +export SUPPRESSION_FILE=$(readlink -f "tools/ubsan.supp") +export UBSAN_OPTIONS="print_stacktrace=1,suppressions=${SUPPRESSION_FILE}" # From the old rhub image https://github.com/r-hub/rhub-linux-builders/blob/master/fedora-clang-devel-san/Dockerfile export ASAN_OPTIONS="alloc_dealloc_mismatch=0:detect_leaks=0:detect_odr_violation=0" -# run tests -pushd tests -${R_BIN} --no-save < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } +${R_BIN} CMD check --no-manual --no-vignettes --no-build-vignettes arrow*.tar.gz -cat testthat.out -if grep -q "runtime error" testthat.out; then +# Find sanitizer issues, print the file(s) they 
are part of, and fail the job +find . -type f -name "*Rout" -exec grep -l "runtime error\|SUMMARY: UndefinedBehaviorSanitizer" {} \; > sanitizer_errors.txt +if [ -s sanitizer_errors.txt ]; then + echo "Sanitizer errors found in the following files:" + cat sanitizer_errors.txt + + # Print the content of files with errors for debugging + while read -r file; do + echo "=============== $file ===============" + cat "$file" + echo "=========================================" + done < sanitizer_errors.txt + exit 1 fi -# run examples -popd -${R_BIN} --no-save -e 'library(arrow); testthat::test_examples(".")' >> examples.out 2>&1 || { cat examples.out; exit 1; } - -cat examples.out -if grep -q "runtime error" examples.out; then - exit 1 -fi popd diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index de92addf083..e3b68c941c0 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -35,7 +35,7 @@ printenv makepkg-mingw --noconfirm --noprogressbar --skippgpcheck --nocheck --syncdeps --cleanbuild VERSION=$(grep Version $ARROW_HOME/r/DESCRIPTION | cut -d " " -f 2) -DST_DIR="arrow-$VERSION" +DST_DIR="r-libarrow-windows-x86_64-$VERSION" # Collect the build artifacts and make the shape of zip file that rwinlib expects ls diff --git a/ci/scripts/release_test.sh b/ci/scripts/release_test.sh index 583d9618c65..af6d7dbb3a5 100755 --- a/ci/scripts/release_test.sh +++ b/ci/scripts/release_test.sh @@ -21,7 +21,7 @@ set -eux arrow_dir=${1} -pushd ${arrow_dir} +pushd "${arrow_dir}" dev/release/run-test.rb -vv diff --git a/ci/scripts/ruby_test.sh b/ci/scripts/ruby_test.sh index 507fa7858e8..da4c6454116 100755 --- a/ci/scripts/ruby_test.sh +++ b/ci/scripts/ruby_test.sh @@ -32,4 +32,4 @@ if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then export ARROW_DEBUG_MEMORY_POOL=trap fi -rake -f ${source_dir}/Rakefile BUILD_DIR=${build_dir} USE_BUNDLER=yes +rake -f "${source_dir}/Rakefile" BUILD_DIR="${build_dir}" USE_BUNDLER=yes diff --git 
a/ci/scripts/rust_build.sh b/ci/scripts/rust_build.sh index 5fc21d454b0..866b4e333e1 100755 --- a/ci/scripts/rust_build.sh +++ b/ci/scripts/rust_build.sh @@ -52,10 +52,10 @@ set -x # show activated toolchain rustup show -pushd ${source_dir} +pushd "${source_dir}" # build only the integration testing binaries -cargo build -p arrow-integration-testing --target-dir ${build_dir} +cargo build -p arrow-integration-testing --target-dir "${build_dir}" # Save disk space by removing large temporary build products rm -rf target/debug/deps diff --git a/ci/scripts/swift_test.sh b/ci/scripts/swift_test.sh deleted file mode 100755 index aba90f31e50..00000000000 --- a/ci/scripts/swift_test.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -data_gen_dir=${1}/swift/data-generator/swift-datagen -export GOPATH=/ -pushd ${data_gen_dir} -go get -d ./... -go run . 
-cp *.arrow ../../Arrow -popd - -source_dir=${1}/swift -pushd ${source_dir} -swiftlint --strict -popd - -source_dir=${1}/swift/Arrow -pushd ${source_dir} -sed 's/\/\/ build://g' Package.swift > Package.swift.build -mv Package.swift.build Package.swift -swift test -popd - -source_dir=${1}/swift/ArrowFlight -pushd ${source_dir} -sed 's/\/\/ build://g' Package.swift > Package.swift.build -mv Package.swift.build Package.swift -swift test -popd diff --git a/ci/scripts/util_enable_core_dumps.sh b/ci/scripts/util_enable_core_dumps.sh index 09f8d2d7270..48061fd50c5 100644 --- a/ci/scripts/util_enable_core_dumps.sh +++ b/ci/scripts/util_enable_core_dumps.sh @@ -1,3 +1,5 @@ +# shellcheck shell=bash +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information diff --git a/ci/scripts/util_free_space.sh b/ci/scripts/util_free_space.sh index dd6ba2c4600..4b6e26c6b22 100755 --- a/ci/scripts/util_free_space.sh +++ b/ci/scripts/util_free_space.sh @@ -87,8 +87,9 @@ sudo rm -rf /opt/hostedtoolcache/PyPy || : # 376MB sudo rm -rf /opt/hostedtoolcache/node || : # Remove Web browser packages -sudo apt purge -y \ - firefox \ - google-chrome-stable \ - microsoft-edge-stable +sudo apt purge -y firefox +# google-chrome-stable isn't installed on arm64 image. +sudo apt purge -y google-chrome-stable || : +# microsoft-edge-stable isn't installed on arm64 image. +sudo apt purge -y microsoft-edge-stable || : df -h diff --git a/ci/scripts/util_log.sh b/ci/scripts/util_log.sh index b34c44059ad..08b768905d0 100644 --- a/ci/scripts/util_log.sh +++ b/ci/scripts/util_log.sh @@ -1,3 +1,5 @@ +# shellcheck shell=bash +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information @@ -17,10 +19,12 @@ github_actions_group_begin() { echo "::group::$1" + echo "::stop-commands::arrow-log-grouping" set -x } github_actions_group_end() { set +x + echo "::arrow-log-grouping::" echo "::endgroup::" } diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 39b51874b1c..27e97a5b241 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,25 +1,26 @@ diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index 7cab6f726..697ab1bb4 100644 +index 6788bc7b7f..6b689dedf0 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -84,9 +84,12 @@ vcpkg_cmake_configure( - -DBUILD_TESTING=OFF +@@ -83,10 +83,13 @@ vcpkg_cmake_configure( -DENABLE_CURL_MANUAL=OFF - -DCURL_CA_FALLBACK=ON + -DIMPORT_LIB_SUFFIX= # empty + -DSHARE_LIB_OBJECT=OFF + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none + -DCURL_USE_PKGCONFIG=ON -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON - OPTIONS_DEBUG - -DENABLE_DEBUG=ON + MAYBE_UNUSED_VARIABLES + PKG_CONFIG_EXECUTABLE + ${EXTRA_ARGS_DEBUG} ) vcpkg_cmake_install() vcpkg_copy_pdbs() diff --git a/ports/llvm/portfile.cmake b/ports/llvm/portfile.cmake -index a79c72a59..6b7fa6a66 100644 +index 7764357a6d..da7374ecec 100644 --- a/ports/llvm/portfile.cmake +++ b/ports/llvm/portfile.cmake -@@ -292,6 +292,8 @@ vcpkg_cmake_configure( +@@ -302,6 +302,8 @@ vcpkg_cmake_configure( ${FEATURE_OPTIONS} MAYBE_UNUSED_VARIABLES COMPILER_RT_ENABLE_IOS @@ -28,92 +29,52 @@ index a79c72a59..6b7fa6a66 100644 ) vcpkg_cmake_install(ADD_BIN_TO_PATH) -diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 0312b2ae1..fdb576b5f 100644 ---- a/ports/snappy/portfile.cmake -+++ b/ports/snappy/portfile.cmake -@@ -8,5 +8,6 @@ vcpkg_from_github( - fix_clang-cl_build.patch - no-werror.patch - pkgconfig.diff -+ "snappy-disable-bmi.patch" - ) - file(COPY "${CURRENT_PORT_DIR}/snappy.pc.in" DESTINATION "${SOURCE_PATH}") -diff --git 
a/ports/snappy/snappy-disable-bmi.patch b/ports/snappy/snappy-disable-bmi.patch +diff --git a/ports/orc/orc-fix-exception-propagation.diff b/ports/orc/orc-fix-exception-propagation.diff new file mode 100644 -index 000000000..e839c93a4 +index 0000000000..25568e70cd --- /dev/null -+++ b/ports/snappy/snappy-disable-bmi.patch -@@ -0,0 +1,19 @@ -+diff --git a/snappy.cc b/snappy.cc -+index d414718..7b49d2a 100644 -+--- a/snappy.cc -++++ b/snappy.cc -+@@ -1014,14 +1014,10 @@ static inline void Report(const char *algorithm, size_t compressed_size, -+ static inline uint32_t ExtractLowBytes(const uint32_t& v, int n) { -+ assert(n >= 0); -+ assert(n <= 4); -+-#if SNAPPY_HAVE_BMI2 -+- return _bzhi_u32(v, 8 * n); -+-#else -+ // This needs to be wider than uint32_t otherwise `mask << 32` will be -+ // undefined. -+ uint64_t mask = 0xffffffff; -+ return v & ~(mask << (8 * n)); -+-#endif -+ } ++++ b/ports/orc/orc-fix-exception-propagation.diff +@@ -0,0 +1,30 @@ ++diff --git a/c++/src/Timezone.cc b/c++/src/Timezone.cc ++index 384f8ea99..07c75e0a7 100644 ++--- a/c++/src/Timezone.cc +++++ b/c++/src/Timezone.cc ++@@ -696,12 +696,21 @@ namespace orc { ++ std::string filename_; ++ mutable std::unique_ptr impl_; ++ mutable std::once_flag initialized_; +++ mutable std::exception_ptr init_exception_; ++ ++ TimezoneImpl* getImpl() const { ++- std::call_once(initialized_, [&]() { ++- auto buffer = loadTZDB(filename_); ++- impl_ = std::make_unique(filename_, std::move(buffer)); ++- }); +++ std::call_once(initialized_, [&]() { +++ try { +++ auto buffer = loadTZDB(filename_); +++ impl_ = std::make_unique(filename_, std::move(buffer)); +++ } catch (...) 
{ +++ // If initialization failed, re-throw the exception +++ init_exception_ = std::current_exception(); +++ } +++ }); +++ if (init_exception_) { +++ std::rethrow_exception(init_exception_); +++ } ++ return impl_.get(); ++ } + -+ static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { -diff --git a/ports/thrift/portfile.cmake b/ports/thrift/portfile.cmake -index 1501782..71d2147 100644 ---- a/ports/thrift/portfile.cmake -+++ b/ports/thrift/portfile.cmake -@@ -12,7 +12,7 @@ vcpkg_find_acquire_program(BISON) - vcpkg_from_github( - OUT_SOURCE_PATH SOURCE_PATH - REPO apache/thrift -- REF "${VERSION}" -+ REF "v${VERSION}" - SHA512 5e4ee9870b30fe5ba484d39781c435716f7f3903793dc8aae96594ca813b1a5a73363b84719038ca8fa3ab8ef0a419a28410d936ff7b3bbadf36fc085a6883ae - HEAD_REF master - PATCHES -diff --git a/ports/thrift/vcpkg.json b/ports/thrift/vcpkg.json -index 2d5a854..9ff49ec 100644 ---- a/ports/thrift/vcpkg.json -+++ b/ports/thrift/vcpkg.json -@@ -1,6 +1,7 @@ - { - "name": "thrift", - "version": "0.20.0", -+ "port-version": 1, - "description": "Apache Thrift is a software project spanning a variety of programming languages and use cases. 
Our goal is to make reliable, performant communication and data serialization across languages as efficient and seamless as possible.", - "homepage": "https://github.com/apache/thrift", - "license": "Apache-2.0", -diff --git a/versions/baseline.json b/versions/baseline.json -index c6ce736..9ad1d63 100644 ---- a/versions/baseline.json -+++ b/versions/baseline.json -@@ -8622,7 +8622,7 @@ - }, - "thrift": { - "baseline": "0.20.0", -- "port-version": 0 -+ "port-version": 1 - }, - "tidy-html5": { - "baseline": "5.8.0", -diff --git a/versions/t-/thrift.json b/versions/t-/thrift.json -index 3db38c5..7464bde 100644 ---- a/versions/t-/thrift.json -+++ b/versions/t-/thrift.json -@@ -1,5 +1,10 @@ - { - "versions": [ -+ { -+ "git-tree": "13757a6b05741cf3c9c39e3a1dcc5e5cd685e025", -+ "version": "0.20.0", -+ "port-version": 1 -+ }, - { - "git-tree": "6855be1ce96497811d4eb0a9879baf6cf1b3610c", - "version": "0.20.0", +diff --git a/ports/orc/portfile.cmake b/ports/orc/portfile.cmake +index 77ebf41ec3..4d065594a7 100644 +--- a/ports/orc/portfile.cmake ++++ b/ports/orc/portfile.cmake +@@ -6,6 +6,8 @@ vcpkg_from_github( + REF "v${VERSION}" + SHA512 eabee16a6e984452a8cb715d0524041b20dd1bd88d78bb32534db93e5dbdd786aa4df8c05975406cb0728241eb3025a506c4fefb8c334ef0d8a27e6cb920d44c + HEAD_REF master ++ PATCHES ++ orc-fix-exception-propagation.diff + ) + + file(REMOVE "${SOURCE_PATH}/cmake_modules/FindGTest.cmake") diff --git a/ci/vcpkg/vcpkg.patch b/ci/vcpkg/vcpkg.patch new file mode 100644 index 00000000000..a4c8d520978 --- /dev/null +++ b/ci/vcpkg/vcpkg.patch @@ -0,0 +1,62 @@ +diff --git a/scripts/cmake/vcpkg_execute_build_process.cmake b/scripts/cmake/vcpkg_execute_build_process.cmake +index 60fd5b587a..c8dc021af8 100644 +--- a/scripts/cmake/vcpkg_execute_build_process.cmake ++++ b/scripts/cmake/vcpkg_execute_build_process.cmake +@@ -131,6 +131,26 @@ function(vcpkg_execute_build_process) + endif() + endforeach() + z_vcpkg_prettify_command_line(pretty_command ${arg_COMMAND}) ++ # --- Try 
to print error logs ++ # Split the string by newline characters ++ string(REGEX MATCHALL "[^\n]+" file_list ${stringified_logs}) ++ ++ # Iterate over the list and print content of each file ++ foreach(file IN LISTS file_list) ++ string(STRIP "${file}" file_stripped) ++ ++ # Print filename ++ message(STATUS "===") ++ message(STATUS "=== Build Failed. Content of ${file_stripped}:") ++ ++ # Read the content of the file ++ file(READ ${file_stripped} file_content) ++ ++ # Print the content ++ message(STATUS "${file_content}") ++ message(STATUS "=== End of content of ${file_stripped}") ++ endforeach() ++ # --- + message(FATAL_ERROR + " Command failed: ${pretty_command}\n" + " Working Directory: ${arg_WORKING_DIRECTORY}\n" +diff --git a/scripts/cmake/vcpkg_execute_required_process.cmake b/scripts/cmake/vcpkg_execute_required_process.cmake +index 830aa409fd..90452d857b 100644 +--- a/scripts/cmake/vcpkg_execute_required_process.cmake ++++ b/scripts/cmake/vcpkg_execute_required_process.cmake +@@ -109,6 +109,26 @@ Halting portfile execution. + endforeach() + + z_vcpkg_prettify_command_line(pretty_command ${arg_COMMAND}) ++ # --- Try to print error logs ++ # Split the string by newline characters ++ string(REGEX MATCHALL "[^\n]+" file_list ${stringified_logs}) ++ ++ # Iterate over the list and print content of each file ++ foreach(file IN LISTS file_list) ++ string(STRIP "${file}" file_stripped) ++ ++ # Print filename ++ message(STATUS "===") ++ message(STATUS "=== Build Failed. 
Content of ${file_stripped}:") ++ ++ # Read the content of the file ++ file(READ ${file_stripped} file_content) ++ ++ # Print the content ++ message(STATUS "${file_content}") ++ message(STATUS "=== End of content of ${file_stripped}") ++ endforeach() ++ # --- + message(FATAL_ERROR + " Command failed: ${pretty_command}\n" + " Working Directory: ${arg_WORKING_DIRECTORY}\n" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d6ecd2a355f..14cf1b91411 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -56,6 +56,16 @@ cmake_policy(SET CMP0068 NEW) # find_package() uses _ROOT variables. cmake_policy(SET CMP0074 NEW) +# https://cmake.org/cmake/help/latest/policy/CMP0077.html +# +# option() honors normal variables. +cmake_policy(SET CMP0077 NEW) + +# https://cmake.org/cmake/help/latest/policy/CMP0090.html +# +# export(PACKAGE) does nothing by default. +cmake_policy(SET CMP0090 NEW) + # https://cmake.org/cmake/help/latest/policy/CMP0091.html # # MSVC runtime library flags are selected by an abstraction. @@ -86,7 +96,7 @@ if(POLICY CMP0170) cmake_policy(SET CMP0170 NEW) endif() -set(ARROW_VERSION "20.0.0-SNAPSHOT") +set(ARROW_VERSION "22.0.0") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") @@ -167,6 +177,7 @@ set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") set(ARROW_LLVM_VERSIONS + "21.1" "20.1" "19.1" "18.1" @@ -208,7 +219,7 @@ if("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() -# Needed for linting targets, etc. +# Needed for Gandiva. 
# Use the first Python installation on PATH, not the newest one set(Python3_FIND_STRATEGY "LOCATION") # On Windows, use registry last, not first @@ -276,121 +287,6 @@ if(ARROW_OPTIONAL_INSTALL) set(INSTALL_IS_OPTIONAL OPTIONAL) endif() -# -# "make lint" target -# -if(NOT ARROW_VERBOSE_LINT) - set(ARROW_LINT_QUIET "--quiet") -endif() - -if(NOT LINT_EXCLUSIONS_FILE) - # source files matching a glob from a line in this file - # will be excluded from linting (cpplint, clang-tidy, clang-format) - set(LINT_EXCLUSIONS_FILE ${BUILD_SUPPORT_DIR}/lint_exclusions.txt) -endif() - -find_program(CPPLINT_BIN - NAMES cpplint cpplint.py - HINTS ${BUILD_SUPPORT_DIR}) -message(STATUS "Found cpplint executable at ${CPPLINT_BIN}") - -set(COMMON_LINT_OPTIONS - --exclude_globs - ${LINT_EXCLUSIONS_FILE} - --source_dir - ${CMAKE_CURRENT_SOURCE_DIR}/src - --source_dir - ${CMAKE_CURRENT_SOURCE_DIR}/examples - --source_dir - ${CMAKE_CURRENT_SOURCE_DIR}/tools) - -add_custom_target(lint - ${PYTHON_EXECUTABLE} - ${BUILD_SUPPORT_DIR}/run_cpplint.py - --cpplint_binary - ${CPPLINT_BIN} - ${COMMON_LINT_OPTIONS} - ${ARROW_LINT_QUIET} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) - -# -# "make format" and "make check-format" targets -# -if(${CLANG_FORMAT_FOUND}) - # runs clang format and updates files in place. 
- add_custom_target(format - ${PYTHON_EXECUTABLE} - ${BUILD_SUPPORT_DIR}/run_clang_format.py - --clang_format_binary - ${CLANG_FORMAT_BIN} - ${COMMON_LINT_OPTIONS} - --fix - ${ARROW_LINT_QUIET}) - - # runs clang format and exits with a non-zero exit code if any files need to be reformatted - add_custom_target(check-format - ${PYTHON_EXECUTABLE} - ${BUILD_SUPPORT_DIR}/run_clang_format.py - --clang_format_binary - ${CLANG_FORMAT_BIN} - ${COMMON_LINT_OPTIONS} - ${ARROW_LINT_QUIET}) -endif() - -add_custom_target(lint_cpp_cli ${PYTHON_EXECUTABLE} ${BUILD_SUPPORT_DIR}/lint_cpp_cli.py - ${CMAKE_CURRENT_SOURCE_DIR}/src) - -if(ARROW_LINT_ONLY) - message("ARROW_LINT_ONLY was specified, this is only a partial build directory") - return() -endif() - -# -# "make clang-tidy" and "make check-clang-tidy" targets -# -if(${CLANG_TIDY_FOUND}) - # TODO check to make sure .clang-tidy is being respected - - # runs clang-tidy and attempts to fix any warning automatically - add_custom_target(clang-tidy - ${PYTHON_EXECUTABLE} - ${BUILD_SUPPORT_DIR}/run_clang_tidy.py - --clang_tidy_binary - ${CLANG_TIDY_BIN} - --compile_commands - ${CMAKE_BINARY_DIR}/compile_commands.json - ${COMMON_LINT_OPTIONS} - --fix - ${ARROW_LINT_QUIET}) - - # runs clang-tidy and exits with a non-zero exit code if any errors are found. 
- add_custom_target(check-clang-tidy - ${PYTHON_EXECUTABLE} - ${BUILD_SUPPORT_DIR}/run_clang_tidy.py - --clang_tidy_binary - ${CLANG_TIDY_BIN} - --compile_commands - ${CMAKE_BINARY_DIR}/compile_commands.json - ${COMMON_LINT_OPTIONS} - ${ARROW_LINT_QUIET}) -endif() - -if(UNIX) - add_custom_target(iwyu - ${CMAKE_COMMAND} - -E - env - "PYTHON=${PYTHON_EXECUTABLE}" - ${BUILD_SUPPORT_DIR}/iwyu/iwyu.sh) - add_custom_target(iwyu-all - ${CMAKE_COMMAND} - -E - env - "PYTHON=${PYTHON_EXECUTABLE}" - ${BUILD_SUPPORT_DIR}/iwyu/iwyu.sh - all) -endif(UNIX) - # datetime code used by iOS requires zlib support if(IOS) set(ARROW_WITH_ZLIB ON) @@ -455,6 +351,12 @@ endif() include(SetupCxxFlags) +if(${CMAKE_CXX_FLAGS_DEBUG} MATCHES "-Og") + # GH-47475: xxhash fails inlining when -Og is used. + # See: https://github.com/Cyan4973/xxHash/issues/943 + add_definitions(-DXXH_NO_INLINE_HINTS) +endif() + # # Linker flags # @@ -502,14 +404,10 @@ endif() # where to put generated archives (.a files) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") -set(ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") - # where to put generated libraries (.so files) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") -set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") - # where to put generated binaries -set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") if(CMAKE_GENERATOR STREQUAL Xcode) # Xcode projects support multi-configuration builds. This forces a single output directory @@ -553,6 +451,13 @@ if(ARROW_BUILD_STATIC) string(APPEND ARROW_ACERO_PC_CFLAGS_PRIVATE " -DARROW_ACERO_STATIC") endif() +# For arrow-compute.pc. +set(ARROW_COMPUTE_PC_CFLAGS "") +set(ARROW_COMPUTE_PC_CFLAGS_PRIVATE "") +if(ARROW_BUILD_STATIC) + string(APPEND ARROW_COMPUTE_PC_CFLAGS_PRIVATE " -DARROW_COMPUTE_STATIC") +endif() + # For arrow-cuda.pc. 
set(ARROW_CUDA_PC_CFLAGS "") set(ARROW_CUDA_PC_CFLAGS_PRIVATE "") @@ -781,7 +686,7 @@ endif() # the first link library. It's for prioritizing bundled FlatBuffers # than system FlatBuffers. list(PREPEND ARROW_TEST_LINK_LIBS arrow::flatbuffers) -list(APPEND ARROW_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK} ${ARROW_GTEST_GTEST_MAIN}) +list(APPEND ARROW_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK_MAIN}) if(ARROW_BUILD_BENCHMARKS) set(ARROW_BENCHMARK_LINK_LIBS benchmark::benchmark_main ${ARROW_TEST_LINK_LIBS}) @@ -808,10 +713,6 @@ if(ARROW_GANDIVA) add_subdirectory(src/gandiva) endif() -if(ARROW_SKYHOOK) - add_subdirectory(src/skyhook) -endif() - if(ARROW_BUILD_EXAMPLES) add_custom_target(runexample ctest -L example) add_subdirectory(examples/arrow) diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 114f79271d2..e1cad83ae3f 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -178,7 +178,7 @@ "cacheVariables": { "ARROW_BUILD_EXAMPLES": "ON", "ARROW_BUILD_UTILITIES": "ON", - "ARROW_SKYHOOK": "ON", + "ARROW_FLIGHT_SQL_ODBC": "ON", "ARROW_TENSORFLOW": "ON", "PARQUET_BUILD_EXAMPLES": "ON", "PARQUET_BUILD_EXECUTABLES": "ON" @@ -444,7 +444,8 @@ "CMAKE_CXX_COMPILER": "clang++", "ARROW_IPC": "ON", "ARROW_PARQUET": "ON", - "ARROW_FUZZING": "ON" + "ARROW_FUZZING": "ON", + "ARROW_WITH_SNAPPY": "ON" } }, { @@ -581,6 +582,65 @@ ], "displayName": "Benchmarking build with everything enabled", "cacheVariables": {} + }, + { + "name": "ninja-release-jni-linux", + "inherits": [ + "base-release" + ], + "displayName": "Build for JNI on Linux", + "cacheVariables": { + "ARROW_ACERO": "ON", + "ARROW_BUILD_SHARED": "OFF", + "ARROW_BUILD_STATIC": "ON", + "ARROW_CSV": "ON", + "ARROW_DATASET": "ON", + "ARROW_DEPENDENCY_SOURCE": "VCPKG", + "ARROW_DEPENDENCY_USE_SHARED": "OFF", + "ARROW_GANDIVA": "ON", + "ARROW_GANDIVA_STATIC_LIBSTDCPP": "ON", + "ARROW_GCS": "ON", + "ARROW_JSON": "ON", + "ARROW_ORC": "ON", + "ARROW_PARQUET": "ON", + "ARROW_RPATH_ORIGIN": "ON", + "ARROW_S3": "ON", + 
"ARROW_SUBSTRAIT": "ON", + "PARQUET_BUILD_EXAMPLES": "OFF", + "PARQUET_BUILD_EXECUTABLES": "OFF", + "PARQUET_REQUIRE_ENCRYPTION": "OFF", + "VCPKG_MANIFEST_MODE": "OFF", + "VCPKG_ROOT": "$env{VCPKG_ROOT}", + "VCPKG_TARGET_TRIPLET": "$env{VCPKG_TARGET_TRIPLET}" + } + }, + { + "name": "ninja-release-jni-macos", + "inherits": [ + "base-release" + ], + "displayName": "Build for JNI on macOS", + "cacheVariables": { + "ARROW_ACERO": "ON", + "ARROW_BUILD_SHARED": "OFF", + "ARROW_BUILD_STATIC": "ON", + "ARROW_CSV": "ON", + "ARROW_DATASET": "ON", + "ARROW_DEPENDENCY_USE_SHARED": "OFF", + "ARROW_GANDIVA": "ON", + "ARROW_GANDIVA_STATIC_LIBSTDCPP": "ON", + "ARROW_JSON": "ON", + "ARROW_ORC": "ON", + "ARROW_PARQUET": "ON", + "ARROW_S3": "ON", + "ARROW_SUBSTRAIT": "ON", + "AWSSDK_SOURCE": "BUNDLED", + "GTest_SOURCE": "BUNDLED", + "PARQUET_BUILD_EXAMPLES": "OFF", + "PARQUET_BUILD_EXECUTABLES": "OFF", + "PARQUET_REQUIRE_ENCRYPTION": "OFF", + "re2_SOURCE": "BUNDLED" + } } ] } diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index 5be93032c00..de7777a6c1c 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. -# Doxyfile 1.8.18 +# Doxyfile 1.13.2 # This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. +# Doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. @@ -29,6 +29,16 @@ # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). 
+# +# Note: +# +# Use Doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use Doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables or CMake type +# replacement variables: +# doxygen -x_noenv [configFile] #--------------------------------------------------------------------------- # Project related configuration options @@ -58,7 +68,7 @@ PROJECT_NAME = "Apache Arrow (C++)" PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a +# for a project that appears at the top of each page and should give viewers a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "A columnar in-memory analytics layer designed to accelerate big data." @@ -70,24 +80,42 @@ PROJECT_BRIEF = "A columnar in-memory analytics layer designed to accel PROJECT_LOGO = +# With the PROJECT_ICON tag one can specify an icon that is included in the tabs +# when the HTML document is shown. Doxygen will copy the logo to the output +# directory. + +PROJECT_ICON = + # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If +# entered, it will be relative to the location where Doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = $(OUTPUT_DIRECTORY) -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. 
Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. +# If the CREATE_SUBDIRS tag is set to YES then Doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this +# option can be useful when feeding Doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise cause +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. # The default value is: NO. CREATE_SUBDIRS = NO -# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. The +# sub-directories are organized in 2 levels, the first level always has a fixed +# number of 16 directories. +# Minimum value: 0, maximum value: 8, default value: 8. +# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + +# If the ALLOW_UNICODE_NAMES tag is set to YES, Doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. @@ -96,36 +124,28 @@ CREATE_SUBDIRS = NO ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this +# documentation generated by Doxygen is written. 
Doxygen will use this # information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English -# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all generated output in the proper direction. -# Possible values are: None, LTR, RTL and Context. -# The default value is: None. - -OUTPUT_TEXT_DIRECTION = None - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# If the BRIEF_MEMBER_DESC tag is set to YES, Doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. 
BRIEF_MEMBER_DESC = YES -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# If the REPEAT_BRIEF tag is set to YES, Doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the @@ -156,13 +176,13 @@ ABBREVIATE_BRIEF = "The $name class" \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief +# Doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# If the INLINE_INHERITED_MEMB tag is set to YES, Doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. @@ -170,7 +190,7 @@ ALWAYS_DETAILED_SEC = NO INLINE_INHERITED_MEMB = NO -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# If the FULL_PATH_NAMES tag is set to YES, Doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. @@ -180,11 +200,11 @@ FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to +# If left blank the directory from which Doxygen is run is used as the path to # strip. 
# # Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. +# will be relative from the directory where Doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = @@ -198,59 +218,68 @@ STRIP_FROM_PATH = STRIP_FROM_INC_PATH = ../src -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't +# If the SHORT_NAMES tag is set to YES, Doxygen will generate much shorter (but +# less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen will interpret the +# first line (until the first dot, question mark or exclamation mark) of a +# Javadoc-style comment as the brief description. If set to NO, the Javadoc- +# style will behave just like regular Qt-style comments (thus requiring an +# explicit @brief command for a brief description.) # The default value is: NO. JAVADOC_AUTOBRIEF = YES -# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# If the JAVADOC_BANNER tag is set to YES then Doxygen will interpret a line # such as # /*************** # as being the beginning of a Javadoc-style comment "banner". If set to NO, the # Javadoc-style will behave just like regular comments and it will not be -# interpreted by doxygen. +# interpreted by Doxygen. # The default value is: NO. 
JAVADOC_BANNER = NO -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will interpret the first +# line (until the first dot, question mark or exclamation mark) of a Qt-style +# comment as the brief description. If set to NO, the Qt-style will behave just +# like regular Qt-style comments (thus requiring an explicit \brief command for +# a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are -# not recognized anymore. +# not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO +# By default Python docstrings are displayed as preformatted text and Doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# Doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as Doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. 
INHERIT_DOCS = YES -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# If the SEPARATE_MEMBER_PAGES tag is set to YES then Doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. @@ -267,16 +296,16 @@ TAB_SIZE = 4 # the documentation. An alias has the form: # name=value # For example adding -# "sideeffect=@par Side Effects:\n" +# "sideeffect=@par Side Effects:^^" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines (in the resulting output). You can put ^^ in the value part of an -# alias to insert a newline as if a physical newline was in the original file. -# When you need a literal { or } or , in the value part of an alias you have to -# escape them by means of a backslash (\), this can lead to conflicts with the -# commands \{ and \} for these it is advised to use the version @{ and @} or use -# a double escape (\\{ and \\}) +# "Side Effects:". Note that you cannot put \n's in the value part of an alias +# to insert newlines (in the resulting output). You can put ^^ in the value part +# of an alias to insert a newline as if a physical newline was in the original +# file. When you need a literal { or } or , in the value part of an alias you +# have to escape them by means of a backslash (\), this can lead to conflicts +# with the commands \{ and \} for these it is advised to use the version @{ and +# @} or use a double escape (\\{ and \\}) ALIASES = @@ -320,27 +349,30 @@ OPTIMIZE_OUTPUT_SLICE = NO # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. 
The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, -# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, -# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# language is one of the parsers supported by Doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, +# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: # FortranFree, unknown formatted Fortran: Fortran. In the later case the parser # tries to guess whether the code is fixed or free formatted code, this is the -# default for Fortran type files). For instance to make doxygen treat .inc files +# default for Fortran type files). For instance to make Doxygen treat .inc files # as Fortran files (default is PHP), and .f files as C (default is Fortran), # use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by Doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# If the MARKDOWN_SUPPORT tag is enabled then Doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See https://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# The output of markdown processing is further processed by Doxygen, so you can +# mix Doxygen, HTML, and XML commands with Markdown formatting. 
Disable only in # case of backward compatibilities issues. # The default value is: YES. @@ -350,25 +382,45 @@ MARKDOWN_SUPPORT = YES # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. -# Minimum value: 0, maximum value: 99, default value: 5. +# Minimum value: 0, maximum value: 99, default value: 6. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 0 -# When enabled doxygen tries to link words that correspond to documented +# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to +# generate identifiers for the Markdown headings. Note: Every identifier is +# unique. +# Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a +# sequence number starting at 0 and GITHUB use the lower case version of title +# with any whitespace replaced by '-' and punctuation characters removed. +# The default value is: DOXYGEN. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +MARKDOWN_ID_STYLE = DOXYGEN + +# When enabled Doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. +# globally by setting AUTOLINK_SUPPORT to NO. Words listed in the +# AUTOLINK_IGNORE_WORDS tag are excluded from automatic linking. # The default value is: YES. AUTOLINK_SUPPORT = YES +# This tag specifies a list of words that, when matching the start of a word in +# the documentation, will suppress auto links generation, if it is enabled via +# AUTOLINK_SUPPORT. This list does not affect links explicitly created +# using \# or the \link or \ref commands. +# This tag requires that the tag AUTOLINK_SUPPORT is set to YES. + +AUTOLINK_IGNORE_WORDS = + # If you use STL classes (i.e. 
std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and +# tag to YES in order to let Doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. +# versus func(std::string) {}). This also makes the inheritance and +# collaboration diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO @@ -380,16 +432,16 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. +# https://www.riverbankcomputing.com/software) sources only. Doxygen will parse +# them like normal C++ but will assume all classes use public instead of private +# inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. +# Doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. 
@@ -398,7 +450,7 @@ SIP_SUPPORT = NO IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first +# tag is set to YES then Doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. @@ -456,21 +508,42 @@ TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The +# code, Doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# Doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest +# symbols. At the end of a run Doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 +# The NUM_PROC_THREADS specifies the number of threads Doxygen is allowed to use +# during processing. When set to 0 Doxygen will base this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. 
Since this is still an experimental feature the default is set to 1, +# which effectively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + +# If the TIMESTAMP tag is set different from NO then each generated page will +# contain the date or date and time when the page was generated. Setting this to +# NO can help when comparing the output of multiple runs. +# Possible values are: YES, NO, DATETIME and DATE. +# The default value is: NO. + +TIMESTAMP = NO + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# If the EXTRACT_ALL tag is set to YES, Doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. @@ -529,7 +602,14 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. 
@@ -537,22 +617,31 @@ EXTRACT_ANON_NSPACES = NO HIDE_UNDOC_MEMBERS = NO -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. +# will also hide undocumented C++ concepts if enabled. This option has no effect +# if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# If the HIDE_UNDOC_NAMESPACES tag is set to YES, Doxygen will hide all +# undocumented namespaces that are normally visible in the namespace hierarchy. +# If set to NO, these namespaces will be included in the various overviews. This +# option has no effect if EXTRACT_ALL is enabled. +# The default value is: YES. + +HIDE_UNDOC_NAMESPACES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all friend # declarations. If set to NO, these declarations will be included in the # documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = YES -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. @@ -566,30 +655,44 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case-sensitive file names. 
Windows -# (including Cygwin) ands Mac users are advised to set this option to NO. -# The default value is: system dependent. +# With the correct setting of option CASE_SENSE_NAMES Doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and macOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# Possible values are: SYSTEM, NO and YES. +# The default value is: SYSTEM. CASE_SENSE_NAMES = NO -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# If the HIDE_SCOPE_NAMES tag is set to NO then Doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO -# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then Doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class +# will show which file needs to be included to use the class. 
+# The default value is: YES. + +SHOW_HEADERFILE = YES + +# If the SHOW_INCLUDE_FILES tag is set to YES then Doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. @@ -602,7 +705,7 @@ SHOW_INCLUDE_FILES = YES SHOW_GROUPED_MEMB_INC = NO -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. @@ -614,14 +717,14 @@ FORCE_LOCAL_INCLUDES = NO INLINE_INFO = YES -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# If the SORT_MEMBER_DOCS tag is set to YES then Doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# If the SORT_BRIEF_DOCS tag is set to YES then Doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. @@ -629,7 +732,7 @@ SORT_MEMBER_DOCS = YES SORT_BRIEF_DOCS = NO -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then Doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. 
@@ -641,7 +744,7 @@ SORT_BRIEF_DOCS = NO SORT_MEMBERS_CTORS_1ST = NO -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# If the SORT_GROUP_NAMES tag is set to YES then Doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. @@ -658,11 +761,11 @@ SORT_GROUP_NAMES = NO SORT_BY_SCOPE_NAME = NO -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# If the STRICT_PROTO_MATCHING option is enabled and Doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# simple string match. By disabling STRICT_PROTO_MATCHING Doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. @@ -732,24 +835,25 @@ SHOW_FILES = YES SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from +# Doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via -# popen()) the command input-file, where command is the value of the +# popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file +# by Doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. 
FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated +# by Doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can +# that represents Doxygen's defaults, run Doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. +# will be used as the name of the layout file. See also section "Changing the +# layout of pages" for information. # -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# Note that if you run Doxygen from a directory containing a file called +# DoxygenLayout.xml, Doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = @@ -764,19 +868,35 @@ LAYOUT_FILE = CITE_BIB_FILES = +# The EXTERNAL_TOOL_PATH tag can be used to extend the search path (PATH +# environment variable) so that external tools such as latex and gs can be +# found. +# Note: Directories specified with EXTERNAL_TOOL_PATH are added in front of the +# path already specified by the PATH variable, and are added in the order +# specified. +# Note: This option is particularly useful for macOS version 14 (Sonoma) and +# higher, when running Doxygen from Doxywizard, because in this case any user- +# defined changes to the PATH are ignored. 
A typical example on macOS is to set +# EXTERNAL_TOOL_PATH = /Library/TeX/texbin /usr/local/bin +# together with the standard path, the full search path used by doxygen when +# launching external tools will then become +# PATH=/Library/TeX/texbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin + +EXTERNAL_TOOL_PATH = + #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the +# standard output by Doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = YES # The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# generated to standard error (stderr) by Doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. @@ -784,49 +904,97 @@ QUIET = YES WARNINGS = YES -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# If the WARN_IF_UNDOCUMENTED tag is set to YES then Doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. 
+# If the WARN_IF_DOC_ERROR tag is set to YES, Doxygen will generate warnings for +# potential errors in the documentation, such as documenting some parameters in +# a documented function twice, or documenting parameters that don't exist or +# using markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES +# If WARN_IF_INCOMPLETE_DOC is set to YES, Doxygen will warn about incomplete +# function parameter documentation. If set to NO, Doxygen will accept that some +# parameters have no documentation without warning. +# The default value is: YES. + +WARN_IF_INCOMPLETE_DOC = YES + # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. If -# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# value. If set to NO, Doxygen will only warn about wrong parameter +# documentation, but not about the absence of documentation. If EXTRACT_ALL is +# set to YES then this flag will automatically be disabled. See also +# WARN_IF_INCOMPLETE_DOC # The default value is: NO. WARN_NO_PARAMDOC = NO -# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. +# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, Doxygen will warn about +# undocumented enumeration values. If set to NO, Doxygen will accept +# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: NO. + +WARN_IF_UNDOC_ENUM_VAL = NO + +# If WARN_LAYOUT_FILE option is set to YES, Doxygen will warn about issues found +# while parsing the user defined layout file, such as missing or wrong elements. +# See also LAYOUT_FILE for details. If set to NO, problems with the layout file +# will be suppressed. 
+# The default value is: YES. + +WARN_LAYOUT_FILE = YES + +# If the WARN_AS_ERROR tag is set to YES then Doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then Doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the Doxygen process Doxygen will return with a non-zero status. +# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then Doxygen behaves +# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined Doxygen will not +# write the warning messages in between other messages but write them at the end +# of a run, in case a WARN_LOGFILE is defined the warning messages will be +# besides being in the defined file also be shown at the end of a run, unless +# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case +# the behavior will remain as with the setting FAIL_ON_WARNINGS. +# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. # The default value is: NO. WARN_AS_ERROR = YES -# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# The WARN_FORMAT tag determines the format of the warning messages that Doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. To make it easier to jump to this place +# (outside of Doxygen) the user can define a custom "cut" / "paste" string. 
+# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard -# error (stderr). +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). WARN_LOGFILE = @@ -844,29 +1012,42 @@ INPUT = ../src \ . # This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# that Doxygen parses. Internally Doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# See also: INPUT_FILE_ENCODING # The default value is: UTF-8. INPUT_ENCODING = UTF-8 +# This tag can be used to specify the character encoding of the source files +# that Doxygen parses. The INPUT_FILE_ENCODING tag can be used to specify +# character encoding on a per file pattern basis. Doxygen will compare the file +# name with each pattern and apply the encoding instead of the default +# INPUT_ENCODING if there is a match. The character encodings are a list of the +# form: pattern=encoding (like *.php=ISO-8859-1). +# See also: INPUT_ENCODING for further information on supported encodings. 
+ +INPUT_FILE_ENCODING = + # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. +# read by Doxygen. +# +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. # -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), -# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen -# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cxxm, +# *.cpp, *.cppm, *.ccm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, +# *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, +# *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to +# be provided as Doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.h \ *.hh \ @@ -890,7 +1071,7 @@ RECURSIVE = YES # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # -# Note that relative paths are relative to the directory from which doxygen is +# Note that relative paths are relative to the directory from which Doxygen is # run. 
EXCLUDE = ../src/arrow/vendored \ @@ -920,10 +1101,7 @@ EXCLUDE_PATTERNS = *-test.cc \ # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* +# ANamespace::AClass, ANamespace::*Test EXCLUDE_SYMBOLS = *::detail \ *::internal \ @@ -957,7 +1135,7 @@ EXAMPLE_RECURSIVE = NO IMAGE_PATH = -# The INPUT_FILTER tag can be used to specify a program that doxygen should +# The INPUT_FILTER tag can be used to specify a program that Doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # @@ -972,9 +1150,14 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # +# Note that Doxygen will use the data processed and written to standard output +# for further processing, therefore nothing else, like debug statements or used +# commands (so in case of a Windows batch file always use @echo OFF), should be +# written to standard output. +# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. +# properly processed by Doxygen. INPUT_FILTER = @@ -987,7 +1170,7 @@ INPUT_FILTER = # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. +# properly processed by Doxygen. 
FILTER_PATTERNS = @@ -1009,10 +1192,28 @@ FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. +# and want to reuse the introduction page also for the Doxygen output. USE_MDFILE_AS_MAINPAGE = +# If the IMPLICIT_DIR_DOCS tag is set to YES, any README.md file found in sub- +# directories of the project's root, is used as the documentation for that sub- +# directory, except when the README.md starts with a \dir, \page or \mainpage +# command. If set to NO, the README.md file needs to start with an explicit \dir +# command in order to be used as directory documentation. +# The default value is: YES. + +IMPLICIT_DIR_DOCS = YES + +# The Fortran standard specifies that for fixed formatted Fortran code all +# characters from position 72 are to be considered as comment. A common +# extension is to allow longer lines before the automatic comment starts. The +# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can +# be processed before the automatic comment starts. +# Minimum value: 7, maximum value: 10000, default value: 72. + +FORTRAN_COMMENT_AFTER = 72 + #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- @@ -1027,12 +1228,13 @@ USE_MDFILE_AS_MAINPAGE = SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. +# multi-line macros, enums or list initialized variables directly into the +# documentation. # The default value is: NO. 
INLINE_SOURCES = NO -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct Doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. @@ -1070,7 +1272,7 @@ REFERENCES_LINK_SOURCE = YES SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# point to the HTML generated by the htags(1) tool instead of Doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. @@ -1084,14 +1286,14 @@ SOURCE_TOOLTIPS = YES # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # -# The result: instead of the source browser generated by doxygen, the links to +# The result: instead of the source browser generated by Doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# If the VERBATIM_HEADERS tag is set to YES then Doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. @@ -1110,17 +1312,11 @@ VERBATIM_HEADERS = YES ALPHABETICAL_INDEX = YES -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
- -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. +# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) +# that should be ignored while generating the index headers. The IGNORE_PREFIX +# tag works for classes, function and member names. The entity will be placed in +# the alphabetical list under the first letter of the entity name that remains +# after removing the prefix. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = @@ -1129,7 +1325,7 @@ IGNORE_PREFIX = # Configuration options related to the HTML output #--------------------------------------------------------------------------- -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# If the GENERATE_HTML tag is set to YES, Doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES @@ -1150,40 +1346,40 @@ HTML_OUTPUT = html HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a +# each generated HTML page. If the tag is left blank Doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. +# that Doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. 
See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally +# for information on how to generate the default header that Doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description +# default header when upgrading to a newer version of Doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard +# generated HTML page. If the tag is left blank Doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. +# that Doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = footer.html # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. +# the HTML output. If left blank Doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. +# sheet that Doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. 
@@ -1193,13 +1389,18 @@ HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. +# created by Doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. +# list). +# Note: Since the styling of scrollbars can currently not be overruled in +# Webkit/Chromium, the styling will be left out of the default doxygen.css if +# one or more extra stylesheets have been specified. So if scrollbar +# customization is desired it has to be added explicitly. For an example see the +# documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = @@ -1214,9 +1415,22 @@ HTML_EXTRA_STYLESHEET = HTML_EXTRA_FILES = +# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output +# should be rendered with a dark or light theme. +# Possible values are: LIGHT always generates light mode output, DARK always +# generates dark mode output, AUTO_LIGHT automatically sets the mode according +# to the user preference, uses light mode if no preference is set (the default), +# AUTO_DARK automatically sets the mode according to the user preference, uses +# dark mode if no preference is set and TOGGLE allows a user to switch between +# light and dark mode via a button. +# The default value is: AUTO_LIGHT. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_COLORSTYLE = AUTO_LIGHT + # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see +# this color. Hue is specified as an angle on a color-wheel, see # https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. @@ -1226,7 +1440,7 @@ HTML_EXTRA_FILES = HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A +# in the HTML output. For a value of 0 the output will use gray-scales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1244,15 +1458,6 @@ HTML_COLORSTYLE_SAT = 100 HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will @@ -1272,6 +1477,33 @@ HTML_DYNAMIC_MENUS = YES HTML_DYNAMIC_SECTIONS = NO +# If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be +# dynamically folded and expanded in the generated HTML source code. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_CODE_FOLDING = YES + +# If the HTML_COPY_CLIPBOARD tag is set to YES then Doxygen will show an icon in +# the top right corner of code and text fragments that allows the user to copy +# its content to the clipboard. Note this only works if supported by the browser +# and the web page is served via a secure context (see: +# https://www.w3.org/TR/secure-contexts/), i.e. using the https: or file: +# protocol. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COPY_CLIPBOARD = YES + +# Doxygen stores a couple of settings persistently in the browser (via e.g. +# cookies). By default these settings apply to all HTML pages generated by +# Doxygen across all projects. The HTML_PROJECT_COOKIE tag can be used to store +# the settings under a project specific key, such that the user preferences will +# be stored separately. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_PROJECT_COOKIE = + # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to @@ -1287,10 +1519,11 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/xcode/), introduced with OSX -# 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, Doxygen will generate a Makefile in the HTML +# output directory. 
Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. @@ -1307,6 +1540,13 @@ GENERATE_DOCSET = NO DOCSET_FEEDNAME = "Doxygen generated docs" +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. @@ -1329,14 +1569,18 @@ DOCSET_PUBLISHER_ID = org.doxygen.Publisher DOCSET_PUBLISHER_NAME = Publisher -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# If the GENERATE_HTMLHELP tag is set to YES then Doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# on Windows. In the beginning of 2021 Microsoft took the original page, with +# a.o. the download links, offline (the HTML help workshop was already many +# years in maintenance mode). You can download the HTML help workshop from the +# web archives at Installation executable (see: +# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo +# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). # # The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). 
Compiled HTML +# generated by Doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for @@ -1356,14 +1600,14 @@ CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, -# doxygen will try to run the HTML help compiler on the generated index.hhp. +# Doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). +# (YES) or that it should be included in the main .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. @@ -1390,6 +1634,16 @@ BINARY_TOC = NO TOC_EXPAND = NO +# The SITEMAP_URL tag is used to specify the full URL of the place where the +# generated documentation will be placed on the server by the user during the +# deployment of the documentation. The generated sitemap is called sitemap.xml +# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL +# is specified no sitemap is generated. For information about the sitemap +# protocol see https://www.sitemaps.org +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +SITEMAP_URL = + # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help @@ -1408,7 +1662,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1416,8 +1671,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1425,16 +1680,16 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). 
# This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = @@ -1446,9 +1701,9 @@ QHP_CUST_FILTER_ATTRS = QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty Doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1479,7 +1734,7 @@ ECLIPSE_DOC_ID = org.doxygen.Project # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. +# The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO @@ -1491,18 +1746,30 @@ DISABLE_INDEX = NO # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. +# further fine tune the look of the index (see "Fine-tuning the output"). As an +# example, the default style sheet generated by Doxygen has an example that +# shows how to put an image at the root of the tree instead of the PROJECT_NAME. 
+# Since the tree basically has the same information as the tab index, you could +# consider setting DISABLE_INDEX to YES when enabling this option. +# The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO +# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the +# FULL_SIDEBAR option determines if the side bar is limited to only the treeview +# area (value NO) or if it should extend to the full height of the window (value +# YES). Setting this to YES gives a layout similar to +# https://docs.readthedocs.io with more room for contents, but less room for the +# project logo, title, and description. If either GENERATE_TREEVIEW or +# DISABLE_INDEX is set to NO, this option has no effect. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FULL_SIDEBAR = NO + # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. +# Doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. @@ -1511,6 +1778,12 @@ GENERATE_TREEVIEW = NO ENUM_VALUES_PER_LINE = 4 +# When the SHOW_ENUM_VALUES tag is set doxygen will show the specified +# enumeration values besides the enumeration mnemonics. +# The default value is: NO. + +SHOW_ENUM_VALUES = NO + # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. @@ -1518,19 +1791,26 @@ ENUM_VALUES_PER_LINE = 4 TREEVIEW_WIDTH = 250 -# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# If the EXT_LINKS_IN_WINDOW option is set to YES, Doxygen will open links to # external symbols imported via tag files in a separate window. 
# The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO -# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# If the OBFUSCATE_EMAILS tag is set to YES, Doxygen will obfuscate email +# addresses. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +OBFUSCATE_EMAILS = YES + +# If the HTML_FORMULA_FORMAT option is set to svg, Doxygen will use the pdf2svg # tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see # https://inkscape.org) to generate formulas as SVG images instead of PNGs for # the HTML output. These images will generally look nicer at scaled resolutions. -# Possible values are: png The default and svg Looks nicer but requires the -# pdf2svg tool. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). # The default value is: png. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1538,24 +1818,13 @@ HTML_FORMULA_FORMAT = png # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML +# Doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. # Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - # The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands # to create new LaTeX commands to be used in formulas as building blocks. See # the section "Including formulas" for details. @@ -1573,11 +1842,29 @@ FORMULA_MACROFILE = USE_MATHJAX = NO +# With MATHJAX_VERSION it is possible to specify the MathJax version to be used. +# Note that the different versions of MathJax have different requirements with +# regards to the different settings, so it is possible that also other MathJax +# settings have to be changed when switching between the different MathJax +# versions. +# Possible values are: MathJax_2 and MathJax_3. +# The default value is: MathJax_2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_VERSION = MathJax_2 + # When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# the MathJax output. For more details about the output format see MathJax +# version 2 (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 +# (see: +# http://docs.mathjax.org/en/latest/web/components/output.html). # Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. +# compatibility. This is the name for Mathjax version 2, for MathJax version 3 +# this will be translated into chtml), NativeMML (i.e. MathML. Only supported +# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This +# is the name for Mathjax version 3, for MathJax version 2 this will be +# translated into HTML-CSS) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1590,33 +1877,40 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. 
The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from https://www.mathjax.org before deployment. -# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. +# MathJax from https://www.mathjax.org before deployment. The default value is: +# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 +# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example +# for MathJax version 2 (see +# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions): # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# For example for MathJax version 3 (see +# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): +# MATHJAX_EXTENSIONS = ams # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = -# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# The MATHJAX_CODEFILE tag can be used to specify a file with JavaScript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = -# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -# the HTML output. The underlying search engine uses javascript and DHTML and +# When the SEARCHENGINE tag is enabled Doxygen will generate a search box for +# the HTML output. 
The underlying search engine uses JavaScript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. -# For large projects the javascript based search engine can be slow, then +# For large projects the JavaScript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use + S # (what the is depends on the OS and browser, but it is typically @@ -1635,7 +1929,7 @@ SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be # implemented using a web server instead of a web client using JavaScript. There # are two flavors of web server based searching depending on the EXTERNAL_SEARCH -# setting. When disabled, doxygen will generate a PHP script for searching and +# setting. When disabled, Doxygen will generate a PHP script for searching and # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing # and searching needs to be provided by external tools. See the section # "External Indexing and Searching" for details. @@ -1644,7 +1938,7 @@ SEARCHENGINE = YES SERVER_BASED_SEARCH = NO -# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP +# When EXTERNAL_SEARCH tag is enabled Doxygen will no longer generate the PHP # script for searching. Instead the search results are written to an XML file # which needs to be processed by an external indexer. Doxygen will invoke an # external search engine pointed to by the SEARCHENGINE_URL option to obtain the @@ -1652,7 +1946,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). 
+# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1665,8 +1960,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. SEARCHENGINE_URL = @@ -1687,7 +1983,7 @@ SEARCHDATA_FILE = searchdata.xml EXTERNAL_SEARCH_ID = -# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen +# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through Doxygen # projects other than the one defined by this configuration file, but that are # all added to the same external search index. Each project needs to have a # unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of @@ -1701,7 +1997,7 @@ EXTRA_SEARCH_MAPPINGS = # Configuration options related to the LaTeX output #--------------------------------------------------------------------------- -# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output. +# If the GENERATE_LATEX tag is set to YES, Doxygen will generate LaTeX output. # The default value is: YES. GENERATE_LATEX = NO @@ -1746,7 +2042,7 @@ MAKEINDEX_CMD_NAME = makeindex LATEX_MAKEINDEX_CMD = makeindex -# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX +# If the COMPACT_LATEX tag is set to YES, Doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. # The default value is: NO. 
@@ -1775,36 +2071,38 @@ PAPER_TYPE = a4 EXTRA_PACKAGES = -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the -# generated LaTeX document. The header should contain everything until the first -# chapter. If it is left blank doxygen will generate a standard header. See -# section "Doxygen usage" for information on how to let doxygen write the -# default header to a separate file. +# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for +# the generated LaTeX document. The header should contain everything until the +# first chapter. If it is left blank Doxygen will generate a standard header. It +# is highly recommended to start with a default header using +# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty +# and then modify the file new_header.tex. See also section "Doxygen usage" for +# information on how to generate the default header that Doxygen normally uses. # -# Note: Only use a user-defined header if you know what you are doing! The -# following commands have a special meaning inside the header: $title, -# $datetime, $date, $doxygenversion, $projectname, $projectnumber, -# $projectbrief, $projectlogo. Doxygen will replace $title with the empty -# string, for the replacement values of the other commands the user is referred -# to HTML_HEADER. +# Note: Only use a user-defined header if you know what you are doing! +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of Doxygen. The following +# commands have a special meaning inside the header (and footer): For a +# description of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_HEADER = -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the -# generated LaTeX document. The footer should contain everything after the last -# chapter. 
If it is left blank doxygen will generate a standard footer. See +# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for +# the generated LaTeX document. The footer should contain everything after the +# last chapter. If it is left blank Doxygen will generate a standard footer. See # LATEX_HEADER for more information on how to generate a default footer and what -# special commands can be used inside the footer. -# -# Note: Only use a user-defined footer if you know what you are doing! +# special commands can be used inside the footer. See also section "Doxygen +# usage" for information on how to generate the default footer that Doxygen +# normally uses. Note: Only use a user-defined footer if you know what you are +# doing! # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created -# by doxygen. Using this option one can overrule certain style aspects. Doxygen +# by Doxygen. Using this option one can overrule certain style aspects. Doxygen # will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the @@ -1830,56 +2128,46 @@ LATEX_EXTRA_FILES = PDF_HYPERLINKS = YES -# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate -# the PDF file directly from the LaTeX files. Set this option to YES, to get a -# higher quality PDF documentation. +# If the USE_PDFLATEX tag is set to YES, Doxygen will use the engine as +# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX +# files. Set this option to YES, to get a higher quality PDF documentation. +# +# See also section LATEX_CMD_NAME for selecting the engine. # The default value is: YES. 
# This tag requires that the tag GENERATE_LATEX is set to YES. USE_PDFLATEX = YES -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode -# command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. This option is also used -# when generating formulas in HTML. +# The LATEX_BATCHMODE tag signals the behavior of LaTeX in case of an error. +# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch +# mode nothing is printed on the terminal, errors are scrolled as if is +# hit at every error; missing files that TeX tries to input or request from +# keyboard input (\read on a not open input stream) cause the job to abort, +# NON_STOP In nonstop mode the diagnostic message will appear on the terminal, +# but there is no possibility of user interaction just like in batch mode, +# SCROLL In scroll mode, TeX will stop only for missing files to input or if +# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at +# each error, asking for user intervention. # The default value is: NO. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_BATCHMODE = NO -# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the +# If the LATEX_HIDE_INDICES tag is set to YES then Doxygen will not include the # index chapters (such as File Index, Compound Index, etc.) in the output. # The default value is: NO. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_HIDE_INDICES = NO -# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source -# code with syntax highlighting in the LaTeX output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. 
- -LATEX_SOURCE_CODE = NO - # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See # https://en.wikipedia.org/wiki/BibTeX and \cite for more info. -# The default value is: plain. +# The default value is: plainnat. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. If left blank the @@ -1892,7 +2180,7 @@ LATEX_EMOJI_DIRECTORY = # Configuration options related to the RTF output #--------------------------------------------------------------------------- -# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The +# If the GENERATE_RTF tag is set to YES, Doxygen will generate RTF output. The # RTF output is optimized for Word 97 and may not look too pretty with other RTF # readers/editors. # The default value is: NO. @@ -1907,7 +2195,7 @@ GENERATE_RTF = NO RTF_OUTPUT = rtf -# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF +# If the COMPACT_RTF tag is set to YES, Doxygen generates more compact RTF # documents. This may be useful for small projects and may help to save some # trees in general. # The default value is: NO. @@ -1927,38 +2215,36 @@ COMPACT_RTF = NO RTF_HYPERLINKS = NO -# Load stylesheet definitions from file. Syntax is similar to doxygen's +# Load stylesheet definitions from file. Syntax is similar to Doxygen's # configuration file, i.e. a series of assignments. 
You only have to provide # replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the -# default style sheet that doxygen normally uses. +# default style sheet that Doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's configuration file. A template extensions file can be +# similar to Doxygen's configuration file. A template extensions file can be # generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = -# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code -# with syntax highlighting in the RTF output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. +# The RTF_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the RTF_OUTPUT output directory. +# Note that the files will be copied as-is; there are no commands or markers +# available. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_SOURCE_CODE = NO +RTF_EXTRA_FILES = #--------------------------------------------------------------------------- # Configuration options related to the man page output #--------------------------------------------------------------------------- -# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for +# If the GENERATE_MAN tag is set to YES, Doxygen will generate man pages for # classes and files. # The default value is: NO. 
@@ -1989,7 +2275,7 @@ MAN_EXTENSION = .3 MAN_SUBDIR = -# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, then it # will generate one additional man file for each entity documented in the real # man page(s). These additional files only source the real man page, but without # them the man command would be unable to find the correct page. @@ -2002,7 +2288,7 @@ MAN_LINKS = NO # Configuration options related to the XML output #--------------------------------------------------------------------------- -# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that +# If the GENERATE_XML tag is set to YES, Doxygen will generate an XML file that # captures the structure of the code including all documentation. # The default value is: NO. @@ -2016,7 +2302,7 @@ GENERATE_XML = YES XML_OUTPUT = xml -# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program +# If the XML_PROGRAMLISTING tag is set to YES, Doxygen will dump the program # listings (including syntax highlighting and cross-referencing information) to # the XML output. Note that enabling this will significantly increase the size # of the XML output. @@ -2025,7 +2311,7 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES -# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, Doxygen will include # namespace members in file scope as well, matching the HTML output. # The default value is: NO. # This tag requires that the tag GENERATE_XML is set to YES. 
@@ -2036,7 +2322,7 @@ XML_NS_MEMB_FILE_SCOPE = NO # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- -# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files +# If the GENERATE_DOCBOOK tag is set to YES, Doxygen will generate Docbook files # that can be used to generate PDF. # The default value is: NO. @@ -2050,32 +2336,49 @@ GENERATE_DOCBOOK = NO DOCBOOK_OUTPUT = docbook -# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the -# program listings (including syntax highlighting and cross-referencing -# information) to the DOCBOOK output. Note that enabling this will significantly -# increase the size of the DOCBOOK output. -# The default value is: NO. -# This tag requires that the tag GENERATE_DOCBOOK is set to YES. - -DOCBOOK_PROGRAMLISTING = NO - #--------------------------------------------------------------------------- # Configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- -# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# If the GENERATE_AUTOGEN_DEF tag is set to YES, Doxygen will generate an +# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures # the structure of the code including all documentation. Note that this feature # is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# Configuration options related to Sqlite3 output +#--------------------------------------------------------------------------- + +# If the GENERATE_SQLITE3 tag is set to YES Doxygen will generate a Sqlite3 +# database with symbols found by Doxygen stored in tables. +# The default value is: NO. 
+ +GENERATE_SQLITE3 = NO + +# The SQLITE3_OUTPUT tag is used to specify where the Sqlite3 database will be +# put. If a relative path is entered the value of OUTPUT_DIRECTORY will be put +# in front of it. +# The default directory is: sqlite3. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_OUTPUT = sqlite3 + +# The SQLITE3_RECREATE_DB tag is set to YES, the existing doxygen_sqlite3.db +# database file will be recreated with each Doxygen run. If set to NO, Doxygen +# will warn if a database file is already found and not modify it. +# The default value is: YES. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_RECREATE_DB = YES + #--------------------------------------------------------------------------- # Configuration options related to the Perl module output #--------------------------------------------------------------------------- -# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module +# If the GENERATE_PERLMOD tag is set to YES, Doxygen will generate a Perl module # file that captures the structure of the code including all documentation. # # Note that this feature is still experimental and incomplete at the moment. @@ -2083,7 +2386,7 @@ GENERATE_AUTOGEN_DEF = NO GENERATE_PERLMOD = NO -# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary +# If the PERLMOD_LATEX tag is set to YES, Doxygen will generate the necessary # Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI # output from the Perl module output. # The default value is: NO. @@ -2113,13 +2416,13 @@ PERLMOD_MAKEVAR_PREFIX = # Configuration options related to the preprocessor #--------------------------------------------------------------------------- -# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all +# If the ENABLE_PREPROCESSING tag is set to YES, Doxygen will evaluate all # C-preprocessor directives found in the sources and include files. 
# The default value is: YES. ENABLE_PREPROCESSING = YES -# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names +# If the MACRO_EXPANSION tag is set to YES, Doxygen will expand all macro names # in the source code. If set to NO, only conditional compilation will be # performed. Macro expansion can be done in a controlled way by setting # EXPAND_ONLY_PREDEF to YES. @@ -2145,7 +2448,8 @@ SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by the -# preprocessor. +# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of +# RECURSIVE has no effect here. # This tag requires that the tag SEARCH_INCLUDES is set to YES. INCLUDE_PATH = @@ -2189,7 +2493,7 @@ PREDEFINED = __attribute__(x)= \ EXPAND_AS_DEFINED = -# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will +# If the SKIP_FUNCTION_MACROS tag is set to YES then Doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have # an all uppercase name, and do not end with a semicolon. Such function macros # are typically used for boiler-plate code, and will confuse the parser if not @@ -2213,26 +2517,26 @@ SKIP_FUNCTION_MACROS = YES # section "Linking to external documentation" for more information about the use # of tag files. # Note: Each tag file must have a unique name (where the name does NOT include -# the path). If a tag file is not located in the directory in which doxygen is +# the path). If a tag file is not located in the directory in which Doxygen is # run, you must also specify the path to the tagfile here. TAGFILES = -# When a file name is specified after GENERATE_TAGFILE, doxygen will create a +# When a file name is specified after GENERATE_TAGFILE, Doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to # external documentation" for more information about the usage of tag files. GENERATE_TAGFILE = -# If the ALLEXTERNALS tag is set to YES, all external class will be listed in -# the class index. If set to NO, only the inherited external classes will be -# listed. +# If the ALLEXTERNALS tag is set to YES, all external classes and namespaces +# will be listed in the class and namespace index. If set to NO, only the +# inherited external classes will be listed. # The default value is: NO. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will be +# in the topic index. If set to NO, only the current project's groups will be # listed. # The default value is: YES. @@ -2246,42 +2550,26 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES #--------------------------------------------------------------------------- -# Configuration options related to the dot tool +# Configuration options related to diagram generator tools #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram -# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to -# NO turns the diagrams off. Note that this option also works with HAVE_DOT -# disabled, but it is recommended to install and use dot, since it yields more -# powerful graphs. -# The default value is: YES. - -CLASS_DIAGRAMS = YES - -# You can include diagrams made with dia in doxygen documentation. Doxygen will -# then run dia to produce the diagram and insert it in the documentation. The -# DIA_PATH tag allows you to specify the directory where the dia binary resides. -# If left empty dia is assumed to be found in the default search path. 
- -DIA_PATH = - # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. # The default value is: YES. HIDE_UNDOC_RELATIONS = YES -# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# If you set the HAVE_DOT tag to YES then Doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent +# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO # The default value is: NO. HAVE_DOT = NO -# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed -# to run in parallel. When set to 0 doxygen will base this on the number of +# The DOT_NUM_THREADS specifies the number of dot invocations Doxygen is allowed +# to run in parallel. When set to 0 Doxygen will base this on the number of # processors available in the system. You can set it explicitly to a value # larger than 0 to get control over the balance between CPU load and processing # speed. @@ -2290,55 +2578,83 @@ HAVE_DOT = NO DOT_NUM_THREADS = 0 -# When you want a differently looking font in the dot files that doxygen -# generates you can specify the font name using DOT_FONTNAME. You need to make -# sure dot is able to find the font, which can be done by putting it in a -# standard location or by setting the DOTFONTPATH environment variable or by -# setting DOT_FONTPATH to the directory containing the font. -# The default value is: Helvetica. +# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of +# subgraphs. When you want a differently looking font in the dot files that +# Doxygen generates you can specify fontname, fontcolor and fontsize attributes. 
+# For details please see Node, +# Edge and Graph Attributes specification You need to make sure dot is able +# to find the font, which can be done by putting it in a standard location or by +# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. Default graphviz fontsize is 14. +# The default value is: fontname=Helvetica,fontsize=10. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_COMMON_ATTR = "fontname=Helvetica,fontsize=10" + +# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can +# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. Complete documentation about +# arrows shapes. +# The default value is: labelfontname=Helvetica,labelfontsize=10. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTNAME = Helvetica +DOT_EDGE_ATTR = "labelfontname=Helvetica,labelfontsize=10" -# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of -# dot graphs. -# Minimum value: 4, maximum value: 24, default value: 10. +# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes +# around nodes set 'shape=plain' or 'shape=plaintext' Shapes specification +# The default value is: shape=box,height=0.2,width=0.4. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTSIZE = 10 +DOT_NODE_ATTR = "shape=box,height=0.2,width=0.4" -# By default doxygen will tell dot to use the default font as specified with -# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set -# the path where dot can find it using this tag. +# You can set the path where dot can find font specified with fontname in +# DOT_COMMON_ATTR and others dot attributes. # This tag requires that the tag HAVE_DOT is set to YES. DOT_FONTPATH = -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. 
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then Doxygen will +# generate a graph for each documented class showing the direct and indirect +# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and +# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case +# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the +# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used. +# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance +# relations will be shown as texts / links. Explicit enabling an inheritance +# graph or choosing a different representation for an inheritance graph of a +# specific class, can be accomplished by means of the command \inheritancegraph. +# Disabling an inheritance graph can be accomplished by means of the command +# \hideinheritancegraph. +# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. CLASS_GRAPH = YES -# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a +# If the COLLABORATION_GRAPH tag is set to YES then Doxygen will generate a # graph for each documented class showing the direct and indirect implementation # dependencies (inheritance, containment, and class references variables) of the -# class with other documented classes. +# class with other documented classes. Explicit enabling a collaboration graph, +# when COLLABORATION_GRAPH is set to NO, can be accomplished by means of the +# command \collaborationgraph. Disabling a collaboration graph can be +# accomplished by means of the command \hidecollaborationgraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. 
COLLABORATION_GRAPH = YES -# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for -# groups, showing the direct groups dependencies. +# If the GROUP_GRAPHS tag is set to YES then Doxygen will generate a graph for +# groups, showing the direct groups dependencies. Explicit enabling a group +# dependency graph, when GROUP_GRAPHS is set to NO, can be accomplished by means +# of the command \groupgraph. Disabling a directory graph can be accomplished by +# means of the command \hidegroupgraph. See also the chapter Grouping in the +# manual. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GROUP_GRAPHS = YES -# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and +# If the UML_LOOK tag is set to YES, Doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. # The default value is: NO. @@ -2355,10 +2671,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, Doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, Doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, Doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. 
+ +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. If the actual line length exceeds this threshold +# significantly it will be wrapped across multiple lines. Some heuristics are +# applied to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. @@ -2368,24 +2706,29 @@ UML_LIMIT_NUM_FIELDS = 10 TEMPLATE_RELATIONS = NO # If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to -# YES then doxygen will generate a graph for each documented file showing the +# YES then Doxygen will generate a graph for each documented file showing the # direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an include graph, when INCLUDE_GRAPH is is set to NO, +# can be accomplished by means of the command \includegraph. Disabling an +# include graph can be accomplished by means of the command \hideincludegraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. INCLUDE_GRAPH = YES # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are -# set to YES then doxygen will generate a graph for each documented file showing +# set to YES then Doxygen will generate a graph for each documented file showing # the direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an included by graph, when INCLUDED_BY_GRAPH is set +# to NO, can be accomplished by means of the command \includedbygraph. Disabling +# an included by graph can be accomplished by means of the command +# \hideincludedbygraph. # The default value is: YES. 
# This tag requires that the tag HAVE_DOT is set to YES. INCLUDED_BY_GRAPH = YES -# If the CALL_GRAPH tag is set to YES then doxygen will generate a call +# If the CALL_GRAPH tag is set to YES then Doxygen will generate a call # dependency graph for every global function or class method. # # Note that enabling this option will significantly increase the time of a run. @@ -2397,7 +2740,7 @@ INCLUDED_BY_GRAPH = YES CALL_GRAPH = NO -# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller +# If the CALLER_GRAPH tag is set to YES then Doxygen will generate a caller # dependency graph for every global function or class method. # # Note that enabling this option will significantly increase the time of a run. @@ -2409,44 +2752,59 @@ CALL_GRAPH = NO CALLER_GRAPH = NO -# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical +# If the GRAPHICAL_HIERARCHY tag is set to YES then Doxygen will graphical # hierarchy of all classes instead of a textual one. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GRAPHICAL_HIERARCHY = YES -# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the +# If the DIRECTORY_GRAPH tag is set to YES then Doxygen will show the # dependencies a directory has on other directories in a graphical way. The # dependency relations are determined by the #include relations between the -# files in the directories. +# files in the directories. Explicit enabling a directory graph, when +# DIRECTORY_GRAPH is set to NO, can be accomplished by means of the command +# \directorygraph. Disabling a directory graph can be accomplished by means of +# the command \hidedirectorygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. DIRECTORY_GRAPH = YES +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels +# of child directories generated in directory dependency graphs by dot. 
+# Minimum value: 1, maximum value: 25, default value: 1. +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. + +DIR_GRAPH_MAX_DEPTH = 1 + # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). -# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order -# to make the SVG files visible in IE 9+ (other browsers do not have this -# requirement). +# https://www.graphviz.org/)). +# +# Note the formats svg:cairo and svg:cairo:cairo cannot be used in combination +# with INTERACTIVE_SVG (the INTERACTIVE_SVG will be set to NO). # Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, -# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and -# png:gdiplus:gdiplus. +# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus, +# png:gdiplus:gdiplus, svg:cairo, svg:cairo:cairo, svg:svg, svg:svg:core, +# gif:cairo, gif:cairo:gd, gif:cairo:gdiplus, gif:gdiplus, gif:gdiplus:gdiplus, +# gif:gd, gif:gd:gd, jpg:cairo, jpg:cairo:gd, jpg:cairo:gdiplus, jpg:gd, +# jpg:gd:gd, jpg:gdiplus and jpg:gdiplus:gdiplus. # The default value is: png. # This tag requires that the tag HAVE_DOT is set to YES. DOT_IMAGE_FORMAT = png -# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to -# enable generation of interactive SVG images that allow zooming and panning. +# If DOT_IMAGE_FORMAT is set to svg or svg:svg or svg:svg:core, then this option +# can be set to YES to enable generation of interactive SVG images that allow +# zooming and panning. # # Note that this requires a modern browser other than Internet Explorer. Tested # and working are Firefox, Chrome, Safari, and Opera. -# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make -# the SVG files visible. 
Older versions of IE do not have SVG support. +# +# Note This option will be automatically disabled when DOT_IMAGE_FORMAT is set +# to svg:cairo or svg:cairo:cairo. # The default value is: NO. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2465,11 +2823,12 @@ DOT_PATH = DOTFILE_DIRS = -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the \mscfile -# command). +# You can include diagrams made with dia in Doxygen documentation. Doxygen will +# then run dia to produce the diagram and insert it in the documentation. The +# DIA_PATH tag allows you to specify the directory where the dia binary resides. +# If left empty dia is assumed to be found in the default search path. -MSCFILE_DIRS = +DIA_PATH = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile @@ -2477,28 +2836,34 @@ MSCFILE_DIRS = DIAFILE_DIRS = -# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. +# When using PlantUML, the PLANTUML_JAR_PATH tag should be used to specify the +# path where java can find the plantuml.jar file or to the filename of jar file +# to be used. If left blank, it is assumed PlantUML is not used or called during +# a preprocessing step. Doxygen will generate a warning when it encounters a +# \startuml command in this case and will not generate output for the diagram. PLANTUML_JAR_PATH = -# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a -# configuration file for plantuml. 
+# When using PlantUML, the PLANTUML_CFG_FILE tag can be used to specify a +# configuration file for PlantUML. PLANTUML_CFG_FILE = -# When using plantuml, the specified paths are searched for files specified by -# the !include statement in a plantuml block. +# When using PlantUML, the specified paths are searched for files specified by +# the !include statement in a PlantUML block. PLANTUML_INCLUDE_PATH = +# The PLANTUMLFILE_DIRS tag can be used to specify one or more directories that +# contain PlantUml files that are included in the documentation (see the +# \plantumlfile command). + +PLANTUMLFILE_DIRS = + # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes -# larger than this value, doxygen will truncate the graph, which is visualized -# by representing a node as a red box. Note that doxygen if the number of direct +# larger than this value, Doxygen will truncate the graph, which is visualized +# by representing a node as a red box. Note that if the number of direct # children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that # the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. @@ -2519,18 +2884,6 @@ DOT_GRAPH_MAX_NODES = 50 MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not seem -# to support this out of the box. -# -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_TRANSPARENT = NO - # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). 
This # makes dot run faster, but since only newer versions of dot (>1.8.10) support @@ -2540,17 +2893,37 @@ DOT_TRANSPARENT = NO DOT_MULTI_TARGETS = NO -# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page +# If the GENERATE_LEGEND tag is set to YES Doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. +# Note: This tag requires that UML_LOOK isn't set, i.e. the Doxygen internal +# graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, Doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc temporary +# files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES + +# You can define message sequence charts within Doxygen comments using the \msc +# command. If the MSCGEN_TOOL tag is left empty (the default), then Doxygen will +# use a built-in version of mscgen tool to produce the charts. Alternatively, +# the MSCGEN_TOOL tag can also specify the name an external tool. For instance, +# specifying prog as the value, Doxygen will call the tool as prog -T +# -o . The external tool should support +# output file formats "png", "eps", "svg", and "ismap". + +MSCGEN_TOOL = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the \mscfile +# command). 
+ +MSCFILE_DIRS = diff --git a/cpp/build-support/asan-suppressions.txt b/cpp/build-support/asan-suppressions.txt new file mode 100644 index 00000000000..553706e045f --- /dev/null +++ b/cpp/build-support/asan-suppressions.txt @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Note this file is merely a placeholder that contains no suppressions for now. +# But it may become useful in the future. 
\ No newline at end of file diff --git a/cpp/build-support/asan_symbolize.py b/cpp/build-support/asan_symbolize.py index 8bab72b661e..5328904581c 100755 --- a/cpp/build-support/asan_symbolize.py +++ b/cpp/build-support/asan_symbolize.py @@ -171,7 +171,7 @@ def symbolize(self, addr, binary, offset): # foo(type1, type2) (in object.name) (filename.cc:80) match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line) if DEBUG: - print('atos_line: {0}'.format(atos_line)) + print(f'atos_line: {atos_line}') if match: function_name = match.group(1) function_name = re.sub(r'\(.*?\)', '', function_name) diff --git a/cpp/build-support/cpplint.py b/cpp/build-support/cpplint.py deleted file mode 100755 index dc3d47ba8b4..00000000000 --- a/cpp/build-support/cpplint.py +++ /dev/null @@ -1,6927 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2009 Google Inc. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -"""Does google-lint on c++ files. - -The goal of this script is to identify places in the code that *may* -be in non-compliance with google style. It does not attempt to fix -up these problems -- the point is to educate. It does also not -attempt to find all problems, or to ensure that everything it does -find is legitimately a problem. - -In particular, we can get very confused by /* and // inside strings! -We do a small hack, which is to ignore //'s with "'s after them on the -same line, but it is far from perfect (in either direction). -""" - -# cpplint predates fstrings -# pylint: disable=consider-using-f-string - -# pylint: disable=invalid-name - -import codecs -import copy -import getopt -import glob -import itertools -import math # for log -import os -import re -import sre_compile -import string -import sys -import sysconfig -import unicodedata -import xml.etree.ElementTree - -# if empty, use defaults -_valid_extensions = set([]) - -__VERSION__ = '1.6.1' - -try: - # -- pylint: disable=used-before-assignment - xrange # Python 2 -except NameError: - # -- pylint: disable=redefined-builtin - xrange = range # Python 3 - - -_USAGE = """ -Syntax: cpplint.py [--verbose=#] [--output=emacs|eclipse|vs7|junit|sed|gsed] - [--filter=-x,+y,...] - [--counting=total|toplevel|detailed] [--root=subdir] - [--repository=path] - [--linelength=digits] [--headers=x,y,...] - [--recursive] - [--exclude=path] - [--extensions=hpp,cpp,...] 
- [--includeorder=default|standardcfirst] - [--quiet] - [--version] - [file] ... - - Style checker for C/C++ source files. - This is a fork of the Google style checker with minor extensions. - - The style guidelines this tries to follow are those in - https://google.github.io/styleguide/cppguide.html - - Every problem is given a confidence score from 1-5, with 5 meaning we are - certain of the problem, and 1 meaning it could be a legitimate construct. - This will miss some errors, and is not a substitute for a code review. - - To suppress false-positive errors of a certain category, add a - 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*) - suppresses errors of all categories on that line. - - The files passed in will be linted; at least one file must be provided. - Default linted extensions are %s. - Other file types will be ignored. - Change the extensions with the --extensions flag. - - Flags: - - output=emacs|eclipse|vs7|junit|sed|gsed - By default, the output is formatted to ease emacs parsing. Visual Studio - compatible output (vs7) may also be used. Further support exists for - eclipse (eclipse), and JUnit (junit). XML parsers such as those used - in Jenkins and Bamboo may also be used. - The sed format outputs sed commands that should fix some of the errors. - Note that this requires gnu sed. If that is installed as gsed on your - system (common e.g. on macOS with homebrew) you can use the gsed output - format. Sed commands are written to stdout, not stderr, so you should be - able to pipe output straight to a shell to run the fixes. - - verbose=# - Specify a number 0-5 to restrict errors to certain verbosity levels. - Errors with lower verbosity levels have lower confidence and are more - likely to be false positives. - - quiet - Don't print anything if no errors are found. - - filter=-x,+y,... - Specify a comma-separated list of category-filters to apply: only - error messages whose category names pass the filters will be printed. 
- (Category names are printed with the message and look like - "[whitespace/indent]".) Filters are evaluated left to right. - "-FOO" means "do not print categories that start with FOO". - "+FOO" means "do print categories that start with FOO". - - Examples: --filter=-whitespace,+whitespace/braces - --filter=-whitespace,-runtime/printf,+runtime/printf_format - --filter=-,+build/include_what_you_use - - To see a list of all the categories used in cpplint, pass no arg: - --filter= - - counting=total|toplevel|detailed - The total number of errors found is always printed. If - 'toplevel' is provided, then the count of errors in each of - the top-level categories like 'build' and 'whitespace' will - also be printed. If 'detailed' is provided, then a count - is provided for each category like 'build/class'. - - repository=path - The top level directory of the repository, used to derive the header - guard CPP variable. By default, this is determined by searching for a - path that contains .git, .hg, or .svn. When this flag is specified, the - given path is used instead. This option allows the header guard CPP - variable to remain consistent even if members of a team have different - repository root directories (such as when checking out a subdirectory - with SVN). In addition, users of non-mainstream version control systems - can use this flag to ensure readable header guard CPP variables. - - Examples: - Assuming that Alice checks out ProjectName and Bob checks out - ProjectName/trunk and trunk contains src/chrome/ui/browser.h, then - with no --repository flag, the header guard CPP variable will be: - - Alice => TRUNK_SRC_CHROME_BROWSER_UI_BROWSER_H_ - Bob => SRC_CHROME_BROWSER_UI_BROWSER_H_ - - If Alice uses the --repository=trunk flag and Bob omits the flag or - uses --repository=. 
then the header guard CPP variable will be: - - Alice => SRC_CHROME_BROWSER_UI_BROWSER_H_ - Bob => SRC_CHROME_BROWSER_UI_BROWSER_H_ - - root=subdir - The root directory used for deriving header guard CPP variable. - This directory is relative to the top level directory of the repository - which by default is determined by searching for a directory that contains - .git, .hg, or .svn but can also be controlled with the --repository flag. - If the specified directory does not exist, this flag is ignored. - - Examples: - Assuming that src is the top level directory of the repository (and - cwd=top/src), the header guard CPP variables for - src/chrome/browser/ui/browser.h are: - - No flag => CHROME_BROWSER_UI_BROWSER_H_ - --root=chrome => BROWSER_UI_BROWSER_H_ - --root=chrome/browser => UI_BROWSER_H_ - --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_ - - linelength=digits - This is the allowed line length for the project. The default value is - 80 characters. - - Examples: - --linelength=120 - - recursive - Search for files to lint recursively. Each directory given in the list - of files to be linted is replaced by all files that descend from that - directory. Files with extensions not in the valid extensions list are - excluded. - - exclude=path - Exclude the given path from the list of files to be linted. Relative - paths are evaluated relative to the current directory and shell globbing - is performed. This flag can be provided multiple times to exclude - multiple files. - - Examples: - --exclude=one.cc - --exclude=src/*.cc - --exclude=src/*.cc --exclude=test/*.cc - - extensions=extension,extension,... - The allowed file extensions that cpplint will check - - Examples: - --extensions=%s - - includeorder=default|standardcfirst - For the build/include_order rule, the default is to blindly assume angle - bracket includes with file extension are c-system-headers (default), - even knowing this will have false classifications. - The default is established at google. 
- standardcfirst means to instead use an allow-list of known c headers and - treat all others as separate group of "other system headers". The C headers - included are those of the C-standard lib and closely related ones. - - headers=x,y,... - The header extensions that cpplint will treat as .h in checks. Values are - automatically added to --extensions list. - (by default, only files with extensions %s will be assumed to be headers) - - Examples: - --headers=%s - --headers=hpp,hxx - --headers=hpp - - cpplint.py supports per-directory configurations specified in CPPLINT.cfg - files. CPPLINT.cfg file can contain a number of key=value pairs. - Currently the following options are supported: - - set noparent - filter=+filter1,-filter2,... - exclude_files=regex - linelength=80 - root=subdir - headers=x,y,... - - "set noparent" option prevents cpplint from traversing directory tree - upwards looking for more .cfg files in parent directories. This option - is usually placed in the top-level project directory. - - The "filter" option is similar in function to --filter flag. It specifies - message filters in addition to the |_DEFAULT_FILTERS| and those specified - through --filter command-line flag. - - "exclude_files" allows to specify a regular expression to be matched against - a file name. If the expression matches, the file is skipped and not run - through the linter. - - "linelength" allows to specify the allowed line length for the project. - - The "root" option is similar in function to the --root flag (see example - above). Paths are relative to the directory of the CPPLINT.cfg. - - The "headers" option is similar in function to the --headers flag - (see example above). - - CPPLINT.cfg has an effect on files in the same directory and all - sub-directories, unless overridden by a nested configuration file. 
- - Example file: - filter=-build/include_order,+build/include_alpha - exclude_files=.*\\.cc - - The above example disables build/include_order warning and enables - build/include_alpha as well as excludes all .cc from being - processed by linter, in the current directory (where the .cfg - file is located) and all sub-directories. -""" - -# We categorize each error message we print. Here are the categories. -# We want an explicit list so we can list them all in cpplint --filter=. -# If you add a new error message with a new category, add it to the list -# here! cpplint_unittest.py should tell you if you forget to do this. -_ERROR_CATEGORIES = [ - 'build/class', - 'build/c++11', - 'build/c++14', - 'build/c++tr1', - 'build/deprecated', - 'build/endif_comment', - 'build/explicit_make_pair', - 'build/forward_decl', - 'build/header_guard', - 'build/include', - 'build/include_subdir', - 'build/include_alpha', - 'build/include_order', - 'build/include_what_you_use', - 'build/namespaces_headers', - 'build/namespaces_literals', - 'build/namespaces', - 'build/printf_format', - 'build/storage_class', - 'legal/copyright', - 'readability/alt_tokens', - 'readability/braces', - 'readability/casting', - 'readability/check', - 'readability/constructors', - 'readability/fn_size', - 'readability/inheritance', - 'readability/multiline_comment', - 'readability/multiline_string', - 'readability/namespace', - 'readability/nolint', - 'readability/nul', - 'readability/strings', - 'readability/todo', - 'readability/utf8', - 'runtime/arrays', - 'runtime/casting', - 'runtime/explicit', - 'runtime/int', - 'runtime/init', - 'runtime/invalid_increment', - 'runtime/member_string_references', - 'runtime/memset', - 'runtime/indentation_namespace', - 'runtime/operator', - 'runtime/printf', - 'runtime/printf_format', - 'runtime/references', - 'runtime/string', - 'runtime/threadsafe_fn', - 'runtime/vlog', - 'whitespace/blank_line', - 'whitespace/braces', - 'whitespace/comma', - 'whitespace/comments', 
- 'whitespace/empty_conditional_body', - 'whitespace/empty_if_body', - 'whitespace/empty_loop_body', - 'whitespace/end_of_line', - 'whitespace/ending_newline', - 'whitespace/forcolon', - 'whitespace/indent', - 'whitespace/line_length', - 'whitespace/newline', - 'whitespace/operators', - 'whitespace/parens', - 'whitespace/semicolon', - 'whitespace/tab', - 'whitespace/todo', - ] - -# keywords to use with --outputs which generate stdout for machine processing -_MACHINE_OUTPUTS = [ - 'junit', - 'sed', - 'gsed' -] - -# These error categories are no longer enforced by cpplint, but for backwards- -# compatibility they may still appear in NOLINT comments. -_LEGACY_ERROR_CATEGORIES = [ - 'readability/streams', - 'readability/function', - ] - -# These prefixes for categories should be ignored since they relate to other -# tools which also use the NOLINT syntax, e.g. clang-tidy. -_OTHER_NOLINT_CATEGORY_PREFIXES = [ - 'clang-analyzer', - ] - -# The default state of the category filter. This is overridden by the --filter= -# flag. By default all errors are on, so only add here categories that should be -# off by default (i.e., categories that must be enabled by the --filter= flags). -# All entries here should start with a '-' or '+', as in the --filter= flag. -_DEFAULT_FILTERS = ['-build/include_alpha'] - -# The default list of categories suppressed for C (not C++) files. -_DEFAULT_C_SUPPRESSED_CATEGORIES = [ - 'readability/casting', - ] - -# The default list of categories suppressed for Linux Kernel files. -_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [ - 'whitespace/tab', - ] - -# We used to check for high-bit characters, but after much discussion we -# decided those were OK, as long as they were in UTF-8 and didn't represent -# hard-coded international strings, which belong in a separate i18n file. 
- -# C++ headers -_CPP_HEADERS = frozenset([ - # Legacy - 'algobase.h', - 'algo.h', - 'alloc.h', - 'builtinbuf.h', - 'bvector.h', - 'complex.h', - 'defalloc.h', - 'deque.h', - 'editbuf.h', - 'fstream.h', - 'function.h', - 'hash_map', - 'hash_map.h', - 'hash_set', - 'hash_set.h', - 'hashtable.h', - 'heap.h', - 'indstream.h', - 'iomanip.h', - 'iostream.h', - 'istream.h', - 'iterator.h', - 'list.h', - 'map.h', - 'multimap.h', - 'multiset.h', - 'ostream.h', - 'pair.h', - 'parsestream.h', - 'pfstream.h', - 'procbuf.h', - 'pthread_alloc', - 'pthread_alloc.h', - 'rope', - 'rope.h', - 'ropeimpl.h', - 'set.h', - 'slist', - 'slist.h', - 'stack.h', - 'stdiostream.h', - 'stl_alloc.h', - 'stl_relops.h', - 'streambuf.h', - 'stream.h', - 'strfile.h', - 'strstream.h', - 'tempbuf.h', - 'tree.h', - 'type_traits.h', - 'vector.h', - # 17.6.1.2 C++ library headers - 'algorithm', - 'array', - 'atomic', - 'bitset', - 'chrono', - 'codecvt', - 'complex', - 'condition_variable', - 'deque', - 'exception', - 'forward_list', - 'fstream', - 'functional', - 'future', - 'initializer_list', - 'iomanip', - 'ios', - 'iosfwd', - 'iostream', - 'istream', - 'iterator', - 'limits', - 'list', - 'locale', - 'map', - 'memory', - 'mutex', - 'new', - 'numeric', - 'ostream', - 'queue', - 'random', - 'ratio', - 'regex', - 'scoped_allocator', - 'set', - 'sstream', - 'stack', - 'stdexcept', - 'streambuf', - 'string', - 'strstream', - 'system_error', - 'thread', - 'tuple', - 'typeindex', - 'typeinfo', - 'type_traits', - 'unordered_map', - 'unordered_set', - 'utility', - 'valarray', - 'vector', - # 17.6.1.2 C++14 headers - 'shared_mutex', - # 17.6.1.2 C++17 headers - 'any', - 'charconv', - 'codecvt', - 'execution', - 'filesystem', - 'memory_resource', - 'optional', - 'string_view', - 'variant', - # 17.6.1.2 C++ headers for C library facilities - 'cassert', - 'ccomplex', - 'cctype', - 'cerrno', - 'cfenv', - 'cfloat', - 'cinttypes', - 'ciso646', - 'climits', - 'clocale', - 'cmath', - 'csetjmp', - 'csignal', - 
'cstdalign', - 'cstdarg', - 'cstdbool', - 'cstddef', - 'cstdint', - 'cstdio', - 'cstdlib', - 'cstring', - 'ctgmath', - 'ctime', - 'cuchar', - 'cwchar', - 'cwctype', - ]) - -# C headers -_C_HEADERS = frozenset([ - # System C headers - 'assert.h', - 'complex.h', - 'ctype.h', - 'errno.h', - 'fenv.h', - 'float.h', - 'inttypes.h', - 'iso646.h', - 'limits.h', - 'locale.h', - 'math.h', - 'setjmp.h', - 'signal.h', - 'stdalign.h', - 'stdarg.h', - 'stdatomic.h', - 'stdbool.h', - 'stddef.h', - 'stdint.h', - 'stdio.h', - 'stdlib.h', - 'stdnoreturn.h', - 'string.h', - 'tgmath.h', - 'threads.h', - 'time.h', - 'uchar.h', - 'wchar.h', - 'wctype.h', - # additional POSIX C headers - 'aio.h', - 'arpa/inet.h', - 'cpio.h', - 'dirent.h', - 'dlfcn.h', - 'fcntl.h', - 'fmtmsg.h', - 'fnmatch.h', - 'ftw.h', - 'glob.h', - 'grp.h', - 'iconv.h', - 'langinfo.h', - 'libgen.h', - 'monetary.h', - 'mqueue.h', - 'ndbm.h', - 'net/if.h', - 'netdb.h', - 'netinet/in.h', - 'netinet/tcp.h', - 'nl_types.h', - 'poll.h', - 'pthread.h', - 'pwd.h', - 'regex.h', - 'sched.h', - 'search.h', - 'semaphore.h', - 'setjmp.h', - 'signal.h', - 'spawn.h', - 'strings.h', - 'stropts.h', - 'syslog.h', - 'tar.h', - 'termios.h', - 'trace.h', - 'ulimit.h', - 'unistd.h', - 'utime.h', - 'utmpx.h', - 'wordexp.h', - # additional GNUlib headers - 'a.out.h', - 'aliases.h', - 'alloca.h', - 'ar.h', - 'argp.h', - 'argz.h', - 'byteswap.h', - 'crypt.h', - 'endian.h', - 'envz.h', - 'err.h', - 'error.h', - 'execinfo.h', - 'fpu_control.h', - 'fstab.h', - 'fts.h', - 'getopt.h', - 'gshadow.h', - 'ieee754.h', - 'ifaddrs.h', - 'libintl.h', - 'mcheck.h', - 'mntent.h', - 'obstack.h', - 'paths.h', - 'printf.h', - 'pty.h', - 'resolv.h', - 'shadow.h', - 'sysexits.h', - 'ttyent.h', - # Additional linux glibc headers - 'dlfcn.h', - 'elf.h', - 'features.h', - 'gconv.h', - 'gnu-versions.h', - 'lastlog.h', - 'libio.h', - 'link.h', - 'malloc.h', - 'memory.h', - 'netash/ash.h', - 'netatalk/at.h', - 'netax25/ax25.h', - 'neteconet/ec.h', - 'netipx/ipx.h', - 
'netiucv/iucv.h', - 'netpacket/packet.h', - 'netrom/netrom.h', - 'netrose/rose.h', - 'nfs/nfs.h', - 'nl_types.h', - 'nss.h', - 're_comp.h', - 'regexp.h', - 'sched.h', - 'sgtty.h', - 'stab.h', - 'stdc-predef.h', - 'stdio_ext.h', - 'syscall.h', - 'termio.h', - 'thread_db.h', - 'ucontext.h', - 'ustat.h', - 'utmp.h', - 'values.h', - 'wait.h', - 'xlocale.h', - # Hardware specific headers - 'arm_neon.h', - 'emmintrin.h', - 'immintrin.h', - 'intrin.h', - 'nmmintrin.h', - 'x86intrin.h', - 'xmmintrin.h', - ]) - -# Folders of C libraries so commonly used in C++, -# that they have parity with standard C libraries. -C_STANDARD_HEADER_FOLDERS = frozenset([ - # standard C library - "sys", - # glibc for linux - "arpa", - "asm-generic", - "bits", - "gnu", - "net", - "netinet", - "protocols", - "rpc", - "rpcsvc", - "scsi", - # linux kernel header - "drm", - "linux", - "misc", - "mtd", - "rdma", - "sound", - "video", - "xen", - ]) - -# Type names -_TYPES = re.compile( - r'^(?:' - # [dcl.type.simple] - r'(char(16_t|32_t)?)|wchar_t|' - r'bool|short|int|long|signed|unsigned|float|double|' - # [support.types] - r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|' - # [cstdint.syn] - r'(u?int(_fast|_least)?(8|16|32|64)_t)|' - r'(u?int(max|ptr)_t)|' - r')$') - - -# These headers are excluded from [build/include] and [build/include_order] -# checks: -# - Anything not following google file name conventions (containing an -# uppercase character, such as Python.h or nsStringAPI.h, for example). -# - Lua headers. -_THIRD_PARTY_HEADERS_PATTERN = re.compile( - r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$') - -# Pattern for matching FileInfo.BaseName() against test file name -_test_suffixes = ['_test', '_regtest', '_unittest'] -_TEST_FILE_SUFFIX = '(' + '|'.join(_test_suffixes) + r')$' - -# Pattern that matches only complete whitespace, possibly across multiple lines. -_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL) - -# Assertion macros. 
These are defined in base/logging.h and -# testing/base/public/gunit.h. -_CHECK_MACROS = [ - 'DCHECK', 'CHECK', - 'EXPECT_TRUE', 'ASSERT_TRUE', - 'EXPECT_FALSE', 'ASSERT_FALSE', - ] - -# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE -_CHECK_REPLACEMENT = dict([(macro_var, {}) for macro_var in _CHECK_MACROS]) - -for op, replacement in [('==', 'EQ'), ('!=', 'NE'), - ('>=', 'GE'), ('>', 'GT'), - ('<=', 'LE'), ('<', 'LT')]: - _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement - _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement - -for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), - ('>=', 'LT'), ('>', 'LE'), - ('<=', 'GT'), ('<', 'GE')]: - _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement - -# Alternative tokens and their replacements. For full list, see section 2.5 -# Alternative tokens [lex.digraph] in the C++ standard. -# -# Digraphs (such as '%:') are not included here since it's a mess to -# match those on a word boundary. -_ALT_TOKEN_REPLACEMENT = { - 'and': '&&', - 'bitor': '|', - 'or': '||', - 'xor': '^', - 'compl': '~', - 'bitand': '&', - 'and_eq': '&=', - 'or_eq': '|=', - 'xor_eq': '^=', - 'not': '!', - 'not_eq': '!=' - } - -# Compile regular expression that matches all the above keywords. The "[ =()]" -# bit is meant to avoid matching these keywords outside of boolean expressions. -# -# False positives include C-style multi-line comments and multi-line strings -# but those have always been troublesome for cpplint. -_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( - r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') - - -# These constants define types of headers for use with -# _IncludeState.CheckNextIncludeOrder(). 
-_C_SYS_HEADER = 1 -_CPP_SYS_HEADER = 2 -_OTHER_SYS_HEADER = 3 -_LIKELY_MY_HEADER = 4 -_POSSIBLE_MY_HEADER = 5 -_OTHER_HEADER = 6 - -# These constants define the current inline assembly state -_NO_ASM = 0 # Outside of inline assembly block -_INSIDE_ASM = 1 # Inside inline assembly block -_END_ASM = 2 # Last line of inline assembly block -_BLOCK_ASM = 3 # The whole block is an inline assembly block - -# Match start of assembly blocks -_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' - r'(?:\s+(volatile|__volatile__))?' - r'\s*[{(]') - -# Match strings that indicate we're working on a C (not C++) file. -_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|' - r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))') - -# Match string that indicates we're working on a Linux Kernel file. -_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)') - -# Commands for sed to fix the problem -_SED_FIXUPS = { - 'Remove spaces around =': r's/ = /=/', - 'Remove spaces around !=': r's/ != /!=/', - 'Remove space before ( in if (': r's/if (/if(/', - 'Remove space before ( in for (': r's/for (/for(/', - 'Remove space before ( in while (': r's/while (/while(/', - 'Remove space before ( in switch (': r's/switch (/switch(/', - 'Should have a space between // and comment': r's/\/\//\/\/ /', - 'Missing space before {': r's/\([^ ]\){/\1 {/', - 'Tab found, replace by spaces': r's/\t/ /g', - 'Line ends in whitespace. Consider deleting these extra spaces.': r's/\s*$//', - 'You don\'t need a ; after a }': r's/};/}/', - 'Missing space after ,': r's/,\([^ ]\)/, \1/g', -} - -_regexp_compile_cache = {} - -# {str, set(int)}: a map from error categories to sets of linenumbers -# on which those errors are expected and should be suppressed. -_error_suppressions = {} - -# The root directory used for deriving header guard CPP variable. -# This is set by --root flag. -_root = None -_root_debug = False - -# The top level repository directory. 
If set, _root is calculated relative to -# this directory instead of the directory containing version control artifacts. -# This is set by the --repository flag. -_repository = None - -# Files to exclude from linting. This is set by the --exclude flag. -_excludes = None - -# Whether to suppress all PrintInfo messages, UNRELATED to --quiet flag -_quiet = False - -# The allowed line length of files. -# This is set by --linelength flag. -_line_length = 80 - -# This allows to use different include order rule than default -_include_order = "default" - -try: - # -- pylint: disable=used-before-assignment - unicode -except NameError: - # -- pylint: disable=redefined-builtin - basestring = unicode = str - -try: - # -- pylint: disable=used-before-assignment - long -except NameError: - # -- pylint: disable=redefined-builtin - long = int - -if sys.version_info < (3,): - # -- pylint: disable=no-member - # BINARY_TYPE = str - itervalues = dict.itervalues - iteritems = dict.iteritems -else: - # BINARY_TYPE = bytes - itervalues = dict.values - iteritems = dict.items - -def unicode_escape_decode(x): - if sys.version_info < (3,): - return codecs.unicode_escape_decode(x)[0] - else: - return x - -# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc. -# This is set by --headers flag. -_hpp_headers = set([]) - -# {str, bool}: a map from error categories to booleans which indicate if the -# category should be suppressed for every line. -_global_error_suppressions = {} - -def ProcessHppHeadersOption(val): - global _hpp_headers - try: - _hpp_headers = {ext.strip() for ext in val.split(',')} - except ValueError: - PrintUsage('Header extensions must be comma separated list.') - -def ProcessIncludeOrderOption(val): - if val is None or val == "default": - pass - elif val == "standardcfirst": - global _include_order - _include_order = val - else: - PrintUsage('Invalid includeorder value %s. 
Expected default|standardcfirst') - -def IsHeaderExtension(file_extension): - return file_extension in GetHeaderExtensions() - -def GetHeaderExtensions(): - if _hpp_headers: - return _hpp_headers - if _valid_extensions: - return {h for h in _valid_extensions if 'h' in h} - return set(['h', 'hh', 'hpp', 'hxx', 'h++', 'cuh']) - -# The allowed extensions for file names -# This is set by --extensions flag -def GetAllExtensions(): - return GetHeaderExtensions().union(_valid_extensions or set( - ['c', 'cc', 'cpp', 'cxx', 'c++', 'cu'])) - -def ProcessExtensionsOption(val): - global _valid_extensions - try: - extensions = [ext.strip() for ext in val.split(',')] - _valid_extensions = set(extensions) - except ValueError: - PrintUsage('Extensions should be a comma-separated list of values;' - 'for example: extensions=hpp,cpp\n' - 'This could not be parsed: "%s"' % (val,)) - -def GetNonHeaderExtensions(): - return GetAllExtensions().difference(GetHeaderExtensions()) - -def ParseNolintSuppressions(filename, raw_line, linenum, error): - """Updates the global list of line error-suppressions. - - Parses any NOLINT comments on the current line, updating the global - error_suppressions store. Reports an error if the NOLINT comment - was malformed. - - Args: - filename: str, the name of the input file. - raw_line: str, the line of input text, with comments. - linenum: int, the number of the current line. - error: function, an error handler. 
- """ - matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line) - if matched: - if matched.group(1): - suppressed_line = linenum + 1 - else: - suppressed_line = linenum - category = matched.group(2) - if category in (None, '(*)'): # => "suppress all" - _error_suppressions.setdefault(None, set()).add(suppressed_line) - else: - if category.startswith('(') and category.endswith(')'): - category = category[1:-1] - if category in _ERROR_CATEGORIES: - _error_suppressions.setdefault(category, set()).add(suppressed_line) - elif any(c for c in _OTHER_NOLINT_CATEGORY_PREFIXES if category.startswith(c)): - # Ignore any categories from other tools. - pass - elif category not in _LEGACY_ERROR_CATEGORIES: - error(filename, linenum, 'readability/nolint', 5, - 'Unknown NOLINT error category: %s' % category) - - -def ProcessGlobalSuppressions(lines): - """Updates the list of global error suppressions. - - Parses any lint directives in the file that have global effect. - - Args: - lines: An array of strings, each representing a line of the file, with the - last element being empty if the file is terminated with a newline. - """ - for line in lines: - if _SEARCH_C_FILE.search(line): - for category in _DEFAULT_C_SUPPRESSED_CATEGORIES: - _global_error_suppressions[category] = True - if _SEARCH_KERNEL_FILE.search(line): - for category in _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES: - _global_error_suppressions[category] = True - - -def ResetNolintSuppressions(): - """Resets the set of NOLINT suppressions to empty.""" - _error_suppressions.clear() - _global_error_suppressions.clear() - - -def IsErrorSuppressedByNolint(category, linenum): - """Returns true if the specified error category is suppressed on this line. - - Consults the global error_suppressions map populated by - ParseNolintSuppressions/ProcessGlobalSuppressions/ResetNolintSuppressions. - - Args: - category: str, the category of the error. - linenum: int, the current line number. 
- Returns: - bool, True iff the error should be suppressed due to a NOLINT comment or - global suppression. - """ - return (_global_error_suppressions.get(category, False) or - linenum in _error_suppressions.get(category, set()) or - linenum in _error_suppressions.get(None, set())) - - -def Match(pattern, s): - """Matches the string with the pattern, caching the compiled regexp.""" - # The regexp compilation caching is inlined in both Match and Search for - # performance reasons; factoring it out into a separate function turns out - # to be noticeably expensive. - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].match(s) - - -def ReplaceAll(pattern, rep, s): - """Replaces instances of pattern in a string with a replacement. - - The compiled regex is kept in a cache shared by Match and Search. - - Args: - pattern: regex pattern - rep: replacement text - s: search string - - Returns: - string with replacements made (or original string if no replacements) - """ - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].sub(rep, s) - - -def Search(pattern, s): - """Searches the string for the pattern, caching the compiled regexp.""" - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].search(s) - - -def _IsSourceExtension(s): - """File extension (excluding dot) matches a source file extension.""" - return s in GetNonHeaderExtensions() - - -class _IncludeState(object): - """Tracks line numbers for includes, and the order in which includes appear. - - include_list contains list of lists of (header, line number) pairs. - It's a lists of lists rather than just one flat list to make it - easier to update across preprocessor boundaries. 
- - Call CheckNextIncludeOrder() once for each header in the file, passing - in the type constants defined above. Calls in an illegal order will - raise an _IncludeError with an appropriate error message. - - """ - # self._section will move monotonically through this set. If it ever - # needs to move backwards, CheckNextIncludeOrder will raise an error. - _INITIAL_SECTION = 0 - _MY_H_SECTION = 1 - _C_SECTION = 2 - _CPP_SECTION = 3 - _OTHER_SYS_SECTION = 4 - _OTHER_H_SECTION = 5 - - _TYPE_NAMES = { - _C_SYS_HEADER: 'C system header', - _CPP_SYS_HEADER: 'C++ system header', - _OTHER_SYS_HEADER: 'other system header', - _LIKELY_MY_HEADER: 'header this file implements', - _POSSIBLE_MY_HEADER: 'header this file may implement', - _OTHER_HEADER: 'other header', - } - _SECTION_NAMES = { - _INITIAL_SECTION: "... nothing. (This can't be an error.)", - _MY_H_SECTION: 'a header this file implements', - _C_SECTION: 'C system header', - _CPP_SECTION: 'C++ system header', - _OTHER_SYS_SECTION: 'other system header', - _OTHER_H_SECTION: 'other header', - } - - def __init__(self): - self.include_list = [[]] - self._section = None - self._last_header = None - self.ResetSection('') - - def FindHeader(self, header): - """Check if a header has already been included. - - Args: - header: header to check. - Returns: - Line number of previous occurrence, or -1 if the header has not - been seen before. - """ - for section_list in self.include_list: - for f in section_list: - if f[0] == header: - return f[1] - return -1 - - def ResetSection(self, directive): - """Reset section checking for preprocessor directive. - - Args: - directive: preprocessor directive (e.g. "if", "else"). - """ - # The name of the current section. - self._section = self._INITIAL_SECTION - # The path of last found header. - self._last_header = '' - - # Update list of includes. Note that we never pop from the - # include list. 
- if directive in ('if', 'ifdef', 'ifndef'): - self.include_list.append([]) - elif directive in ('else', 'elif'): - self.include_list[-1] = [] - - def SetLastHeader(self, header_path): - self._last_header = header_path - - def CanonicalizeAlphabeticalOrder(self, header_path): - """Returns a path canonicalized for alphabetical comparison. - - - replaces "-" with "_" so they both cmp the same. - - removes '-inl' since we don't require them to be after the main header. - - lowercase everything, just in case. - - Args: - header_path: Path to be canonicalized. - - Returns: - Canonicalized path. - """ - return header_path.replace('-inl.h', '.h').replace('-', '_').lower() - - def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): - """Check if a header is in alphabetical order with the previous header. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - header_path: Canonicalized header to be checked. - - Returns: - Returns true if the header is in alphabetical order. - """ - # If previous section is different from current section, _last_header will - # be reset to empty string, so it's always less than current header. - # - # If previous line was a blank line, assume that the headers are - # intentionally sorted the way they are. - if (self._last_header > header_path and - Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): - return False - return True - - def CheckNextIncludeOrder(self, header_type): - """Returns a non-empty error message if the next header is out of order. - - This function also updates the internal state to be ready to check - the next include. - - Args: - header_type: One of the _XXX_HEADER constants defined above. - - Returns: - The empty string if the header is in the right order, or an - error message describing what's wrong. 
- - """ - error_message = ('Found %s after %s' % - (self._TYPE_NAMES[header_type], - self._SECTION_NAMES[self._section])) - - last_section = self._section - - if header_type == _C_SYS_HEADER: - if self._section <= self._C_SECTION: - self._section = self._C_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _CPP_SYS_HEADER: - if self._section <= self._CPP_SECTION: - self._section = self._CPP_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _OTHER_SYS_HEADER: - if self._section <= self._OTHER_SYS_SECTION: - self._section = self._OTHER_SYS_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _LIKELY_MY_HEADER: - if self._section <= self._MY_H_SECTION: - self._section = self._MY_H_SECTION - else: - self._section = self._OTHER_H_SECTION - elif header_type == _POSSIBLE_MY_HEADER: - if self._section <= self._MY_H_SECTION: - self._section = self._MY_H_SECTION - else: - # This will always be the fallback because we're not sure - # enough that the header is associated with this file. - self._section = self._OTHER_H_SECTION - else: - assert header_type == _OTHER_HEADER - self._section = self._OTHER_H_SECTION - - if last_section != self._section: - self._last_header = '' - - return '' - - -class _CppLintState(object): - """Maintains module-wide state..""" - - def __init__(self): - self.verbose_level = 1 # global setting. - self.error_count = 0 # global count of reported errors - # filters to apply when emitting error messages - self.filters = _DEFAULT_FILTERS[:] - # backup of filter list. Used to restore the state after each file. - self._filters_backup = self.filters[:] - self.counting = 'total' # In what way are we counting errors? - self.errors_by_category = {} # string to int dict storing error counts - self.quiet = False # Suppress non-error messages? 
- - # output format: - # "emacs" - format that emacs can parse (default) - # "eclipse" - format that eclipse can parse - # "vs7" - format that Microsoft Visual Studio 7 can parse - # "junit" - format that Jenkins, Bamboo, etc can parse - # "sed" - returns a gnu sed command to fix the problem - # "gsed" - like sed, but names the command gsed, e.g. for macOS homebrew users - self.output_format = 'emacs' - - # For JUnit output, save errors and failures until the end so that they - # can be written into the XML - self._junit_errors = [] - self._junit_failures = [] - - def SetOutputFormat(self, output_format): - """Sets the output format for errors.""" - self.output_format = output_format - - def SetQuiet(self, quiet): - """Sets the module's quiet settings, and returns the previous setting.""" - last_quiet = self.quiet - self.quiet = quiet - return last_quiet - - def SetVerboseLevel(self, level): - """Sets the module's verbosity, and returns the previous setting.""" - last_verbose_level = self.verbose_level - self.verbose_level = level - return last_verbose_level - - def SetCountingStyle(self, counting_style): - """Sets the module's counting options.""" - self.counting = counting_style - - def SetFilters(self, filters): - """Sets the error-message filters. - - These filters are applied when deciding whether to emit a given - error message. - - Args: - filters: A string of comma-separated filters (eg "+whitespace/indent"). - Each filter should start with + or -; else we die. - - Raises: - ValueError: The comma-separated filters did not all start with '+' or '-'. - E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" - """ - # Default filters always have less priority than the flag ones. - self.filters = _DEFAULT_FILTERS[:] - self.AddFilters(filters) - - def AddFilters(self, filters): - """ Adds more filters to the existing list of error-message filters. 
""" - for filt in filters.split(','): - clean_filt = filt.strip() - if clean_filt: - self.filters.append(clean_filt) - for filt in self.filters: - if not (filt.startswith('+') or filt.startswith('-')): - raise ValueError('Every filter in --filters must start with + or -' - ' (%s does not)' % filt) - - def BackupFilters(self): - """ Saves the current filter list to backup storage.""" - self._filters_backup = self.filters[:] - - def RestoreFilters(self): - """ Restores filters previously backed up.""" - self.filters = self._filters_backup[:] - - def ResetErrorCounts(self): - """Sets the module's error statistic back to zero.""" - self.error_count = 0 - self.errors_by_category = {} - - def IncrementErrorCount(self, category): - """Bumps the module's error statistic.""" - self.error_count += 1 - if self.counting in ('toplevel', 'detailed'): - if self.counting != 'detailed': - category = category.split('/')[0] - if category not in self.errors_by_category: - self.errors_by_category[category] = 0 - self.errors_by_category[category] += 1 - - def PrintErrorCounts(self): - """Print a summary of errors by category, and the total.""" - for category, count in sorted(iteritems(self.errors_by_category)): - self.PrintInfo('Category \'%s\' errors found: %d\n' % - (category, count)) - if self.error_count > 0: - self.PrintInfo('Total errors found: %d\n' % self.error_count) - - def PrintInfo(self, message): - # _quiet does not represent --quiet flag. 
- # Hide infos from stdout to keep stdout pure for machine consumption - if not _quiet and self.output_format not in _MACHINE_OUTPUTS: - sys.stdout.write(message) - - def PrintError(self, message): - if self.output_format == 'junit': - self._junit_errors.append(message) - else: - sys.stderr.write(message) - - def AddJUnitFailure(self, filename, linenum, message, category, confidence): - self._junit_failures.append((filename, linenum, message, category, - confidence)) - - def FormatJUnitXML(self): - num_errors = len(self._junit_errors) - num_failures = len(self._junit_failures) - - testsuite = xml.etree.ElementTree.Element('testsuite') - testsuite.attrib['errors'] = str(num_errors) - testsuite.attrib['failures'] = str(num_failures) - testsuite.attrib['name'] = 'cpplint' - - if num_errors == 0 and num_failures == 0: - testsuite.attrib['tests'] = str(1) - xml.etree.ElementTree.SubElement(testsuite, 'testcase', name='passed') - - else: - testsuite.attrib['tests'] = str(num_errors + num_failures) - if num_errors > 0: - testcase = xml.etree.ElementTree.SubElement(testsuite, 'testcase') - testcase.attrib['name'] = 'errors' - error = xml.etree.ElementTree.SubElement(testcase, 'error') - error.text = '\n'.join(self._junit_errors) - if num_failures > 0: - # Group failures by file - failed_file_order = [] - failures_by_file = {} - for failure in self._junit_failures: - failed_file = failure[0] - if failed_file not in failed_file_order: - failed_file_order.append(failed_file) - failures_by_file[failed_file] = [] - failures_by_file[failed_file].append(failure) - # Create a testcase for each file - for failed_file in failed_file_order: - failures = failures_by_file[failed_file] - testcase = xml.etree.ElementTree.SubElement(testsuite, 'testcase') - testcase.attrib['name'] = failed_file - failure = xml.etree.ElementTree.SubElement(testcase, 'failure') - template = '{0}: {1} [{2}] [{3}]' - texts = [template.format(f[1], f[2], f[3], f[4]) for f in failures] - failure.text = 
'\n'.join(texts) - - xml_decl = '\n' - return xml_decl + xml.etree.ElementTree.tostring(testsuite, 'utf-8').decode('utf-8') - - -_cpplint_state = _CppLintState() - - -def _OutputFormat(): - """Gets the module's output format.""" - return _cpplint_state.output_format - - -def _SetOutputFormat(output_format): - """Sets the module's output format.""" - _cpplint_state.SetOutputFormat(output_format) - -def _Quiet(): - """Return's the module's quiet setting.""" - return _cpplint_state.quiet - -def _SetQuiet(quiet): - """Set the module's quiet status, and return previous setting.""" - return _cpplint_state.SetQuiet(quiet) - - -def _VerboseLevel(): - """Returns the module's verbosity setting.""" - return _cpplint_state.verbose_level - - -def _SetVerboseLevel(level): - """Sets the module's verbosity, and returns the previous setting.""" - return _cpplint_state.SetVerboseLevel(level) - - -def _SetCountingStyle(level): - """Sets the module's counting options.""" - _cpplint_state.SetCountingStyle(level) - - -def _Filters(): - """Returns the module's list of output filters, as a list.""" - return _cpplint_state.filters - - -def _SetFilters(filters): - """Sets the module's error-message filters. - - These filters are applied when deciding whether to emit a given - error message. - - Args: - filters: A string of comma-separated filters (eg "whitespace/indent"). - Each filter should start with + or -; else we die. - """ - _cpplint_state.SetFilters(filters) - -def _AddFilters(filters): - """Adds more filter overrides. - - Unlike _SetFilters, this function does not reset the current list of filters - available. - - Args: - filters: A string of comma-separated filters (eg "whitespace/indent"). - Each filter should start with + or -; else we die. 
- """ - _cpplint_state.AddFilters(filters) - -def _BackupFilters(): - """ Saves the current filter list to backup storage.""" - _cpplint_state.BackupFilters() - -def _RestoreFilters(): - """ Restores filters previously backed up.""" - _cpplint_state.RestoreFilters() - -class _FunctionState(object): - """Tracks current function name and the number of lines in its body.""" - - _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. - _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. - - def __init__(self): - self.in_a_function = False - self.lines_in_function = 0 - self.current_function = '' - - def Begin(self, function_name): - """Start analyzing function body. - - Args: - function_name: The name of the function being tracked. - """ - self.in_a_function = True - self.lines_in_function = 0 - self.current_function = function_name - - def Count(self): - """Count line in current function body.""" - if self.in_a_function: - self.lines_in_function += 1 - - def Check(self, error, filename, linenum): - """Report if too many lines in function body. - - Args: - error: The function to call with any errors found. - filename: The name of the current file. - linenum: The number of the line to check. - """ - if not self.in_a_function: - return - - if Match(r'T(EST|est)', self.current_function): - base_trigger = self._TEST_TRIGGER - else: - base_trigger = self._NORMAL_TRIGGER - trigger = base_trigger * 2**_VerboseLevel() - - if self.lines_in_function > trigger: - error_level = int(math.log(self.lines_in_function / base_trigger, 2)) - # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... - if error_level > 5: - error_level = 5 - error(filename, linenum, 'readability/fn_size', error_level, - 'Small and focused functions are preferred:' - ' %s has %d non-comment lines' - ' (error triggered by exceeding %d lines).' 
% ( - self.current_function, self.lines_in_function, trigger)) - - def End(self): - """Stop analyzing function body.""" - self.in_a_function = False - - -class _IncludeError(Exception): - """Indicates a problem with the include order in a file.""" - pass - - -class FileInfo(object): - """Provides utility functions for filenames. - - FileInfo provides easy access to the components of a file's path - relative to the project root. - """ - - def __init__(self, filename): - self._filename = filename - - def FullName(self): - """Make Windows paths like Unix.""" - return os.path.abspath(self._filename).replace('\\', '/') - - def RepositoryName(self): - r"""FullName after removing the local path to the repository. - - If we have a real absolute path name here we can try to do something smart: - detecting the root of the checkout and truncating /path/to/checkout from - the name so that we get header guards that don't include things like - "C:\\Documents and Settings\\..." or "/home/username/..." in them and thus - people on different computers who have checked the source out to different - locations won't see bogus errors. 
- """ - fullname = self.FullName() - - if os.path.exists(fullname): - project_dir = os.path.dirname(fullname) - - # If the user specified a repository path, it exists, and the file is - # contained in it, use the specified repository path - if _repository: - repo = FileInfo(_repository).FullName() - root_dir = project_dir - while os.path.exists(root_dir): - # allow case-insensitive compare on Windows - if os.path.normcase(root_dir) == os.path.normcase(repo): - return os.path.relpath(fullname, root_dir).replace('\\', '/') - one_up_dir = os.path.dirname(root_dir) - if one_up_dir == root_dir: - break - root_dir = one_up_dir - - if os.path.exists(os.path.join(project_dir, ".svn")): - # If there's a .svn file in the current directory, we recursively look - # up the directory tree for the top of the SVN checkout - root_dir = project_dir - one_up_dir = os.path.dirname(root_dir) - while os.path.exists(os.path.join(one_up_dir, ".svn")): - root_dir = os.path.dirname(root_dir) - one_up_dir = os.path.dirname(one_up_dir) - - prefix = os.path.commonprefix([root_dir, project_dir]) - return fullname[len(prefix) + 1:] - - # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by - # searching up from the current path. - root_dir = current_dir = os.path.dirname(fullname) - while current_dir != os.path.dirname(current_dir): - if (os.path.exists(os.path.join(current_dir, ".git")) or - os.path.exists(os.path.join(current_dir, ".hg")) or - os.path.exists(os.path.join(current_dir, ".svn"))): - root_dir = current_dir - current_dir = os.path.dirname(current_dir) - - if (os.path.exists(os.path.join(root_dir, ".git")) or - os.path.exists(os.path.join(root_dir, ".hg")) or - os.path.exists(os.path.join(root_dir, ".svn"))): - prefix = os.path.commonprefix([root_dir, project_dir]) - return fullname[len(prefix) + 1:] - - # Don't know what to do; header guard warnings may be wrong... 
- return fullname - - def Split(self): - """Splits the file into the directory, basename, and extension. - - For 'chrome/browser/browser.cc', Split() would - return ('chrome/browser', 'browser', '.cc') - - Returns: - A tuple of (directory, basename, extension). - """ - - googlename = self.RepositoryName() - project, rest = os.path.split(googlename) - return (project,) + os.path.splitext(rest) - - def BaseName(self): - """File base name - text after the final slash, before the final period.""" - return self.Split()[1] - - def Extension(self): - """File extension - text following the final period, includes that period.""" - return self.Split()[2] - - def NoExtension(self): - """File has no source file extension.""" - return '/'.join(self.Split()[0:2]) - - def IsSource(self): - """File has a source file extension.""" - return _IsSourceExtension(self.Extension()[1:]) - - -def _ShouldPrintError(category, confidence, linenum): - """If confidence >= verbose, category passes filter and is not suppressed.""" - - # There are three ways we might decide not to print an error message: - # a "NOLINT(category)" comment appears in the source, - # the verbosity level isn't high enough, or the filters filter it out. - if IsErrorSuppressedByNolint(category, linenum): - return False - - if confidence < _cpplint_state.verbose_level: - return False - - is_filtered = False - for one_filter in _Filters(): - if one_filter.startswith('-'): - if category.startswith(one_filter[1:]): - is_filtered = True - elif one_filter.startswith('+'): - if category.startswith(one_filter[1:]): - is_filtered = False - else: - assert False # should have been checked for in SetFilter. - if is_filtered: - return False - - return True - - -def Error(filename, linenum, category, confidence, message): - """Logs the fact we've found a lint error. 
- - We log where the error was found, and also our confidence in the error, - that is, how certain we are this is a legitimate style regression, and - not a misidentification or a use that's sometimes justified. - - False positives can be suppressed by the use of - "cpplint(category)" comments on the offending line. These are - parsed into _error_suppressions. - - Args: - filename: The name of the file containing the error. - linenum: The number of the line containing the error. - category: A string used to describe the "category" this bug - falls under: "whitespace", say, or "runtime". Categories - may have a hierarchy separated by slashes: "whitespace/indent". - confidence: A number from 1-5 representing a confidence score for - the error, with 5 meaning that we are certain of the problem, - and 1 meaning that it could be a legitimate construct. - message: The error message. - """ - if _ShouldPrintError(category, confidence, linenum): - _cpplint_state.IncrementErrorCount(category) - if _cpplint_state.output_format == 'vs7': - _cpplint_state.PrintError('%s(%s): error cpplint: [%s] %s [%d]\n' % ( - filename, linenum, category, message, confidence)) - elif _cpplint_state.output_format == 'eclipse': - sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - elif _cpplint_state.output_format == 'junit': - _cpplint_state.AddJUnitFailure(filename, linenum, message, category, - confidence) - elif _cpplint_state.output_format in ['sed', 'gsed']: - if message in _SED_FIXUPS: - sys.stdout.write(_cpplint_state.output_format + " -i '%s%s' %s # %s [%s] [%d]\n" % ( - linenum, _SED_FIXUPS[message], filename, message, category, confidence)) - else: - sys.stderr.write('# %s:%s: "%s" [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - else: - final_message = '%s:%s: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence) - sys.stderr.write(final_message) - -# Matches standard C++ escape 
sequences per 2.13.2.3 of the C++ standard. -_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( - r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') -# Match a single C style comment on the same line. -_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/' -# Matches multi-line C style comments. -# This RE is a little bit more complicated than one might expect, because we -# have to take care of space removals tools so we can handle comments inside -# statements better. -# The current rule is: We only clear spaces from both sides when we're at the -# end of the line. Otherwise, we try to remove spaces from the right side, -# if this doesn't work we try on left side but only if there's a non-character -# on the right. -_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( - r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + - _RE_PATTERN_C_COMMENTS + r'\s+|' + - r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' + - _RE_PATTERN_C_COMMENTS + r')') - - -def IsCppString(line): - """Does line terminate so, that the next symbol is in string constant. - - This function does not consider comments at all. - - Args: - line: is a partial line of code starting from the 0..n. - - Returns: - True, if next character appended to 'line' is inside a - string constant. - """ - - line = line.replace(r'\\', 'XX') # after this, \\" does not match to \" - return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1 - - -def CleanseRawStrings(raw_lines): - """Removes C++11 raw strings from lines. - - Before: - static const char kData[] = R"( - multi-line string - )"; - - After: - static const char kData[] = "" - (replaced by blank line) - ""; - - Args: - raw_lines: list of raw lines. - - Returns: - list of lines with C++11 raw strings replaced by empty strings. 
- """ - - delimiter = None - lines_without_raw_strings = [] - for line in raw_lines: - if delimiter: - # Inside a raw string, look for the end - end = line.find(delimiter) - if end >= 0: - # Found the end of the string, match leading space for this - # line and resume copying the original lines, and also insert - # a "" on the last line. - leading_space = Match(r'^(\s*)\S', line) - line = leading_space.group(1) + '""' + line[end + len(delimiter):] - delimiter = None - else: - # Haven't found the end yet, append a blank line. - line = '""' - - # Look for beginning of a raw string, and replace them with - # empty strings. This is done in a loop to handle multiple raw - # strings on the same line. - while delimiter is None: - # Look for beginning of a raw string. - # See 2.14.15 [lex.string] for syntax. - # - # Once we have matched a raw string, we check the prefix of the - # line to make sure that the line is not part of a single line - # comment. It's done this way because we remove raw strings - # before removing comments as opposed to removing comments - # before removing raw strings. This is because there are some - # cpplint checks that requires the comments to be preserved, but - # we don't want to check comments that are inside raw strings. - matched = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) - if (matched and - not Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//', - matched.group(1))): - delimiter = ')' + matched.group(2) + '"' - - end = matched.group(3).find(delimiter) - if end >= 0: - # Raw string ended on same line - line = (matched.group(1) + '""' + - matched.group(3)[end + len(delimiter):]) - delimiter = None - else: - # Start of a multi-line raw string - line = matched.group(1) + '""' - else: - break - - lines_without_raw_strings.append(line) - - # TODO(unknown): if delimiter is not None here, we might want to - # emit a warning for unterminated string. 
- return lines_without_raw_strings - - -def FindNextMultiLineCommentStart(lines, lineix): - """Find the beginning marker for a multiline comment.""" - while lineix < len(lines): - if lines[lineix].strip().startswith('/*'): - # Only return this marker if the comment goes beyond this line - if lines[lineix].strip().find('*/', 2) < 0: - return lineix - lineix += 1 - return len(lines) - - -def FindNextMultiLineCommentEnd(lines, lineix): - """We are inside a comment, find the end marker.""" - while lineix < len(lines): - if lines[lineix].strip().endswith('*/'): - return lineix - lineix += 1 - return len(lines) - - -def RemoveMultiLineCommentsFromRange(lines, begin, end): - """Clears a range of lines for multi-line comments.""" - # Having // comments makes the lines non-empty, so we will not get - # unnecessary blank line warnings later in the code. - for i in range(begin, end): - lines[i] = '/**/' - - -def RemoveMultiLineComments(filename, lines, error): - """Removes multiline (c-style) comments from lines.""" - lineix = 0 - while lineix < len(lines): - lineix_begin = FindNextMultiLineCommentStart(lines, lineix) - if lineix_begin >= len(lines): - return - lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin) - if lineix_end >= len(lines): - error(filename, lineix_begin + 1, 'readability/multiline_comment', 5, - 'Could not find end of multi-line comment') - return - RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1) - lineix = lineix_end + 1 - - -def CleanseComments(line): - """Removes //-comments and single-line C-style /* */ comments. - - Args: - line: A line of C++ source. - - Returns: - The line with single-line comments removed. - """ - commentpos = line.find('//') - if commentpos != -1 and not IsCppString(line[:commentpos]): - line = line[:commentpos].rstrip() - # get rid of /* ... 
*/ - return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line) - - -class CleansedLines(object): - """Holds 4 copies of all lines with different preprocessing applied to them. - - 1) elided member contains lines without strings and comments. - 2) lines member contains lines without comments. - 3) raw_lines member contains all the lines without processing. - 4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw - strings removed. - All these members are of , and of the same length. - """ - - def __init__(self, lines): - self.elided = [] - self.lines = [] - self.raw_lines = lines - self.num_lines = len(lines) - self.lines_without_raw_strings = CleanseRawStrings(lines) - # # pylint: disable=consider-using-enumerate - for linenum in range(len(self.lines_without_raw_strings)): - self.lines.append(CleanseComments( - self.lines_without_raw_strings[linenum])) - elided = self._CollapseStrings(self.lines_without_raw_strings[linenum]) - self.elided.append(CleanseComments(elided)) - - def NumLines(self): - """Returns the number of lines represented.""" - return self.num_lines - - @staticmethod - def _CollapseStrings(elided): - """Collapses strings and chars on a line to simple "" or '' blocks. - - We nix strings first so we're not fooled by text like '"http://"' - - Args: - elided: The line being processed. - - Returns: - The line with collapsed strings. - """ - if _RE_PATTERN_INCLUDE.match(elided): - return elided - - # Remove escaped characters first to make quote/single quote collapsing - # basic. Things that look like escaped characters shouldn't occur - # outside of strings and chars. - elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) - - # Replace quoted strings and digit separators. Both single quotes - # and double quotes are processed in the same loop, otherwise - # nested quotes wouldn't work. 
- collapsed = '' - while True: - # Find the first quote character - match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) - if not match: - collapsed += elided - break - head, quote, tail = match.groups() - - if quote == '"': - # Collapse double quoted strings - second_quote = tail.find('"') - if second_quote >= 0: - collapsed += head + '""' - elided = tail[second_quote + 1:] - else: - # Unmatched double quote, don't bother processing the rest - # of the line since this is probably a multiline string. - collapsed += elided - break - else: - # Found single quote, check nearby text to eliminate digit separators. - # - # There is no special handling for floating point here, because - # the integer/fractional/exponent parts would all be parsed - # correctly as long as there are digits on both sides of the - # separator. So we are fine as long as we don't see something - # like "0.'3" (gcc 4.9.0 will not allow this literal). - if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): - match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail) - collapsed += head + match_literal.group(1).replace("'", '') - elided = match_literal.group(2) - else: - second_quote = tail.find('\'') - if second_quote >= 0: - collapsed += head + "''" - elided = tail[second_quote + 1:] - else: - # Unmatched single quote - collapsed += elided - break - - return collapsed - - -def FindEndOfExpressionInLine(line, startpos, stack): - """Find the position just after the end of current parenthesized expression. - - Args: - line: a CleansedLines line. - startpos: start searching at this position. - stack: nesting stack at startpos. 
- - Returns: - On finding matching end: (index just after matching end, None) - On finding an unclosed expression: (-1, None) - Otherwise: (-1, new stack at end of this line) - """ - for i in xrange(startpos, len(line)): - char = line[i] - if char in '([{': - # Found start of parenthesized expression, push to expression stack - stack.append(char) - elif char == '<': - # Found potential start of template argument list - if i > 0 and line[i - 1] == '<': - # Left shift operator - if stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - elif i > 0 and Search(r'\boperator\s*$', line[0:i]): - # operator<, don't add to stack - continue - else: - # Tentative start of template argument list - stack.append('<') - elif char in ')]}': - # Found end of parenthesized expression. - # - # If we are currently expecting a matching '>', the pending '<' - # must have been an operator. Remove them from expression stack. - while stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - if ((stack[-1] == '(' and char == ')') or - (stack[-1] == '[' and char == ']') or - (stack[-1] == '{' and char == '}')): - stack.pop() - if not stack: - return (i + 1, None) - else: - # Mismatched parentheses - return (-1, None) - elif char == '>': - # Found potential end of template argument list. - - # Ignore "->" and operator functions - if (i > 0 and - (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))): - continue - - # Pop the stack if there is a matching '<'. Otherwise, ignore - # this '>' since it must be an operator. - if stack: - if stack[-1] == '<': - stack.pop() - if not stack: - return (i + 1, None) - elif char == ';': - # Found something that look like end of statements. If we are currently - # expecting a '>', the matching '<' must have been an operator, since - # template argument list should not contain statements. 
- while stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - - # Did not find end of expression or unbalanced parentheses on this line - return (-1, stack) - - -def CloseExpression(clean_lines, linenum, pos): - """If input points to ( or { or [ or <, finds the position that closes it. - - If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the - linenum/pos that correspond to the closing of the expression. - - TODO(unknown): cpplint spends a fair bit of time matching parentheses. - Ideally we would want to index all opening and closing parentheses once - and have CloseExpression be just a simple lookup, but due to preprocessor - tricks, this is not so easy. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - pos: A position on the line. - - Returns: - A tuple (line, linenum, pos) pointer *past* the closing brace, or - (line, len(lines), -1) if we never find a close. Note we ignore - strings and comments when matching; and the line we return is the - 'cleansed' line at linenum. - """ - - line = clean_lines.elided[linenum] - if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): - return (line, clean_lines.NumLines(), -1) - - # Check first line - (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) - if end_pos > -1: - return (line, linenum, end_pos) - - # Continue scanning forward - while stack and linenum < clean_lines.NumLines() - 1: - linenum += 1 - line = clean_lines.elided[linenum] - (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) - if end_pos > -1: - return (line, linenum, end_pos) - - # Did not find end of expression before end of file, give up - return (line, clean_lines.NumLines(), -1) - - -def FindStartOfExpressionInLine(line, endpos, stack): - """Find position at the matching start of current expression. 
- - This is almost the reverse of FindEndOfExpressionInLine, but note - that the input position and returned position differs by 1. - - Args: - line: a CleansedLines line. - endpos: start searching at this position. - stack: nesting stack at endpos. - - Returns: - On finding matching start: (index at matching start, None) - On finding an unclosed expression: (-1, None) - Otherwise: (-1, new stack at beginning of this line) - """ - i = endpos - while i >= 0: - char = line[i] - if char in ')]}': - # Found end of expression, push to expression stack - stack.append(char) - elif char == '>': - # Found potential end of template argument list. - # - # Ignore it if it's a "->" or ">=" or "operator>" - if (i > 0 and - (line[i - 1] == '-' or - Match(r'\s>=\s', line[i - 1:]) or - Search(r'\boperator\s*$', line[0:i]))): - i -= 1 - else: - stack.append('>') - elif char == '<': - # Found potential start of template argument list - if i > 0 and line[i - 1] == '<': - # Left shift operator - i -= 1 - else: - # If there is a matching '>', we can pop the expression stack. - # Otherwise, ignore this '<' since it must be an operator. - if stack and stack[-1] == '>': - stack.pop() - if not stack: - return (i, None) - elif char in '([{': - # Found start of expression. - # - # If there are any unmatched '>' on the stack, they must be - # operators. Remove those. - while stack and stack[-1] == '>': - stack.pop() - if not stack: - return (-1, None) - if ((char == '(' and stack[-1] == ')') or - (char == '[' and stack[-1] == ']') or - (char == '{' and stack[-1] == '}')): - stack.pop() - if not stack: - return (i, None) - else: - # Mismatched parentheses - return (-1, None) - elif char == ';': - # Found something that look like end of statements. If we are currently - # expecting a '<', the matching '>' must have been an operator, since - # template argument list should not contain statements. 
- while stack and stack[-1] == '>': - stack.pop() - if not stack: - return (-1, None) - - i -= 1 - - return (-1, stack) - - -def ReverseCloseExpression(clean_lines, linenum, pos): - """If input points to ) or } or ] or >, finds the position that opens it. - - If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the - linenum/pos that correspond to the opening of the expression. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - pos: A position on the line. - - Returns: - A tuple (line, linenum, pos) pointer *at* the opening brace, or - (line, 0, -1) if we never find the matching opening brace. Note - we ignore strings and comments when matching; and the line we - return is the 'cleansed' line at linenum. - """ - line = clean_lines.elided[linenum] - if line[pos] not in ')}]>': - return (line, 0, -1) - - # Check last line - (start_pos, stack) = FindStartOfExpressionInLine(line, pos, []) - if start_pos > -1: - return (line, linenum, start_pos) - - # Continue scanning backward - while stack and linenum > 0: - linenum -= 1 - line = clean_lines.elided[linenum] - (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack) - if start_pos > -1: - return (line, linenum, start_pos) - - # Did not find start of expression before beginning of file, give up - return (line, 0, -1) - - -def CheckForCopyright(filename, lines, error): - """Logs an error if no Copyright message appears at the top of the file.""" - - # We'll say it should occur by line 10. Don't forget there's a - # placeholder line at the front. - for line in xrange(1, min(len(lines), 11)): - if re.search(r'Copyright', lines[line], re.I): break - else: # means no copyright line was found - error(filename, 0, 'legal/copyright', 5, - 'No copyright message found. ' - 'You should have a line: "Copyright [year] "') - - -def GetIndentLevel(line): - """Return the number of leading spaces in line. 
- - Args: - line: A string to check. - - Returns: - An integer count of leading spaces, possibly zero. - """ - indent = Match(r'^( *)\S', line) - if indent: - return len(indent.group(1)) - else: - return 0 - -def PathSplitToList(path): - """Returns the path split into a list by the separator. - - Args: - path: An absolute or relative path (e.g. '/a/b/c/' or '../a') - - Returns: - A list of path components (e.g. ['a', 'b', 'c]). - """ - lst = [] - while True: - (head, tail) = os.path.split(path) - if head == path: # absolute paths end - lst.append(head) - break - if tail == path: # relative paths end - lst.append(tail) - break - - path = head - lst.append(tail) - - lst.reverse() - return lst - -def GetHeaderGuardCPPVariable(filename): - """Returns the CPP variable that should be used as a header guard. - - Args: - filename: The name of a C++ header file. - - Returns: - The CPP variable that should be used as a header guard in the - named file. - - """ - - # Restores original filename in case that cpplint is invoked from Emacs's - # flymake. - filename = re.sub(r'_flymake\.h$', '.h', filename) - filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename) - # Replace 'c++' with 'cpp'. - filename = filename.replace('C++', 'cpp').replace('c++', 'cpp') - - fileinfo = FileInfo(filename) - file_path_from_root = fileinfo.RepositoryName() - - def FixupPathFromRoot(): - if _root_debug: - sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n" - % (_root, fileinfo.RepositoryName())) - - # Process the file path with the --root flag if it was set. 
- if not _root: - if _root_debug: - sys.stderr.write("_root unspecified\n") - return file_path_from_root - - def StripListPrefix(lst, prefix): - # f(['x', 'y'], ['w, z']) -> None (not a valid prefix) - if lst[:len(prefix)] != prefix: - return None - # f(['a, 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd'] - return lst[(len(prefix)):] - - # root behavior: - # --root=subdir , lstrips subdir from the header guard - maybe_path = StripListPrefix(PathSplitToList(file_path_from_root), - PathSplitToList(_root)) - - if _root_debug: - sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," + - " _root=%s)\n") % (maybe_path, file_path_from_root, _root)) - - if maybe_path: - return os.path.join(*maybe_path) - - # --root=.. , will prepend the outer directory to the header guard - full_path = fileinfo.FullName() - # adapt slashes for windows - root_abspath = os.path.abspath(_root).replace('\\', '/') - - maybe_path = StripListPrefix(PathSplitToList(full_path), - PathSplitToList(root_abspath)) - - if _root_debug: - sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " + - "root_abspath=%s)\n") % (maybe_path, full_path, root_abspath)) - - if maybe_path: - return os.path.join(*maybe_path) - - if _root_debug: - sys.stderr.write("_root ignore, returning %s\n" % (file_path_from_root)) - - # --root=FAKE_DIR is ignored - return file_path_from_root - - file_path_from_root = FixupPathFromRoot() - return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_' - - -def CheckForHeaderGuard(filename, clean_lines, error): - """Checks that the file contains a header guard. - - Logs an error if no #ifndef header guard is present. For other - headers, checks that the full pathname is used. - - Args: - filename: The name of the C++ header file. - clean_lines: A CleansedLines instance containing the file. - error: The function to call with any errors found. - """ - - # Don't check for header guards if there are error suppression - # comments somewhere in this file. 
- # - # Because this is silencing a warning for a nonexistent line, we - # only support the very specific NOLINT(build/header_guard) syntax, - # and not the general NOLINT or NOLINT(*) syntax. - raw_lines = clean_lines.lines_without_raw_strings - for i in raw_lines: - if Search(r'//\s*NOLINT\(build/header_guard\)', i): - return - - # Allow pragma once instead of header guards - for i in raw_lines: - if Search(r'^\s*#pragma\s+once', i): - return - - cppvar = GetHeaderGuardCPPVariable(filename) - - ifndef = '' - ifndef_linenum = 0 - define = '' - endif = '' - endif_linenum = 0 - for linenum, line in enumerate(raw_lines): - linesplit = line.split() - if len(linesplit) >= 2: - # find the first occurrence of #ifndef and #define, save arg - if not ifndef and linesplit[0] == '#ifndef': - # set ifndef to the header guard presented on the #ifndef line. - ifndef = linesplit[1] - ifndef_linenum = linenum - if not define and linesplit[0] == '#define': - define = linesplit[1] - # find the last occurrence of #endif, save entire line - if line.startswith('#endif'): - endif = line - endif_linenum = linenum - - if not ifndef or not define or ifndef != define: - error(filename, 0, 'build/header_guard', 5, - 'No #ifndef header guard found, suggested CPP variable is: %s' % - cppvar) - return - - # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ - # for backward compatibility. - if ifndef != cppvar: - error_level = 0 - if ifndef != cppvar + '_': - error_level = 5 - - ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum, - error) - error(filename, ifndef_linenum, 'build/header_guard', error_level, - '#ifndef header guard has wrong style, please use: %s' % cppvar) - - # Check for "//" comments on endif line. 
- ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, - error) - match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) - if match: - if match.group(1) == '_': - # Issue low severity warning for deprecated double trailing underscore - error(filename, endif_linenum, 'build/header_guard', 0, - '#endif line should be "#endif // %s"' % cppvar) - return - - # Didn't find the corresponding "//" comment. If this file does not - # contain any "//" comments at all, it could be that the compiler - # only wants "/**/" comments, look for those instead. - no_single_line_comments = True - for i in xrange(1, len(raw_lines) - 1): - line = raw_lines[i] - if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line): - no_single_line_comments = False - break - - if no_single_line_comments: - match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) - if match: - if match.group(1) == '_': - # Low severity warning for double trailing underscore - error(filename, endif_linenum, 'build/header_guard', 0, - '#endif line should be "#endif /* %s */"' % cppvar) - return - - # Didn't find anything - error(filename, endif_linenum, 'build/header_guard', 5, - '#endif line should be "#endif // %s"' % cppvar) - - -def CheckHeaderFileIncluded(filename, include_state, error): - """Logs an error if a source file does not include its header.""" - - # Do not check test files - fileinfo = FileInfo(filename) - if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()): - return - - for ext in GetHeaderExtensions(): - basefilename = filename[0:len(filename) - len(fileinfo.Extension())] - headerfile = basefilename + '.' 
+ ext - if not os.path.exists(headerfile): - continue - headername = FileInfo(headerfile).RepositoryName() - first_include = None - include_uses_unix_dir_aliases = False - for section_list in include_state.include_list: - for f in section_list: - include_text = f[0] - if "./" in include_text: - include_uses_unix_dir_aliases = True - if headername in include_text or include_text in headername: - return - if not first_include: - first_include = f[1] - - message = '%s should include its header file %s' % (fileinfo.RepositoryName(), headername) - if include_uses_unix_dir_aliases: - message += ". Relative paths like . and .. are not allowed." - - error(filename, first_include, 'build/include', 5, message) - - -def CheckForBadCharacters(filename, lines, error): - """Logs an error for each line containing bad characters. - - Two kinds of bad characters: - - 1. Unicode replacement characters: These indicate that either the file - contained invalid UTF-8 (likely) or Unicode replacement characters (which - it shouldn't). Note that it's possible for this to throw off line - numbering if the invalid UTF-8 occurred adjacent to a newline. - - 2. NUL bytes. These are problematic for some tools. - - Args: - filename: The name of the current file. - lines: An array of strings, each representing a line of the file. - error: The function to call with any errors found. - """ - for linenum, line in enumerate(lines): - if unicode_escape_decode('\ufffd') in line: - error(filename, linenum, 'readability/utf8', 5, - 'Line contains invalid UTF-8 (or Unicode replacement character).') - if '\0' in line: - error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') - - -def CheckForNewlineAtEOF(filename, lines, error): - """Logs an error if there is no newline char at the end of the file. - - Args: - filename: The name of the current file. - lines: An array of strings, each representing a line of the file. - error: The function to call with any errors found. 
- """ - - # The array lines() was created by adding two newlines to the - # original file (go figure), then splitting on \n. - # To verify that the file ends in \n, we just have to make sure the - # last-but-two element of lines() exists and is empty. - if len(lines) < 3 or lines[-2]: - error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, - 'Could not find a newline character at the end of the file.') - - -def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): - """Logs an error if we see /* ... */ or "..." that extend past one line. - - /* ... */ comments are legit inside macros, for one line. - Otherwise, we prefer // comments, so it's ok to warn about the - other. Likewise, it's ok for strings to extend across multiple - lines, as long as a line continuation character (backslash) - terminates each line. Although not currently prohibited by the C++ - style guide, it's ugly and unnecessary. We don't do well with either - in this lint program, so we warn about both. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Remove all \\ (escaped backslashes) from the line. They are OK, and the - # second (escaped) slash may trigger later \" detection erroneously. - line = line.replace('\\\\', '') - - if line.count('/*') > line.count('*/'): - error(filename, linenum, 'readability/multiline_comment', 5, - 'Complex multi-line /*...*/-style comment found. ' - 'Lint may give bogus warnings. ' - 'Consider replacing these with //-style comments, ' - 'with #if 0...#endif, ' - 'or with more clearly structured multi-line comments.') - - if (line.count('"') - line.count('\\"')) % 2: - error(filename, linenum, 'readability/multiline_string', 5, - 'Multi-line string ("...") found. 
This lint script doesn\'t ' - 'do well with such strings, and may give bogus warnings. ' - 'Use C++11 raw strings or concatenation instead.') - - -# (non-threadsafe name, thread-safe alternative, validation pattern) -# -# The validation pattern is used to eliminate false positives such as: -# _rand(); // false positive due to substring match. -# ->rand(); // some member function rand(). -# ACMRandom rand(seed); // some variable named rand. -# ISAACRandom rand(); // another variable named rand. -# -# Basically we require the return value of these functions to be used -# in some expression context on the same line by matching on some -# operator before the function name. This eliminates constructors and -# member function calls. -_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)' -_THREADING_LIST = ( - ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'), - ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'), - ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'), - ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'), - ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'), - ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'), - ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'), - ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'), - ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'), - ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'), - ('strtok(', 'strtok_r(', - _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'), - ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), - ) - - -def CheckPosixThreading(filename, clean_lines, linenum, error): - """Checks for calls to thread-unsafe functions. - - Much code has been originally written without consideration of - multi-threading. 
Also, engineers are relying on their old experience; - they have learned posix before threading extensions were added. These - tests guide the engineers to use thread-safe functions (when using - posix directly). - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST: - # Additional pattern matching check to confirm that this is the - # function we are looking for - if Search(pattern, line): - error(filename, linenum, 'runtime/threadsafe_fn', 2, - 'Consider using ' + multithread_safe_func + - '...) instead of ' + single_thread_func + - '...) for improved thread safety.') - - -def CheckVlogArguments(filename, clean_lines, linenum, error): - """Checks that VLOG() is only used for defining a logging level. - - For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and - VLOG(FATAL) are not. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): - error(filename, linenum, 'runtime/vlog', 5, - 'VLOG() should be used with numeric verbosity level. ' - 'Use LOG() if you want symbolic severity levels.') - -# Matches invalid increment: *count++, which moves pointer instead of -# incrementing a value. -_RE_PATTERN_INVALID_INCREMENT = re.compile( - r'^\s*\*\w+(\+\+|--);') - - -def CheckInvalidIncrement(filename, clean_lines, linenum, error): - """Checks for invalid increment *count++. 
- - For example following function: - void increment_counter(int* count) { - *count++; - } - is invalid, because it effectively does count++, moving pointer, and should - be replaced with ++*count, (*count)++ or *count += 1. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - if _RE_PATTERN_INVALID_INCREMENT.match(line): - error(filename, linenum, 'runtime/invalid_increment', 5, - 'Changing pointer instead of value (or unused value of operator*).') - - -def IsMacroDefinition(clean_lines, linenum): - if Search(r'^#define', clean_lines[linenum]): - return True - - if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]): - return True - - return False - - -def IsForwardClassDeclaration(clean_lines, linenum): - return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum]) - - -class _BlockInfo(object): - """Stores information about a generic block of code.""" - - def __init__(self, linenum, seen_open_brace): - self.starting_linenum = linenum - self.seen_open_brace = seen_open_brace - self.open_parentheses = 0 - self.inline_asm = _NO_ASM - self.check_namespace_indentation = False - - def CheckBegin(self, filename, clean_lines, linenum, error): - """Run checks that applies to text up to the opening brace. - - This is mostly for checking the text after the class identifier - and the "{", usually where the base class is specified. For other - blocks, there isn't much to check, so we always pass. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - pass - - def CheckEnd(self, filename, clean_lines, linenum, error): - """Run checks that applies to text after the closing brace. 
- - This is mostly used for checking end of namespace comments. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - pass - - def IsBlockInfo(self): - """Returns true if this block is a _BlockInfo. - - This is convenient for verifying that an object is an instance of - a _BlockInfo, but not an instance of any of the derived classes. - - Returns: - True for this class, False for derived classes. - """ - return self.__class__ == _BlockInfo - - -class _ExternCInfo(_BlockInfo): - """Stores information about an 'extern "C"' block.""" - - def __init__(self, linenum): - _BlockInfo.__init__(self, linenum, True) - - -class _ClassInfo(_BlockInfo): - """Stores information about a class.""" - - def __init__(self, name, class_or_struct, clean_lines, linenum): - _BlockInfo.__init__(self, linenum, False) - self.name = name - self.is_derived = False - self.check_namespace_indentation = True - if class_or_struct == 'struct': - self.access = 'public' - self.is_struct = True - else: - self.access = 'private' - self.is_struct = False - - # Remember initial indentation level for this class. Using raw_lines here - # instead of elided to account for leading comments. - self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) - - # Try to find the end of the class. This will be confused by things like: - # class A { - # } *x = { ... - # - # But it's still good enough for CheckSectionSpacing. 
- self.last_line = 0 - depth = 0 - for i in range(linenum, clean_lines.NumLines()): - line = clean_lines.elided[i] - depth += line.count('{') - line.count('}') - if not depth: - self.last_line = i - break - - def CheckBegin(self, filename, clean_lines, linenum, error): - # Look for a bare ':' - if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): - self.is_derived = True - - def CheckEnd(self, filename, clean_lines, linenum, error): - # If there is a DISALLOW macro, it should appear near the end of - # the class. - seen_last_thing_in_class = False - for i in xrange(linenum - 1, self.starting_linenum, -1): - match = Search( - r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + - self.name + r'\)', - clean_lines.elided[i]) - if match: - if seen_last_thing_in_class: - error(filename, i, 'readability/constructors', 3, - match.group(1) + ' should be the last thing in the class') - break - - if not Match(r'^\s*$', clean_lines.elided[i]): - seen_last_thing_in_class = True - - # Check that closing brace is aligned with beginning of the class. - # Only do this if the closing brace is indented by only whitespaces. - # This means we will not check single-line class definitions. - indent = Match(r'^( *)\}', clean_lines.elided[linenum]) - if indent and len(indent.group(1)) != self.class_indent: - if self.is_struct: - parent = 'struct ' + self.name - else: - parent = 'class ' + self.name - error(filename, linenum, 'whitespace/indent', 3, - 'Closing brace should be aligned with beginning of %s' % parent) - - -class _NamespaceInfo(_BlockInfo): - """Stores information about a namespace.""" - - def __init__(self, name, linenum): - _BlockInfo.__init__(self, linenum, False) - self.name = name or '' - self.check_namespace_indentation = True - - def CheckEnd(self, filename, clean_lines, linenum, error): - """Check end of namespace comments.""" - line = clean_lines.raw_lines[linenum] - - # Check how many lines is enclosed in this namespace. 
Don't issue - # warning for missing namespace comments if there aren't enough - # lines. However, do apply checks if there is already an end of - # namespace comment and it's incorrect. - # - # TODO(unknown): We always want to check end of namespace comments - # if a namespace is large, but sometimes we also want to apply the - # check if a short namespace contained nontrivial things (something - # other than forward declarations). There is currently no logic on - # deciding what these nontrivial things are, so this check is - # triggered by namespace size only, which works most of the time. - if (linenum - self.starting_linenum < 10 - and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b', line)): - return - - # Look for matching comment at end of namespace. - # - # Note that we accept C style "/* */" comments for terminating - # namespaces, so that code that terminate namespaces inside - # preprocessor macros can be cpplint clean. - # - # We also accept stuff like "// end of namespace ." with the - # period at the end. - # - # Besides these, we don't accept anything else, otherwise we might - # get false negatives when existing comment is a substring of the - # expected namespace. 
- if self.name: - # Named namespace - if not Match((r'^\s*};*\s*(//|/\*).*\bnamespace\s+' + - re.escape(self.name) + r'[\*/\.\\\s]*$'), - line): - error(filename, linenum, 'readability/namespace', 5, - 'Namespace should be terminated with "// namespace %s"' % - self.name) - else: - # Anonymous namespace - if not Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): - # If "// namespace anonymous" or "// anonymous namespace (more text)", - # mention "// anonymous namespace" as an acceptable form - if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b', line): - error(filename, linenum, 'readability/namespace', 5, - 'Anonymous namespace should be terminated with "// namespace"' - ' or "// anonymous namespace"') - else: - error(filename, linenum, 'readability/namespace', 5, - 'Anonymous namespace should be terminated with "// namespace"') - - -class _PreprocessorInfo(object): - """Stores checkpoints of nesting stacks when #if/#else is seen.""" - - def __init__(self, stack_before_if): - # The entire nesting stack before #if - self.stack_before_if = stack_before_if - - # The entire nesting stack up to #else - self.stack_before_else = [] - - # Whether we have already seen #else or #elif - self.seen_else = False - - -class NestingState(object): - """Holds states related to parsing braces.""" - - def __init__(self): - # Stack for tracking all braces. An object is pushed whenever we - # see a "{", and popped when we see a "}". Only 3 types of - # objects are possible: - # - _ClassInfo: a class or struct. - # - _NamespaceInfo: a namespace. - # - _BlockInfo: some other type of block. - self.stack = [] - - # Top of the previous stack before each Update(). - # - # Because the nesting_stack is updated at the end of each line, we - # had to do some convoluted checks to find out what is the current - # scope at the beginning of the line. This check is simplified by - # saving the previous top of nesting stack. 
- # - # We could save the full stack, but we only need the top. Copying - # the full nesting stack would slow down cpplint by ~10%. - self.previous_stack_top = [] - - # Stack of _PreprocessorInfo objects. - self.pp_stack = [] - - def SeenOpenBrace(self): - """Check if we have seen the opening brace for the innermost block. - - Returns: - True if we have seen the opening brace, False if the innermost - block is still expecting an opening brace. - """ - return (not self.stack) or self.stack[-1].seen_open_brace - - def InNamespaceBody(self): - """Check if we are currently one level inside a namespace body. - - Returns: - True if top of the stack is a namespace block, False otherwise. - """ - return self.stack and isinstance(self.stack[-1], _NamespaceInfo) - - def InExternC(self): - """Check if we are currently one level inside an 'extern "C"' block. - - Returns: - True if top of the stack is an extern block, False otherwise. - """ - return self.stack and isinstance(self.stack[-1], _ExternCInfo) - - def InClassDeclaration(self): - """Check if we are currently one level inside a class or struct declaration. - - Returns: - True if top of the stack is a class/struct, False otherwise. - """ - return self.stack and isinstance(self.stack[-1], _ClassInfo) - - def InAsmBlock(self): - """Check if we are currently one level inside an inline ASM block. - - Returns: - True if the top of the stack is a block containing inline ASM. - """ - return self.stack and self.stack[-1].inline_asm != _NO_ASM - - def InTemplateArgumentList(self, clean_lines, linenum, pos): - """Check if current position is inside template argument list. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - pos: position just after the suspected template argument. - Returns: - True if (linenum, pos) is inside template arguments. 
- """ - while linenum < clean_lines.NumLines(): - # Find the earliest character that might indicate a template argument - line = clean_lines.elided[linenum] - match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:]) - if not match: - linenum += 1 - pos = 0 - continue - token = match.group(1) - pos += len(match.group(0)) - - # These things do not look like template argument list: - # class Suspect { - # class Suspect x; } - if token in ('{', '}', ';'): return False - - # These things look like template argument list: - # template - # template - # template - # template - if token in ('>', '=', '[', ']', '.'): return True - - # Check if token is an unmatched '<'. - # If not, move on to the next character. - if token != '<': - pos += 1 - if pos >= len(line): - linenum += 1 - pos = 0 - continue - - # We can't be sure if we just find a single '<', and need to - # find the matching '>'. - (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1) - if end_pos < 0: - # Not sure if template argument list or syntax error in file - return False - linenum = end_line - pos = end_pos - return False - - def UpdatePreprocessor(self, line): - """Update preprocessor stack. - - We need to handle preprocessors due to classes like this: - #ifdef SWIG - struct ResultDetailsPageElementExtensionPoint { - #else - struct ResultDetailsPageElementExtensionPoint : public Extension { - #endif - - We make the following assumptions (good enough for most files): - - Preprocessor condition evaluates to true from #if up to first - #else/#elif/#endif. - - - Preprocessor condition evaluates to false from #else/#elif up - to #endif. We still perform lint checks on these lines, but - these do not affect nesting stack. - - Args: - line: current line to check. - """ - if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): - # Beginning of #if block, save the nesting stack here. The saved - # stack will allow us to restore the parsing state in the #else case. 
- self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) - elif Match(r'^\s*#\s*(else|elif)\b', line): - # Beginning of #else block - if self.pp_stack: - if not self.pp_stack[-1].seen_else: - # This is the first #else or #elif block. Remember the - # whole nesting stack up to this point. This is what we - # keep after the #endif. - self.pp_stack[-1].seen_else = True - self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) - - # Restore the stack to how it was before the #if - self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) - else: - # TODO(unknown): unexpected #else, issue warning? - pass - elif Match(r'^\s*#\s*endif\b', line): - # End of #if or #else blocks. - if self.pp_stack: - # If we saw an #else, we will need to restore the nesting - # stack to its former state before the #else, otherwise we - # will just continue from where we left off. - if self.pp_stack[-1].seen_else: - # Here we can just use a shallow copy since we are the last - # reference to it. - self.stack = self.pp_stack[-1].stack_before_else - # Drop the corresponding #if - self.pp_stack.pop() - else: - # TODO(unknown): unexpected #endif, issue warning? - pass - - # TODO(unknown): Update() is too long, but we will refactor later. - def Update(self, filename, clean_lines, linenum, error): - """Update nesting state with current line. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Remember top of the previous nesting stack. - # - # The stack is always pushed/popped and not modified in place, so - # we can just do a shallow copy instead of copy.deepcopy. Using - # deepcopy would slow down cpplint by ~28%. 
- if self.stack: - self.previous_stack_top = self.stack[-1] - else: - self.previous_stack_top = None - - # Update pp_stack - self.UpdatePreprocessor(line) - - # Count parentheses. This is to avoid adding struct arguments to - # the nesting stack. - if self.stack: - inner_block = self.stack[-1] - depth_change = line.count('(') - line.count(')') - inner_block.open_parentheses += depth_change - - # Also check if we are starting or ending an inline assembly block. - if inner_block.inline_asm in (_NO_ASM, _END_ASM): - if (depth_change != 0 and - inner_block.open_parentheses == 1 and - _MATCH_ASM.match(line)): - # Enter assembly block - inner_block.inline_asm = _INSIDE_ASM - else: - # Not entering assembly block. If previous line was _END_ASM, - # we will now shift to _NO_ASM state. - inner_block.inline_asm = _NO_ASM - elif (inner_block.inline_asm == _INSIDE_ASM and - inner_block.open_parentheses == 0): - # Exit assembly block - inner_block.inline_asm = _END_ASM - - # Consume namespace declaration at the beginning of the line. Do - # this in a loop so that we catch same line declarations like this: - # namespace proto2 { namespace bridge { class MessageSet; } } - while True: - # Match start of namespace. The "\b\s*" below catches namespace - # declarations even if it weren't followed by a whitespace, this - # is so that we don't confuse our namespace checker. The - # missing spaces will be flagged by CheckSpacing. - namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line) - if not namespace_decl_match: - break - - new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum) - self.stack.append(new_namespace) - - line = namespace_decl_match.group(2) - if line.find('{') != -1: - new_namespace.seen_open_brace = True - line = line[line.find('{') + 1:] - - # Look for a class declaration in whatever is left of the line - # after parsing namespaces. 
The regexp accounts for decorated classes - # such as in: - # class LOCKABLE API Object { - # }; - class_decl_match = Match( - r'^(\s*(?:template\s*<[\w\s<>,:=]*>\s*)?' - r'(class|struct)\s+(?:[a-zA-Z0-9_]+\s+)*(\w+(?:::\w+)*))' - r'(.*)$', line) - if (class_decl_match and - (not self.stack or self.stack[-1].open_parentheses == 0)): - # We do not want to accept classes that are actually template arguments: - # template , - # template class Ignore3> - # void Function() {}; - # - # To avoid template argument cases, we scan forward and look for - # an unmatched '>'. If we see one, assume we are inside a - # template argument list. - end_declaration = len(class_decl_match.group(1)) - if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration): - self.stack.append(_ClassInfo( - class_decl_match.group(3), class_decl_match.group(2), - clean_lines, linenum)) - line = class_decl_match.group(4) - - # If we have not yet seen the opening brace for the innermost block, - # run checks here. - if not self.SeenOpenBrace(): - self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) - - # Update access control if we are inside a class/struct - if self.stack and isinstance(self.stack[-1], _ClassInfo): - classinfo = self.stack[-1] - access_match = Match( - r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?' - r':(?:[^:]|$)', - line) - if access_match: - classinfo.access = access_match.group(2) - - # Check that access keywords are indented +1 space. Skip this - # check if the keywords are not preceded by whitespaces. 
- indent = access_match.group(1) - if (len(indent) != classinfo.class_indent + 1 and - Match(r'^\s*$', indent)): - if classinfo.is_struct: - parent = 'struct ' + classinfo.name - else: - parent = 'class ' + classinfo.name - slots = '' - if access_match.group(3): - slots = access_match.group(3) - error(filename, linenum, 'whitespace/indent', 3, - '%s%s: should be indented +1 space inside %s' % ( - access_match.group(2), slots, parent)) - - # Consume braces or semicolons from what's left of the line - while True: - # Match first brace, semicolon, or closed parenthesis. - matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) - if not matched: - break - - token = matched.group(1) - if token == '{': - # If namespace or class hasn't seen a opening brace yet, mark - # namespace/class head as complete. Push a new block onto the - # stack otherwise. - if not self.SeenOpenBrace(): - self.stack[-1].seen_open_brace = True - elif Match(r'^extern\s*"[^"]*"\s*\{', line): - self.stack.append(_ExternCInfo(linenum)) - else: - self.stack.append(_BlockInfo(linenum, True)) - if _MATCH_ASM.match(line): - self.stack[-1].inline_asm = _BLOCK_ASM - - elif token == ';' or token == ')': - # If we haven't seen an opening brace yet, but we already saw - # a semicolon, this is probably a forward declaration. Pop - # the stack for these. - # - # Similarly, if we haven't seen an opening brace yet, but we - # already saw a closing parenthesis, then these are probably - # function arguments with extra "class" or "struct" keywords. - # Also pop these stack for these. - if not self.SeenOpenBrace(): - self.stack.pop() - else: # token == '}' - # Perform end of block checks and pop the stack. - if self.stack: - self.stack[-1].CheckEnd(filename, clean_lines, linenum, error) - self.stack.pop() - line = matched.group(2) - - def InnermostClass(self): - """Get class info on the top of the stack. - - Returns: - A _ClassInfo object if we are inside a class, or None otherwise. 
- """ - for i in range(len(self.stack), 0, -1): - classinfo = self.stack[i - 1] - if isinstance(classinfo, _ClassInfo): - return classinfo - return None - - def CheckCompletedBlocks(self, filename, error): - """Checks that all classes and namespaces have been completely parsed. - - Call this when all lines in a file have been processed. - Args: - filename: The name of the current file. - error: The function to call with any errors found. - """ - # Note: This test can result in false positives if #ifdef constructs - # get in the way of brace matching. See the testBuildClass test in - # cpplint_unittest.py for an example of this. - for obj in self.stack: - if isinstance(obj, _ClassInfo): - error(filename, obj.starting_linenum, 'build/class', 5, - 'Failed to find complete declaration of class %s' % - obj.name) - elif isinstance(obj, _NamespaceInfo): - error(filename, obj.starting_linenum, 'build/namespaces', 5, - 'Failed to find complete declaration of namespace %s' % - obj.name) - - -def CheckForNonStandardConstructs(filename, clean_lines, linenum, - nesting_state, error): - r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2. - - Complain about several constructs which gcc-2 accepts, but which are - not standard C++. Warning about these in lint is one way to ease the - transition to new compilers. - - put storage class first (e.g. "static const" instead of "const static"). - - "%lld" instead of %qd" in printf-type functions. - - "%1$d" is non-standard in printf-type functions. - - "\%" is an undefined character escape sequence. - - text after #endif is not allowed. - - invalid inner-style forward declaration. - - >? and ?= and )\?=?\s*(\w+|[+-]?\d+)(\.\d*)?', - line): - error(filename, linenum, 'build/deprecated', 3, - '>? and ))?' - # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;' - error(filename, linenum, 'runtime/member_string_references', 2, - 'const string& members are dangerous. 
It is much better to use ' - 'alternatives, such as pointers or simple constants.') - - # Everything else in this function operates on class declarations. - # Return early if the top of the nesting stack is not a class, or if - # the class head is not completed yet. - classinfo = nesting_state.InnermostClass() - if not classinfo or not classinfo.seen_open_brace: - return - - # The class may have been declared with namespace or classname qualifiers. - # The constructor and destructor will not have those qualifiers. - base_classname = classinfo.name.split('::')[-1] - - # Look for single-argument constructors that aren't marked explicit. - # Technically a valid construct, but against style. - explicit_constructor_match = Match( - r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?' - r'(?:(?:inline|constexpr)\s+)*%s\s*' - r'\(((?:[^()]|\([^()]*\))*)\)' - % re.escape(base_classname), - line) - - if explicit_constructor_match: - is_marked_explicit = explicit_constructor_match.group(1) - - if not explicit_constructor_match.group(2): - constructor_args = [] - else: - constructor_args = explicit_constructor_match.group(2).split(',') - - # collapse arguments so that commas in template parameter lists and function - # argument parameter lists don't split arguments in two - i = 0 - while i < len(constructor_args): - constructor_arg = constructor_args[i] - while (constructor_arg.count('<') > constructor_arg.count('>') or - constructor_arg.count('(') > constructor_arg.count(')')): - constructor_arg += ',' + constructor_args[i + 1] - del constructor_args[i + 1] - constructor_args[i] = constructor_arg - i += 1 - - variadic_args = [arg for arg in constructor_args if '&&...' 
in arg] - defaulted_args = [arg for arg in constructor_args if '=' in arg] - noarg_constructor = (not constructor_args or # empty arg list - # 'void' arg specifier - (len(constructor_args) == 1 and - constructor_args[0].strip() == 'void')) - onearg_constructor = ((len(constructor_args) == 1 and # exactly one arg - not noarg_constructor) or - # all but at most one arg defaulted - (len(constructor_args) >= 1 and - not noarg_constructor and - len(defaulted_args) >= len(constructor_args) - 1) or - # variadic arguments with zero or one argument - (len(constructor_args) <= 2 and - len(variadic_args) >= 1)) - initializer_list_constructor = bool( - onearg_constructor and - Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) - copy_constructor = bool( - onearg_constructor and - Match(r'((const\s+(volatile\s+)?)?|(volatile\s+(const\s+)?))?' - r'%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' - % re.escape(base_classname), constructor_args[0].strip())) - - if (not is_marked_explicit and - onearg_constructor and - not initializer_list_constructor and - not copy_constructor): - if defaulted_args or variadic_args: - error(filename, linenum, 'runtime/explicit', 5, - 'Constructors callable with one argument ' - 'should be marked explicit.') - else: - error(filename, linenum, 'runtime/explicit', 5, - 'Single-parameter constructors should be marked explicit.') - elif is_marked_explicit and not onearg_constructor: - if noarg_constructor: - error(filename, linenum, 'runtime/explicit', 5, - 'Zero-parameter constructors should not be marked explicit.') - - -def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): - """Checks for the correctness of various spacing around function calls. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. 
- """ - line = clean_lines.elided[linenum] - - # Since function calls often occur inside if/for/while/switch - # expressions - which have their own, more liberal conventions - we - # first see if we should be looking inside such an expression for a - # function call, to which we can apply more strict standards. - fncall = line # if there's no control flow construct, look at whole line - for pattern in (r'\bif\s*\((.*)\)\s*{', - r'\bfor\s*\((.*)\)\s*{', - r'\bwhile\s*\((.*)\)\s*[{;]', - r'\bswitch\s*\((.*)\)\s*{'): - match = Search(pattern, line) - if match: - fncall = match.group(1) # look inside the parens for function calls - break - - # Except in if/for/while/switch, there should never be space - # immediately inside parens (eg "f( 3, 4 )"). We make an exception - # for nested parens ( (a+b) + c ). Likewise, there should never be - # a space before a ( when it's a function argument. I assume it's a - # function argument when the char before the whitespace is legal in - # a function name (alnum + _) and we're not starting a macro. Also ignore - # pointers and references to arrays and functions coz they're too tricky: - # we use a very simple way to recognize these: - # " (something)(maybe-something)" or - # " (something)(maybe-something," or - # " (something)[something]" - # Note that we assume the contents of [] to be short enough that - # they'll never need to wrap. - if ( # Ignore control structures. - not Search(r'\b(if|elif|for|while|switch|return|new|delete|catch|sizeof)\b', - fncall) and - # Ignore pointers/references to functions. - not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and - # Ignore pointers/references to arrays. 
- not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): - if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space after ( in function call') - elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): - error(filename, linenum, 'whitespace/parens', 2, - 'Extra space after (') - if (Search(r'\w\s+\(', fncall) and - not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and - not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and - not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and - not Search(r'\bcase\s+\(', fncall)): - # TODO(unknown): Space after an operator function seem to be a common - # error, silence those for now by restricting them to highest verbosity. - if Search(r'\boperator_*\b', line): - error(filename, linenum, 'whitespace/parens', 0, - 'Extra space before ( in function call') - else: - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space before ( in function call') - # If the ) is followed only by a newline or a { + newline, assume it's - # part of a control statement (if/while/etc), and don't complain - if Search(r'[^)]\s+\)\s*[^{\s]', fncall): - # If the closing parenthesis is preceded by only whitespaces, - # try to give a more descriptive error message. - if Search(r'^\s+\)', fncall): - error(filename, linenum, 'whitespace/parens', 2, - 'Closing ) should be moved to the previous line') - else: - error(filename, linenum, 'whitespace/parens', 2, - 'Extra space before )') - - -def IsBlankLine(line): - """Returns true if the given line is blank. - - We consider a line to be blank if the line is empty or consists of - only white spaces. - - Args: - line: A line of a string. - - Returns: - True, if the given line is blank. 
- """ - return not line or line.isspace() - - -def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, - error): - is_namespace_indent_item = ( - len(nesting_state.stack) > 1 and - nesting_state.stack[-1].check_namespace_indentation and - isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and - nesting_state.previous_stack_top == nesting_state.stack[-2]) - - if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, - clean_lines.elided, line): - CheckItemIndentationInNamespace(filename, clean_lines.elided, - line, error) - - -def CheckForFunctionLengths(filename, clean_lines, linenum, - function_state, error): - """Reports for long function bodies. - - For an overview why this is done, see: - https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions - - Uses a simplistic algorithm assuming other style guidelines - (especially spacing) are followed. - Only checks unindented functions, so class members are unchecked. - Trivial bodies are unchecked, so constructors with huge initializer lists - may be missed. - Blank/comment lines are not counted so as to avoid encouraging the removal - of vertical space and comments just to get through a lint check. - NOLINT *on the last line of a function* disables this check. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - function_state: Current function name and lines in body so far. - error: The function to call with any errors found. - """ - lines = clean_lines.lines - line = lines[linenum] - joined_line = '' - - starting_func = False - regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... - match_result = Match(regexp, line) - if match_result: - # If the name is all caps and underscores, figure it's a macro and - # ignore it, unless it's TEST or TEST_F. 
- function_name = match_result.group(1).split()[-1] - if function_name == 'TEST' or function_name == 'TEST_F' or ( - not Match(r'[A-Z_]+$', function_name)): - starting_func = True - - if starting_func: - body_found = False - for start_linenum in xrange(linenum, clean_lines.NumLines()): - start_line = lines[start_linenum] - joined_line += ' ' + start_line.lstrip() - if Search(r'(;|})', start_line): # Declarations and trivial functions - body_found = True - break # ... ignore - if Search(r'{', start_line): - body_found = True - function = Search(r'((\w|:)*)\(', line).group(1) - if Match(r'TEST', function): # Handle TEST... macros - parameter_regexp = Search(r'(\(.*\))', joined_line) - if parameter_regexp: # Ignore bad syntax - function += parameter_regexp.group(1) - else: - function += '()' - function_state.Begin(function) - break - if not body_found: - # No body for the function (or evidence of a non-function) was found. - error(filename, linenum, 'readability/fn_size', 5, - 'Lint failed to find start of function body.') - elif Match(r'^\}\s*$', line): # function end - function_state.Check(error, filename, linenum) - function_state.End() - elif not Match(r'^\s*$', line): - function_state.Count() # Count non-blank/non-comment lines. - - -_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') - - -def CheckComment(line, filename, linenum, next_line_start, error): - """Checks for common mistakes in comments. - - Args: - line: The line in question. - filename: The name of the current file. - linenum: The number of the line to check. - next_line_start: The first non-whitespace column of the next line. - error: The function to call with any errors found. - """ - commentpos = line.find('//') - if commentpos != -1: - # Check if the // may be in quotes. 
If so, ignore it - if re.sub(r'\\.', '', line[0:commentpos]).count('"') % 2 == 0: - # Allow one space for new scopes, two spaces otherwise: - if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and - ((commentpos >= 1 and - line[commentpos-1] not in string.whitespace) or - (commentpos >= 2 and - line[commentpos-2] not in string.whitespace))): - error(filename, linenum, 'whitespace/comments', 2, - 'At least two spaces is best between code and comments') - - # Checks for common mistakes in TODO comments. - comment = line[commentpos:] - match = _RE_PATTERN_TODO.match(comment) - if match: - # One whitespace is correct; zero whitespace is handled elsewhere. - leading_whitespace = match.group(1) - if len(leading_whitespace) > 1: - error(filename, linenum, 'whitespace/todo', 2, - 'Too many spaces before TODO') - - username = match.group(2) - if not username: - error(filename, linenum, 'readability/todo', 2, - 'Missing username in TODO; it should look like ' - '"// TODO(my_username): Stuff."') - - middle_whitespace = match.group(3) - # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison - if middle_whitespace != ' ' and middle_whitespace != '': - error(filename, linenum, 'whitespace/todo', 2, - 'TODO(my_username) should be followed by a space') - - # If the comment contains an alphanumeric character, there - # should be a space somewhere between it and the // unless - # it's a /// or //! Doxygen comment. - if (Match(r'//[^ ]*\w', comment) and - not Match(r'(///|//\!)(\s+|$)', comment)): - error(filename, linenum, 'whitespace/comments', 4, - 'Should have a space between // and comment') - - -def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): - """Checks for the correctness of various spacing issues in the code. 
- - Things we check for: spaces around operators, spaces after - if/for/while/switch, no spaces around parens in function calls, two - spaces between code and comment, don't start a block with a blank - line, don't end a function with a blank line, don't add a blank line - after public/protected/private, don't have too many blank lines in a row. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - - # Don't use "elided" lines here, otherwise we can't check commented lines. - # Don't want to use "raw" either, because we don't want to check inside C++11 - # raw strings, - raw = clean_lines.lines_without_raw_strings - line = raw[linenum] - - # Before nixing comments, check if the line is blank for no good - # reason. This includes the first line after a block is opened, and - # blank lines at the end of a function (ie, right before a line like '}' - # - # Skip all the blank line checks if we are immediately inside a - # namespace body. In other words, don't issue blank line warnings - # for this block: - # namespace { - # - # } - # - # A warning about missing end of namespace comments will be issued instead. - # - # Also skip blank line checks for 'extern "C"' blocks, which are formatted - # like namespaces. - if (IsBlankLine(line) and - not nesting_state.InNamespaceBody() and - not nesting_state.InExternC()): - elided = clean_lines.elided - prev_line = elided[linenum - 1] - prevbrace = prev_line.rfind('{') - # TODO(unknown): Don't complain if line before blank line, and line after, - # both start with alnums and are indented the same amount. - # This ignores whitespace at the start of a namespace block - # because those are not usually indented. 
- if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: - # OK, we have a blank line at the start of a code block. Before we - # complain, we check if it is an exception to the rule: The previous - # non-empty line has the parameters of a function header that are indented - # 4 spaces (because they did not fit in a 80 column line when placed on - # the same line as the function name). We also check for the case where - # the previous line is indented 6 spaces, which may happen when the - # initializers of a constructor do not fit into a 80 column line. - exception = False - if Match(r' {6}\w', prev_line): # Initializer list? - # We are looking for the opening column of initializer list, which - # should be indented 4 spaces to cause 6 space indentation afterwards. - search_position = linenum-2 - while (search_position >= 0 - and Match(r' {6}\w', elided[search_position])): - search_position -= 1 - exception = (search_position >= 0 - and elided[search_position][:5] == ' :') - else: - # Search for the function arguments or an initializer list. We use a - # simple heuristic here: If the line is indented 4 spaces; and we have a - # closing paren, without the opening paren, followed by an opening brace - # or colon (for initializer lists) we assume that it is the last line of - # a function header. If we have a colon indented 4 spaces, it is an - # initializer list. 
- exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', - prev_line) - or Match(r' {4}:', prev_line)) - - if not exception: - error(filename, linenum, 'whitespace/blank_line', 2, - 'Redundant blank line at the start of a code block ' - 'should be deleted.') - # Ignore blank lines at the end of a block in a long if-else - # chain, like this: - # if (condition1) { - # // Something followed by a blank line - # - # } else if (condition2) { - # // Something else - # } - if linenum + 1 < clean_lines.NumLines(): - next_line = raw[linenum + 1] - if (next_line - and Match(r'\s*}', next_line) - and next_line.find('} else ') == -1): - error(filename, linenum, 'whitespace/blank_line', 3, - 'Redundant blank line at the end of a code block ' - 'should be deleted.') - - matched = Match(r'\s*(public|protected|private):', prev_line) - if matched: - error(filename, linenum, 'whitespace/blank_line', 3, - 'Do not leave a blank line after "%s:"' % matched.group(1)) - - # Next, check comments - next_line_start = 0 - if linenum + 1 < clean_lines.NumLines(): - next_line = raw[linenum + 1] - next_line_start = len(next_line) - len(next_line.lstrip()) - CheckComment(line, filename, linenum, next_line_start, error) - - # get rid of comments and strings - line = clean_lines.elided[linenum] - - # You shouldn't have spaces before your brackets, except for C++11 attributes - # or maybe after 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'. - if (Search(r'\w\s+\[(?!\[)', line) and - not Search(r'(?:auto&?|delete|return)\s+\[', line)): - error(filename, linenum, 'whitespace/braces', 5, - 'Extra space before [') - - # In range-based for, we wanted spaces before and after the colon, but - # not around "::" tokens that might appear. 
- if (Search(r'for *\(.*[^:]:[^: ]', line) or - Search(r'for *\(.*[^: ]:[^:]', line)): - error(filename, linenum, 'whitespace/forcolon', 2, - 'Missing space around colon in range-based for loop') - - -def CheckOperatorSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing around operators. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Don't try to do spacing checks for operator methods. Do this by - # replacing the troublesome characters with something else, - # preserving column position for all other characters. - # - # The replacement is done repeatedly to avoid false positives from - # operators that call operators. - while True: - match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line) - if match: - line = match.group(1) + ('_' * len(match.group(2))) + match.group(3) - else: - break - - # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". - # Otherwise not. Note we only check for non-spaces on *both* sides; - # sometimes people put non-spaces on one side when aligning ='s among - # many lines (not that this is behavior that I approve of...) - if ((Search(r'[\w.]=', line) or - Search(r'=[\w.]', line)) - and not Search(r'\b(if|while|for) ', line) - # Operators taken from [lex.operators] in C++11 standard. - and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) - and not Search(r'operator=', line)): - error(filename, linenum, 'whitespace/operators', 4, - 'Missing spaces around =') - - # It's ok not to have spaces around binary operators like + - * /, but if - # there's too little whitespace, we get concerned. It's hard to tell, - # though, so we punt on this one for now. TODO. - - # You should always have whitespace around binary operators. 
- # - # Check <= and >= first to avoid false positives with < and >, then - # check non-include lines for spacing around < and >. - # - # If the operator is followed by a comma, assume it's be used in a - # macro context and don't do any checks. This avoids false - # positives. - # - # Note that && is not included here. This is because there are too - # many false positives due to RValue references. - match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line) - if match: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around %s' % match.group(1)) - elif not Match(r'#.*include', line): - # Look for < that is not surrounded by spaces. This is only - # triggered if both sides are missing spaces, even though - # technically it should flag if at least one side is missing a - # space. This is done to avoid some false positives with shifts. - match = Match(r'^(.*[^\s<])<[^\s=<,]', line) - if match: - (_, _, end_pos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - if end_pos <= -1: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <') - - # Look for > that is not surrounded by spaces. Similar to the - # above, we only trigger if both sides are missing spaces to avoid - # false positives with shifts. - match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) - if match: - (_, _, start_pos) = ReverseCloseExpression( - clean_lines, linenum, len(match.group(1))) - if start_pos <= -1: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >') - - # We allow no-spaces around << when used like this: 10<<20, but - # not otherwise (particularly, not when used as streams) - # - # We also allow operators following an opening parenthesis, since - # those tend to be macros that deal with operators. 
- match = Search(r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line) - if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and - not (match.group(1) == 'operator' and match.group(2) == ';')): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <<') - - # We allow no-spaces around >> for almost anything. This is because - # C++11 allows ">>" to close nested templates, which accounts for - # most cases when ">>" is not followed by a space. - # - # We still warn on ">>" followed by alpha character, because that is - # likely due to ">>" being used for right shifts, e.g.: - # value >> alpha - # - # When ">>" is used to close templates, the alphanumeric letter that - # follows would be part of an identifier, and there should still be - # a space separating the template type and the identifier. - # type> alpha - match = Search(r'>>[a-zA-Z_]', line) - if match: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >>') - - # There shouldn't be space around unary operators - match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) - if match: - error(filename, linenum, 'whitespace/operators', 4, - 'Extra space for operator %s' % match.group(1)) - - -def CheckParenthesisSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing around parentheses. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. 
- """ - line = clean_lines.elided[linenum] - - # No spaces after an if, while, switch, or for - match = Search(r' (if\(|for\(|while\(|switch\()', line) - if match: - error(filename, linenum, 'whitespace/parens', 5, - 'Missing space before ( in %s' % match.group(1)) - - # For if/for/while/switch, the left and right parens should be - # consistent about how many spaces are inside the parens, and - # there should either be zero or one spaces inside the parens. - # We don't want: "if ( foo)" or "if ( foo )". - # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. - match = Search(r'\b(if|for|while|switch)\s*' - r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', - line) - if match: - if len(match.group(2)) != len(match.group(4)): - if not (match.group(3) == ';' and - len(match.group(2)) == 1 + len(match.group(4)) or - not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): - error(filename, linenum, 'whitespace/parens', 5, - 'Mismatching spaces inside () in %s' % match.group(1)) - if len(match.group(2)) not in [0, 1]: - error(filename, linenum, 'whitespace/parens', 5, - 'Should have zero or one spaces inside ( and ) in %s' % - match.group(1)) - - -def CheckCommaSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing near commas and semicolons. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - raw = clean_lines.lines_without_raw_strings - line = clean_lines.elided[linenum] - - # You should always have a space after a comma (either as fn arg or operator) - # - # This does not apply when the non-space character following the - # comma is another comma, since the only time when that happens is - # for empty macro arguments. 
- # - # We run this check in two passes: first pass on elided lines to - # verify that lines contain missing whitespaces, second pass on raw - # lines to confirm that those missing whitespaces are not due to - # elided comments. - if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and - Search(r',[^,\s]', raw[linenum])): - error(filename, linenum, 'whitespace/comma', 3, - 'Missing space after ,') - - # You should always have a space after a semicolon - # except for few corner cases - # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more - # space after ; - if Search(r';[^\s};\\)/]', line): - error(filename, linenum, 'whitespace/semicolon', 3, - 'Missing space after ;') - - -def _IsType(clean_lines, nesting_state, expr): - """Check if expression looks like a type name, returns true if so. - - Args: - clean_lines: A CleansedLines instance containing the file. - nesting_state: A NestingState instance which maintains information about - the current stack of nested blocks being parsed. - expr: The expression to check. - Returns: - True, if token looks like a type. - """ - # Keep only the last token in the expression - last_word = Match(r'^.*(\b\S+)$', expr) - if last_word: - token = last_word.group(1) - else: - token = expr - - # Match native types and stdint types - if _TYPES.match(token): - return True - - # Try a bit harder to match templated types. Walk up the nesting - # stack until we find something that resembles a typename - # declaration for what we are looking for. - typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) + - r'\b') - block_index = len(nesting_state.stack) - 1 - while block_index >= 0: - if isinstance(nesting_state.stack[block_index], _NamespaceInfo): - return False - - # Found where the opening brace is. We want to scan from this - # line up to the beginning of the function, minus a few lines. - # template - # class C - # : public ... 
{ // start scanning here - last_line = nesting_state.stack[block_index].starting_linenum - - next_block_start = 0 - if block_index > 0: - next_block_start = nesting_state.stack[block_index - 1].starting_linenum - first_line = last_line - while first_line >= next_block_start: - if clean_lines.elided[first_line].find('template') >= 0: - break - first_line -= 1 - if first_line < next_block_start: - # Didn't find any "template" keyword before reaching the next block, - # there are probably no template things to check for this block - block_index -= 1 - continue - - # Look for typename in the specified range - for i in xrange(first_line, last_line + 1, 1): - if Search(typename_pattern, clean_lines.elided[i]): - return True - block_index -= 1 - - return False - - -def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error): - """Checks for horizontal spacing near commas. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Except after an opening paren, or after another opening brace (in case of - # an initializer list, for instance), you should have spaces before your - # braces when they are delimiting blocks, classes, namespaces etc. - # And since you should never have braces at the beginning of a line, - # this is an easy test. Except that braces used for initialization don't - # follow the same rule; we often don't want spaces before those. - match = Match(r'^(.*[^ ({>]){', line) - - if match: - # Try a bit harder to check for brace initialization. This - # happens in one of the following forms: - # Constructor() : initializer_list_{} { ... 
} - # Constructor{}.MemberFunction() - # Type variable{}; - # FunctionCall(type{}, ...); - # LastArgument(..., type{}); - # LOG(INFO) << type{} << " ..."; - # map_of_type[{...}] = ...; - # ternary = expr ? new type{} : nullptr; - # OuterTemplate{}> - # - # We check for the character following the closing brace, and - # silence the warning if it's one of those listed above, i.e. - # "{.;,)<>]:". - # - # To account for nested initializer list, we allow any number of - # closing braces up to "{;,)<". We can't simply silence the - # warning on first sight of closing brace, because that would - # cause false negatives for things that are not initializer lists. - # Silence this: But not this: - # Outer{ if (...) { - # Inner{...} if (...){ // Missing space before { - # }; } - # - # There is a false negative with this approach if people inserted - # spurious semicolons, e.g. "if (cond){};", but we will catch the - # spurious semicolon with a separate check. - leading_text = match.group(1) - (endline, endlinenum, endpos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - trailing_text = '' - if endpos > -1: - trailing_text = endline[endpos:] - for offset in xrange(endlinenum + 1, - min(endlinenum + 3, clean_lines.NumLines() - 1)): - trailing_text += clean_lines.elided[offset] - # We also suppress warnings for `uint64_t{expression}` etc., as the style - # guide recommends brace initialization for integral types to avoid - # overflow/truncation. - if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text) - and not _IsType(clean_lines, nesting_state, leading_text)): - error(filename, linenum, 'whitespace/braces', 5, - 'Missing space before {') - - # Make sure '} else {' has spaces. - if Search(r'}else', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Missing space before else') - - # You shouldn't have a space before a semicolon at the end of the line. - # There's a special case for "for" since the style guide allows space before - # the semicolon there. 
- if Search(r':\s*;\s*$', line): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Semicolon defining empty statement. Use {} instead.') - elif Search(r'^\s*;\s*$', line): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Line contains only semicolon. If this should be an empty statement, ' - 'use {} instead.') - elif (Search(r'\s+;\s*$', line) and - not Search(r'\bfor\b', line)): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Extra space before last semicolon. If this should be an empty ' - 'statement, use {} instead.') - - -def IsDecltype(clean_lines, linenum, column): - """Check if the token ending on (linenum, column) is decltype(). - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: the number of the line to check. - column: end column of the token to check. - Returns: - True if this token is decltype() expression, False otherwise. - """ - (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) - if start_col < 0: - return False - if Search(r'\bdecltype\s*$', text[0:start_col]): - return True - return False - -def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): - """Checks for additional blank line issues related to sections. - - Currently the only thing checked here is blank line before protected/private. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - class_info: A _ClassInfo objects. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - # Skip checks if the class is small, where small means 25 lines or less. - # 25 lines seems like a good cutoff since that's the usual height of - # terminals, and any class that can't fit in one screen can't really - # be considered "small". - # - # Also skip checks if we are on the first line. This accounts for - # classes that look like - # class Foo { public: ... 
}; - # - # If we didn't find the end of the class, last_line would be zero, - # and the check will be skipped by the first condition. - if (class_info.last_line - class_info.starting_linenum <= 24 or - linenum <= class_info.starting_linenum): - return - - matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) - if matched: - # Issue warning if the line before public/protected/private was - # not a blank line, but don't do this if the previous line contains - # "class" or "struct". This can happen two ways: - # - We are at the beginning of the class. - # - We are forward-declaring an inner class that is semantically - # private, but needed to be public for implementation reasons. - # Also ignores cases where the previous line ends with a backslash as can be - # common when defining classes in C macros. - prev_line = clean_lines.lines[linenum - 1] - if (not IsBlankLine(prev_line) and - not Search(r'\b(class|struct)\b', prev_line) and - not Search(r'\\$', prev_line)): - # Try a bit harder to find the beginning of the class. This is to - # account for multi-line base-specifier lists, e.g.: - # class Derived - # : public Base { - end_class_head = class_info.starting_linenum - for i in range(class_info.starting_linenum, linenum): - if Search(r'\{\s*$', clean_lines.lines[i]): - end_class_head = i - break - if end_class_head < linenum - 1: - error(filename, linenum, 'whitespace/blank_line', 3, - '"%s:" should be preceded by a blank line' % matched.group(1)) - - -def GetPreviousNonBlankLine(clean_lines, linenum): - """Return the most recent non-blank line and its line number. - - Args: - clean_lines: A CleansedLines instance containing the file contents. - linenum: The number of the line to check. - - Returns: - A tuple with two elements. The first element is the contents of the last - non-blank line before the current line, or the empty string if this is the - first non-blank line. 
The second is the line number of that line, or -1 - if this is the first non-blank line. - """ - - prevlinenum = linenum - 1 - while prevlinenum >= 0: - prevline = clean_lines.elided[prevlinenum] - if not IsBlankLine(prevline): # if not a blank line... - return (prevline, prevlinenum) - prevlinenum -= 1 - return ('', -1) - - -def CheckBraces(filename, clean_lines, linenum, error): - """Looks for misplaced braces (e.g. at the end of line). - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - - line = clean_lines.elided[linenum] # get rid of comments and strings - - if Match(r'\s*{\s*$', line): - # We allow an open brace to start a line in the case where someone is using - # braces in a block to explicitly create a new scope, which is commonly used - # to control the lifetime of stack-allocated variables. Braces are also - # used for brace initializers inside function calls. We don't detect this - # perfectly: we just don't complain if the last non-whitespace character on - # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the - # previous line starts a preprocessor block. We also allow a brace on the - # following line if it is part of an array initialization and would not fit - # within the 80 character limit of the preceding line. - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if (not Search(r'[,;:}{(]\s*$', prevline) and - not Match(r'\s*#', prevline) and - not (GetLineWidth(prevline) > _line_length - 2 and '[]' in prevline)): - error(filename, linenum, 'whitespace/braces', 4, - '{ should almost always be at the end of the previous line') - - # An else clause should be on the same line as the preceding closing brace. 
- if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if Match(r'\s*}\s*$', prevline): - error(filename, linenum, 'whitespace/newline', 4, - 'An else should appear on the same line as the preceding }') - - # If braces come on one side of an else, they should be on both. - # However, we have to worry about "else if" that spans multiple lines! - if Search(r'else if\s*\(', line): # could be multi-line if - brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) - # find the ( after the if - pos = line.find('else if') - pos = line.find('(', pos) - if pos > 0: - (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) - brace_on_right = endline[endpos:].find('{') != -1 - if brace_on_left != brace_on_right: # must be brace after if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - - # Likewise, an else should never have the else clause on the same line - if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): - error(filename, linenum, 'whitespace/newline', 4, - 'Else clause should never be on same line as else (use 2 lines)') - - # In the same way, a do/while should never be on one line - if Match(r'\s*do [^\s{]', line): - error(filename, linenum, 'whitespace/newline', 4, - 'do/while clauses should not be on a single line') - - # Check single-line if/else bodies. The style guide says 'curly braces are not - # required for single-line statements'. We additionally allow multi-line, - # single statements, but we reject anything with more than one semicolon in - # it. 
This means that the first semicolon after the if should be at the end of - # its line, and the line after that should have an indent level equal to or - # lower than the if. We also check for ambiguous if/else nesting without - # braces. - if_else_match = Search(r'\b(if\s*(|constexpr)\s*\(|else\b)', line) - if if_else_match and not Match(r'\s*#', line): - if_indent = GetIndentLevel(line) - endline, endlinenum, endpos = line, linenum, if_else_match.end() - if_match = Search(r'\bif\s*(|constexpr)\s*\(', line) - if if_match: - # This could be a multiline if condition, so find the end first. - pos = if_match.end() - 1 - (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos) - # Check for an opening brace, either directly after the if or on the next - # line. If found, this isn't a single-statement conditional. - if (not Match(r'\s*{', endline[endpos:]) - and not (Match(r'\s*$', endline[endpos:]) - and endlinenum < (len(clean_lines.elided) - 1) - and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): - while (endlinenum < len(clean_lines.elided) - and ';' not in clean_lines.elided[endlinenum][endpos:]): - endlinenum += 1 - endpos = 0 - if endlinenum < len(clean_lines.elided): - endline = clean_lines.elided[endlinenum] - # We allow a mix of whitespace and closing braces (e.g. for one-liner - # methods) and a single \ after the semicolon (for macros) - endpos = endline.find(';') - if not Match(r';[\s}]*(\\?)$', endline[endpos:]): - # Semicolon isn't the last character, there's something trailing. - # Output a warning if the semicolon is not contained inside - # a lambda expression. 
- if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', - endline): - error(filename, linenum, 'readability/braces', 4, - 'If/else bodies with multiple statements require braces') - elif endlinenum < len(clean_lines.elided) - 1: - # Make sure the next line is dedented - next_line = clean_lines.elided[endlinenum + 1] - next_indent = GetIndentLevel(next_line) - # With ambiguous nested if statements, this will error out on the - # if that *doesn't* match the else, regardless of whether it's the - # inner one or outer one. - if (if_match and Match(r'\s*else\b', next_line) - and next_indent != if_indent): - error(filename, linenum, 'readability/braces', 4, - 'Else clause should be indented at the same level as if. ' - 'Ambiguous nested if/else chains require braces.') - elif next_indent > if_indent: - error(filename, linenum, 'readability/braces', 4, - 'If/else bodies with multiple statements require braces') - - -def CheckTrailingSemicolon(filename, clean_lines, linenum, error): - """Looks for redundant trailing semicolon. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - - line = clean_lines.elided[linenum] - - # Block bodies should not be followed by a semicolon. Due to C++11 - # brace initialization, there are more places where semicolons are - # required than not, so we explicitly list the allowed rules rather - # than listing the disallowed ones. These are the places where "};" - # should be replaced by just "}": - # 1. Some flavor of block following closing parenthesis: - # for (;;) {}; - # while (...) {}; - # switch (...) {}; - # Function(...) {}; - # if (...) {}; - # if (...) else if (...) {}; - # - # 2. else block: - # if (...) else {}; - # - # 3. const member function: - # Function(...) const {}; - # - # 4. Block following some statement: - # x = 42; - # {}; - # - # 5. 
Block at the beginning of a function: - # Function(...) { - # {}; - # } - # - # Note that naively checking for the preceding "{" will also match - # braces inside multi-dimensional arrays, but this is fine since - # that expression will not contain semicolons. - # - # 6. Block following another block: - # while (true) {} - # {}; - # - # 7. End of namespaces: - # namespace {}; - # - # These semicolons seems far more common than other kinds of - # redundant semicolons, possibly due to people converting classes - # to namespaces. For now we do not warn for this case. - # - # Try matching case 1 first. - match = Match(r'^(.*\)\s*)\{', line) - if match: - # Matched closing parenthesis (case 1). Check the token before the - # matching opening parenthesis, and don't warn if it looks like a - # macro. This avoids these false positives: - # - macro that defines a base class - # - multi-line macro that defines a base class - # - macro that defines the whole class-head - # - # But we still issue warnings for macros that we know are safe to - # warn, specifically: - # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P - # - TYPED_TEST - # - INTERFACE_DEF - # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: - # - # We implement a list of safe macros instead of a list of - # unsafe macros, even though the latter appears less frequently in - # google code and would have been easier to implement. This is because - # the downside for getting the allowed checks wrong means some extra - # semicolons, while the downside for getting disallowed checks wrong - # would result in compile errors. 
- # - # In addition to macros, we also don't want to warn on - # - Compound literals - # - Lambdas - # - alignas specifier with anonymous structs - # - decltype - closing_brace_pos = match.group(1).rfind(')') - opening_parenthesis = ReverseCloseExpression( - clean_lines, linenum, closing_brace_pos) - if opening_parenthesis[2] > -1: - line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] - macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix) - func = Match(r'^(.*\])\s*$', line_prefix) - if ((macro and - macro.group(1) not in ( - 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', - 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', - 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or - (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or - Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or - Search(r'\bdecltype$', line_prefix) or - Search(r'\s+=\s*$', line_prefix)): - match = None - if (match and - opening_parenthesis[1] > 1 and - Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): - # Multi-line lambda-expression - match = None - - else: - # Try matching cases 2-3. - match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) - if not match: - # Try matching cases 4-6. These are always matched on separate lines. - # - # Note that we can't simply concatenate the previous line to the - # current line and do a single match, otherwise we may output - # duplicate warnings for the blank line case: - # if (cond) { - # // blank line - # } - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if prevline and Search(r'[;{}]\s*$', prevline): - match = Match(r'^(\s*)\{', line) - - # Check matching closing brace - if match: - (endline, endlinenum, endpos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - if endpos > -1 and Match(r'^\s*;', endline[endpos:]): - # Current {} pair is eligible for semicolon check, and we have found - # the redundant semicolon, output warning here. 
- # - # Note: because we are scanning forward for opening braces, and - # outputting warnings for the matching closing brace, if there are - # nested blocks with trailing semicolons, we will get the error - # messages in reversed order. - - # We need to check the line forward for NOLINT - raw_lines = clean_lines.raw_lines - ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1, - error) - ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum, - error) - - error(filename, endlinenum, 'readability/braces', 4, - "You don't need a ; after a }") - - -def CheckEmptyBlockBody(filename, clean_lines, linenum, error): - """Look for empty loop/conditional body with only a single semicolon. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - - # Search for loop keywords at the beginning of the line. Because only - # whitespaces are allowed before the keywords, this will also ignore most - # do-while-loops, since those lines should start with closing brace. - # - # We also check "if" blocks here, since an empty conditional block - # is likely an error. - line = clean_lines.elided[linenum] - matched = Match(r'\s*(for|while|if)\s*\(', line) - if matched: - # Find the end of the conditional expression. - (end_line, end_linenum, end_pos) = CloseExpression( - clean_lines, linenum, line.find('(')) - - # Output warning if what follows the condition expression is a semicolon. - # No warning for all other cases, including whitespace or newline, since we - # have a separate check for semicolons preceded by whitespace. 
- if end_pos >= 0 and Match(r';', end_line[end_pos:]): - if matched.group(1) == 'if': - error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, - 'Empty conditional bodies should use {}') - else: - error(filename, end_linenum, 'whitespace/empty_loop_body', 5, - 'Empty loop bodies should use {} or continue') - - # Check for if statements that have completely empty bodies (no comments) - # and no else clauses. - if end_pos >= 0 and matched.group(1) == 'if': - # Find the position of the opening { for the if statement. - # Return without logging an error if it has no brackets. - opening_linenum = end_linenum - opening_line_fragment = end_line[end_pos:] - # Loop until EOF or find anything that's not whitespace or opening {. - while not Search(r'^\s*\{', opening_line_fragment): - if Search(r'^(?!\s*$)', opening_line_fragment): - # Conditional has no brackets. - return - opening_linenum += 1 - if opening_linenum == len(clean_lines.elided): - # Couldn't find conditional's opening { or any code before EOF. - return - opening_line_fragment = clean_lines.elided[opening_linenum] - # Set opening_line (opening_line_fragment may not be entire opening line). - opening_line = clean_lines.elided[opening_linenum] - - # Find the position of the closing }. - opening_pos = opening_line_fragment.find('{') - if opening_linenum == end_linenum: - # We need to make opening_pos relative to the start of the entire line. - opening_pos += end_pos - (closing_line, closing_linenum, closing_pos) = CloseExpression( - clean_lines, opening_linenum, opening_pos) - if closing_pos < 0: - return - - # Now construct the body of the conditional. This consists of the portion - # of the opening line after the {, all lines until the closing line, - # and the portion of the closing line before the }. - if (clean_lines.raw_lines[opening_linenum] != - CleanseComments(clean_lines.raw_lines[opening_linenum])): - # Opening line ends with a comment, so conditional isn't empty. 
- return - if closing_linenum > opening_linenum: - # Opening line after the {. Ignore comments here since we checked above. - bodylist = list(opening_line[opening_pos+1:]) - # All lines until closing line, excluding closing line, with comments. - bodylist.extend(clean_lines.raw_lines[opening_linenum+1:closing_linenum]) - # Closing line before the }. Won't (and can't) have comments. - bodylist.append(clean_lines.elided[closing_linenum][:closing_pos-1]) - body = '\n'.join(bodylist) - else: - # If statement has brackets and fits on a single line. - body = opening_line[opening_pos+1:closing_pos-1] - - # Check if the body is empty - if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body): - return - # The body is empty. Now make sure there's not an else clause. - current_linenum = closing_linenum - current_line_fragment = closing_line[closing_pos:] - # Loop until EOF or find anything that's not whitespace or else clause. - while Search(r'^\s*$|^(?=\s*else)', current_line_fragment): - if Search(r'^(?=\s*else)', current_line_fragment): - # Found an else clause, so don't log an error. - return - current_linenum += 1 - if current_linenum == len(clean_lines.elided): - break - current_line_fragment = clean_lines.elided[current_linenum] - - # The body is empty and there's no else clause until EOF or other code. - error(filename, end_linenum, 'whitespace/empty_if_body', 4, - ('If statement had no body and no else clause')) - - -def FindCheckMacro(line): - """Find a replaceable CHECK-like macro. - - Args: - line: line to search on. - Returns: - (macro name, start position), or (None, -1) if no replaceable - macro is found. - """ - for macro in _CHECK_MACROS: - i = line.find(macro) - if i >= 0: - # Find opening parenthesis. Do a regular expression match here - # to make sure that we are matching the expected CHECK macro, as - # opposed to some other macro that happens to contain the CHECK - # substring. 
- matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) - if not matched: - continue - return (macro, len(matched.group(1))) - return (None, -1) - - -def CheckCheck(filename, clean_lines, linenum, error): - """Checks the use of CHECK and EXPECT macros. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - - # Decide the set of replacement macros that should be suggested - lines = clean_lines.elided - (check_macro, start_pos) = FindCheckMacro(lines[linenum]) - if not check_macro: - return - - # Find end of the boolean expression by matching parentheses - (last_line, end_line, end_pos) = CloseExpression( - clean_lines, linenum, start_pos) - if end_pos < 0: - return - - # If the check macro is followed by something other than a - # semicolon, assume users will log their own custom error messages - # and don't suggest any replacements. - if not Match(r'\s*;', last_line[end_pos:]): - return - - if linenum == end_line: - expression = lines[linenum][start_pos + 1:end_pos - 1] - else: - expression = lines[linenum][start_pos + 1:] - for i in xrange(linenum + 1, end_line): - expression += lines[i] - expression += last_line[0:end_pos - 1] - - # Parse expression so that we can take parentheses into account. - # This avoids false positives for inputs like "CHECK((a < 4) == b)", - # which is not replaceable by CHECK_LE. 
- lhs = '' - rhs = '' - operator = None - while expression: - matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' - r'==|!=|>=|>|<=|<|\()(.*)$', expression) - if matched: - token = matched.group(1) - if token == '(': - # Parenthesized operand - expression = matched.group(2) - (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) - if end < 0: - return # Unmatched parenthesis - lhs += '(' + expression[0:end] - expression = expression[end:] - elif token in ('&&', '||'): - # Logical and/or operators. This means the expression - # contains more than one term, for example: - # CHECK(42 < a && a < b); - # - # These are not replaceable with CHECK_LE, so bail out early. - return - elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): - # Non-relational operator - lhs += token - expression = matched.group(2) - else: - # Relational operator - operator = token - rhs = matched.group(2) - break - else: - # Unparenthesized operand. Instead of appending to lhs one character - # at a time, we do another regular expression match to consume several - # characters at once if possible. Trivial benchmark shows that this - # is more efficient when the operands are longer than a single - # character, which is generally the case. - matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) - if not matched: - matched = Match(r'^(\s*\S)(.*)$', expression) - if not matched: - break - lhs += matched.group(1) - expression = matched.group(2) - - # Only apply checks if we got all parts of the boolean expression - if not (lhs and operator and rhs): - return - - # Check that rhs do not contain logical operators. We already know - # that lhs is fine since the loop above parses out && and ||. - if rhs.find('&&') > -1 or rhs.find('||') > -1: - return - - # At least one of the operands must be a constant literal. 
This is - # to avoid suggesting replacements for unprintable things like - # CHECK(variable != iterator) - # - # The following pattern matches decimal, hex integers, strings, and - # characters (in that order). - lhs = lhs.strip() - rhs = rhs.strip() - match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' - if Match(match_constant, lhs) or Match(match_constant, rhs): - # Note: since we know both lhs and rhs, we can provide a more - # descriptive error message like: - # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) - # Instead of: - # Consider using CHECK_EQ instead of CHECK(a == b) - # - # We are still keeping the less descriptive message because if lhs - # or rhs gets long, the error message might become unreadable. - error(filename, linenum, 'readability/check', 2, - 'Consider using %s instead of %s(a %s b)' % ( - _CHECK_REPLACEMENT[check_macro][operator], - check_macro, operator)) - - -def CheckAltTokens(filename, clean_lines, linenum, error): - """Check alternative keywords being used in boolean expressions. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Avoid preprocessor lines - if Match(r'^\s*#', line): - return - - # Last ditch effort to avoid multi-line comments. This will not help - # if the comment started before the current line or ended after the - # current line, but it catches most of the false positives. At least, - # it provides a way to workaround this warning for people who use - # multi-line comments in preprocessor macros. - # - # TODO(unknown): remove this once cpplint has better support for - # multi-line comments. 
- if line.find('/*') >= 0 or line.find('*/') >= 0: - return - - for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): - error(filename, linenum, 'readability/alt_tokens', 2, - 'Use operator %s instead of %s' % ( - _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) - - -def GetLineWidth(line): - """Determines the width of the line in column positions. - - Args: - line: A string, which may be a Unicode string. - - Returns: - The width of the line in column positions, accounting for Unicode - combining characters and wide characters. - """ - if isinstance(line, unicode): - width = 0 - for uc in unicodedata.normalize('NFC', line): - if unicodedata.east_asian_width(uc) in ('W', 'F'): - width += 2 - elif not unicodedata.combining(uc): - # Issue 337 - # https://mail.python.org/pipermail/python-list/2012-August/628809.html - if (sys.version_info.major, sys.version_info.minor) <= (3, 2): - # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81 - is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4 - # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564 - is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF - if not is_wide_build and is_low_surrogate: - width -= 1 - - width += 1 - return width - else: - return len(line) - - -def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, - error): - """Checks rules from the 'C++ style rules' section of cppguide.html. - - Most of these rules are hard to test (naming, comment style), but we - do what we can. In particular we check for 2-space indents, line lengths, - tab usage, spaces inside code, etc. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - file_extension: The extension (without the dot) of the filename. - nesting_state: A NestingState instance which maintains information about - the current stack of nested blocks being parsed. 
- error: The function to call with any errors found. - """ - - # Don't use "elided" lines here, otherwise we can't check commented lines. - # Don't want to use "raw" either, because we don't want to check inside C++11 - # raw strings, - raw_lines = clean_lines.lines_without_raw_strings - line = raw_lines[linenum] - prev = raw_lines[linenum - 1] if linenum > 0 else '' - - if line.find('\t') != -1: - error(filename, linenum, 'whitespace/tab', 1, - 'Tab found; better to use spaces') - - # One or three blank spaces at the beginning of the line is weird; it's - # hard to reconcile that with 2-space indents. - # NOTE: here are the conditions rob pike used for his tests. Mine aren't - # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces - # if(RLENGTH > 20) complain = 0; - # if(match($0, " +(error|private|public|protected):")) complain = 0; - # if(match(prev, "&& *$")) complain = 0; - # if(match(prev, "\\|\\| *$")) complain = 0; - # if(match(prev, "[\",=><] *$")) complain = 0; - # if(match($0, " <<")) complain = 0; - # if(match(prev, " +for \\(")) complain = 0; - # if(prevodd && match(prevprev, " +for \\(")) complain = 0; - scope_or_label_pattern = r'\s*(?:public|private|protected|signals)(?:\s+(?:slots\s*)?)?:\s*\\?$' - classinfo = nesting_state.InnermostClass() - initial_spaces = 0 - cleansed_line = clean_lines.elided[linenum] - while initial_spaces < len(line) and line[initial_spaces] == ' ': - initial_spaces += 1 - # There are certain situations we allow one space, notably for - # section labels, and also lines containing multi-line raw strings. - # We also don't check for lines that look like continuation lines - # (of lines ending in double quotes, commas, equals, or angle brackets) - # because the rules for how to indent those are non-trivial. 
- if (not Search(r'[",=><] *$', prev) and - (initial_spaces == 1 or initial_spaces == 3) and - not Match(scope_or_label_pattern, cleansed_line) and - not (clean_lines.raw_lines[linenum] != line and - Match(r'^\s*""', line))): - error(filename, linenum, 'whitespace/indent', 3, - 'Weird number of spaces at line-start. ' - 'Are you using a 2-space indent?') - - if line and line[-1].isspace(): - error(filename, linenum, 'whitespace/end_of_line', 4, - 'Line ends in whitespace. Consider deleting these extra spaces.') - - # Check if the line is a header guard. - is_header_guard = False - if IsHeaderExtension(file_extension): - cppvar = GetHeaderGuardCPPVariable(filename) - if (line.startswith('#ifndef %s' % cppvar) or - line.startswith('#define %s' % cppvar) or - line.startswith('#endif // %s' % cppvar)): - is_header_guard = True - # #include lines and header guards can be long, since there's no clean way to - # split them. - # - # URLs can be long too. It's possible to split these, but it makes them - # harder to cut&paste. - # - # The "$Id:...$" comment may also get very long without it being the - # developers fault. - # - # Doxygen documentation copying can get pretty long when using an overloaded - # function declaration - if (not line.startswith('#include') and not is_header_guard and - not Match(r'^\s*//.*http(s?)://\S*$', line) and - not Match(r'^\s*//\s*[^\s]*$', line) and - not Match(r'^// \$Id:.*#[0-9]+ \$$', line) and - not Match(r'^\s*/// [@\\](copydoc|copydetails|copybrief) .*$', line)): - line_width = GetLineWidth(line) - if line_width > _line_length: - error(filename, linenum, 'whitespace/line_length', 2, - 'Lines should be <= %i characters long' % _line_length) - - if (cleansed_line.count(';') > 1 and - # allow simple single line lambdas - not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}\n\r]*\}', - line) and - # for loops are allowed two ;'s (and may run over two lines). 
- cleansed_line.find('for') == -1 and - (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or - GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and - # It's ok to have many commands in a switch case that fits in 1 line - not ((cleansed_line.find('case ') != -1 or - cleansed_line.find('default:') != -1) and - cleansed_line.find('break;') != -1)): - error(filename, linenum, 'whitespace/newline', 0, - 'More than one command on the same line') - - # Some more style checks - CheckBraces(filename, clean_lines, linenum, error) - CheckTrailingSemicolon(filename, clean_lines, linenum, error) - CheckEmptyBlockBody(filename, clean_lines, linenum, error) - CheckSpacing(filename, clean_lines, linenum, nesting_state, error) - CheckOperatorSpacing(filename, clean_lines, linenum, error) - CheckParenthesisSpacing(filename, clean_lines, linenum, error) - CheckCommaSpacing(filename, clean_lines, linenum, error) - CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error) - CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) - CheckCheck(filename, clean_lines, linenum, error) - CheckAltTokens(filename, clean_lines, linenum, error) - classinfo = nesting_state.InnermostClass() - if classinfo: - CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) - - -_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') -# Matches the first component of a filename delimited by -s and _s. That is: -# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' -# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo' -# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo' -# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo' -_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') - - -def _DropCommonSuffixes(filename): - """Drops common suffixes like _test.cc or -inl.h from filename. 
- - For example: - >>> _DropCommonSuffixes('foo/foo-inl.h') - 'foo/foo' - >>> _DropCommonSuffixes('foo/bar/foo.cc') - 'foo/bar/foo' - >>> _DropCommonSuffixes('foo/foo_internal.h') - 'foo/foo' - >>> _DropCommonSuffixes('foo/foo_unusualinternal.h') - 'foo/foo_unusualinternal' - - Args: - filename: The input filename. - - Returns: - The filename with the common suffix removed. - """ - for suffix in itertools.chain( - ('%s.%s' % (test_suffix.lstrip('_'), ext) - for test_suffix, ext in itertools.product(_test_suffixes, GetNonHeaderExtensions())), - ('%s.%s' % (suffix, ext) - for suffix, ext in itertools.product(['inl', 'imp', 'internal'], GetHeaderExtensions()))): - if (filename.endswith(suffix) and len(filename) > len(suffix) and - filename[-len(suffix) - 1] in ('-', '_')): - return filename[:-len(suffix) - 1] - return os.path.splitext(filename)[0] - - -def _ClassifyInclude(fileinfo, include, used_angle_brackets, include_order="default"): - """Figures out what kind of header 'include' is. - - Args: - fileinfo: The current file cpplint is running over. A FileInfo instance. - include: The path to a #included file. - used_angle_brackets: True if the #include used <> rather than "". - include_order: "default" or other value allowed in program arguments - - Returns: - One of the _XXX_HEADER constants. - - For example: - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True) - _C_SYS_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) - _CPP_SYS_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', True, "standardcfirst") - _OTHER_SYS_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) - _LIKELY_MY_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), - ... 'bar/foo_other_ext.h', False) - _POSSIBLE_MY_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) - _OTHER_HEADER - """ - # This is a list of all standard c++ header files, except - # those already checked for above. 
- is_cpp_header = include in _CPP_HEADERS - - # Mark include as C header if in list or in a known folder for standard-ish C headers. - is_std_c_header = (include_order == "default") or (include in _C_HEADERS - # additional linux glibc header folders - or Search(r'(?:%s)\/.*\.h' % "|".join(C_STANDARD_HEADER_FOLDERS), include)) - - # Headers with C++ extensions shouldn't be considered C system headers - include_ext = os.path.splitext(include)[1] - is_system = used_angle_brackets and not include_ext in ['.hh', '.hpp', '.hxx', '.h++'] - - if is_system: - if is_cpp_header: - return _CPP_SYS_HEADER - if is_std_c_header: - return _C_SYS_HEADER - else: - return _OTHER_SYS_HEADER - - # If the target file and the include we're checking share a - # basename when we drop common extensions, and the include - # lives in . , then it's likely to be owned by the target file. - target_dir, target_base = ( - os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) - include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) - target_dir_pub = os.path.normpath(target_dir + '/../public') - target_dir_pub = target_dir_pub.replace('\\', '/') - if target_base == include_base and ( - include_dir == target_dir or - include_dir == target_dir_pub): - return _LIKELY_MY_HEADER - - # If the target and include share some initial basename - # component, it's possible the target is implementing the - # include, so it's allowed to be first, but we'll never - # complain if it's not there. - target_first_component = _RE_FIRST_COMPONENT.match(target_base) - include_first_component = _RE_FIRST_COMPONENT.match(include_base) - if (target_first_component and include_first_component and - target_first_component.group(0) == - include_first_component.group(0)): - return _POSSIBLE_MY_HEADER - - return _OTHER_HEADER - - - -def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): - """Check rules that are applicable to #include lines. 
- - Strings on #include lines are NOT removed from elided line, to make - certain tasks easier. However, to prevent false positives, checks - applicable to #include lines in CheckLanguage must be put here. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - include_state: An _IncludeState instance in which the headers are inserted. - error: The function to call with any errors found. - """ - fileinfo = FileInfo(filename) - line = clean_lines.lines[linenum] - - # "include" should use the new style "foo/bar.h" instead of just "bar.h" - # Only do this check if the included header follows google naming - # conventions. If not, assume that it's a 3rd party API that - # requires special include conventions. - # - # We also make an exception for Lua headers, which follow google - # naming convention but not the include convention. - match = Match(r'#include\s*"([^/]+\.(.*))"', line) - if match: - if (IsHeaderExtension(match.group(2)) and - not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1))): - error(filename, linenum, 'build/include_subdir', 4, - 'Include the directory when naming header files') - - # we shouldn't include a file more than once. actually, there are a - # handful of instances where doing so is okay, but in general it's - # not. - match = _RE_PATTERN_INCLUDE.search(line) - if match: - include = match.group(2) - used_angle_brackets = (match.group(1) == '<') - duplicate_line = include_state.FindHeader(include) - if duplicate_line >= 0: - error(filename, linenum, 'build/include', 4, - '"%s" already included at %s:%s' % - (include, filename, duplicate_line)) - return - - for extension in GetNonHeaderExtensions(): - if (include.endswith('.' + extension) and - os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)): - error(filename, linenum, 'build/include', 4, - 'Do not include .' 
+ extension + ' files from other packages') - return - - # We DO want to include a 3rd party looking header if it matches the - # filename. Otherwise we get an erroneous error "...should include its - # header" error later. - third_src_header = False - for ext in GetHeaderExtensions(): - basefilename = filename[0:len(filename) - len(fileinfo.Extension())] - headerfile = basefilename + '.' + ext - headername = FileInfo(headerfile).RepositoryName() - if headername in include or include in headername: - third_src_header = True - break - - if third_src_header or not _THIRD_PARTY_HEADERS_PATTERN.match(include): - include_state.include_list[-1].append((include, linenum)) - - # We want to ensure that headers appear in the right order: - # 1) for foo.cc, foo.h (preferred location) - # 2) c system files - # 3) cpp system files - # 4) for foo.cc, foo.h (deprecated location) - # 5) other google headers - # - # We classify each include statement as one of those 5 types - # using a number of techniques. The include_state object keeps - # track of the highest type seen, and complains if we see a - # lower type after that. - error_message = include_state.CheckNextIncludeOrder( - _ClassifyInclude(fileinfo, include, used_angle_brackets, _include_order)) - if error_message: - error(filename, linenum, 'build/include_order', 4, - '%s. Should be: %s.h, c system, c++ system, other.' % - (error_message, fileinfo.BaseName())) - canonical_include = include_state.CanonicalizeAlphabeticalOrder(include) - if not include_state.IsInAlphabeticalOrder( - clean_lines, linenum, canonical_include): - error(filename, linenum, 'build/include_alpha', 4, - 'Include "%s" not in alphabetical order' % include) - include_state.SetLastHeader(canonical_include) - - - -def _GetTextInside(text, start_pattern): - r"""Retrieves all the text between matching open and close parentheses. 
- - Given a string of lines and a regular expression string, retrieve all the text - following the expression and between opening punctuation symbols like - (, [, or {, and the matching close-punctuation symbol. This properly nested - occurrences of the punctuations, so for the text like - printf(a(), b(c())); - a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'. - start_pattern must match string having an open punctuation symbol at the end. - - Args: - text: The lines to extract text. Its comments and strings must be elided. - It can be single line and can span multiple lines. - start_pattern: The regexp string indicating where to start extracting - the text. - Returns: - The extracted text. - None if either the opening string or ending punctuation could not be found. - """ - # TODO(unknown): Audit cpplint.py to see what places could be profitably - # rewritten to use _GetTextInside (and use inferior regexp matching today). - - # Give opening punctuations to get the matching close-punctuations. - matching_punctuation = {'(': ')', '{': '}', '[': ']'} - closing_punctuation = set(itervalues(matching_punctuation)) - - # Find the position to start extracting text. - match = re.search(start_pattern, text, re.M) - if not match: # start_pattern not found in text. - return None - start_position = match.end(0) - - assert start_position > 0, ( - 'start_pattern must ends with an opening punctuation.') - assert text[start_position - 1] in matching_punctuation, ( - 'start_pattern must ends with an opening punctuation.') - # Stack of closing punctuations we expect to have in text after position. - punctuation_stack = [matching_punctuation[text[start_position - 1]]] - position = start_position - while punctuation_stack and position < len(text): - if text[position] == punctuation_stack[-1]: - punctuation_stack.pop() - elif text[position] in closing_punctuation: - # A closing punctuation without matching opening punctuations. 
- return None - elif text[position] in matching_punctuation: - punctuation_stack.append(matching_punctuation[text[position]]) - position += 1 - if punctuation_stack: - # Opening punctuations left without matching close-punctuations. - return None - # punctuations match. - return text[start_position:position - 1] - - -# Patterns for matching call-by-reference parameters. -# -# Supports nested templates up to 2 levels deep using this messy pattern: -# < (?: < (?: < [^<>]* -# > -# | [^<>] )* -# > -# | [^<>] )* -# > -_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*' # =~ [[:alpha:]][[:alnum:]]* -_RE_PATTERN_TYPE = ( - r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?' - r'(?:\w|' - r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|' - r'::)+') -# A call-by-reference parameter ends with '& identifier'. -_RE_PATTERN_REF_PARAM = re.compile( - r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*' - r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]') -# A call-by-const-reference parameter either ends with 'const& identifier' -# or looks like 'const type& identifier' when 'type' is atomic. -_RE_PATTERN_CONST_REF_PARAM = ( - r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + - r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') -# Stream types. -_RE_PATTERN_REF_STREAM_PARAM = ( - r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')') - - -def CheckLanguage(filename, clean_lines, linenum, file_extension, - include_state, nesting_state, error): - """Checks rules from the 'C++ language rules' section of cppguide.html. - - Some of these rules are hard to test (function overloading, using - uint32 inappropriately), but we do the best we can. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - file_extension: The extension (without the dot) of the filename. - include_state: An _IncludeState instance in which the headers are inserted. 
- nesting_state: A NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - # If the line is empty or consists of entirely a comment, no need to - # check it. - line = clean_lines.elided[linenum] - if not line: - return - - match = _RE_PATTERN_INCLUDE.search(line) - if match: - CheckIncludeLine(filename, clean_lines, linenum, include_state, error) - return - - # Reset include state across preprocessor directives. This is meant - # to silence warnings for conditional includes. - match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) - if match: - include_state.ResetSection(match.group(1)) - - - # Perform other checks now that we are sure that this is not an include line - CheckCasts(filename, clean_lines, linenum, error) - CheckGlobalStatic(filename, clean_lines, linenum, error) - CheckPrintf(filename, clean_lines, linenum, error) - - if IsHeaderExtension(file_extension): - # TODO(unknown): check that 1-arg constructors are explicit. - # How to tell it's a constructor? - # (handled in CheckForNonStandardConstructs for now) - # TODO(unknown): check that classes declare or disable copy/assign - # (level 1 error) - pass - - # Check if people are using the verboten C basic types. The only exception - # we regularly allow is "unsigned short port" for port. - if Search(r'\bshort port\b', line): - if not Search(r'\bunsigned short port\b', line): - error(filename, linenum, 'runtime/int', 4, - 'Use "unsigned short" for ports, not "short"') - else: - match = Search(r'\b(short|long(?! 
+double)|long long)\b', line) - if match: - error(filename, linenum, 'runtime/int', 4, - 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) - - # Check if some verboten operator overloading is going on - # TODO(unknown): catch out-of-line unary operator&: - # class X {}; - # int operator&(const X& x) { return 42; } // unary operator& - # The trick is it's hard to tell apart from binary operator&: - # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& - if Search(r'\boperator\s*&\s*\(\s*\)', line): - error(filename, linenum, 'runtime/operator', 4, - 'Unary operator& is dangerous. Do not use it.') - - # Check for suspicious usage of "if" like - # } if (a == b) { - if Search(r'\}\s*if\s*\(', line): - error(filename, linenum, 'readability/braces', 4, - 'Did you mean "else if"? If not, start a new line for "if".') - - # Check for potential format string bugs like printf(foo). - # We constrain the pattern not to pick things like DocidForPrintf(foo). - # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) - # TODO(unknown): Catch the following case. Need to change the calling - # convention of the whole function to process multiple line to handle it. - # printf( - # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); - printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') - if printf_args: - match = Match(r'([\w.\->()]+)$', printf_args) - if match and match.group(1) != '__VA_ARGS__': - function_name = re.search(r'\b((?:string)?printf)\s*\(', - line, re.I).group(1) - error(filename, linenum, 'runtime/printf', 4, - 'Potential format string bug. Do %s("%%s", %s) instead.' - % (function_name, match.group(1))) - - # Check for potential memset bugs like memset(buf, sizeof(buf), 0). 
- match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) - if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): - error(filename, linenum, 'runtime/memset', 4, - 'Did you mean "memset(%s, 0, %s)"?' - % (match.group(1), match.group(2))) - - if Search(r'\busing namespace\b', line): - if Search(r'\bliterals\b', line): - error(filename, linenum, 'build/namespaces_literals', 5, - 'Do not use namespace using-directives. ' - 'Use using-declarations instead.') - else: - error(filename, linenum, 'build/namespaces', 5, - 'Do not use namespace using-directives. ' - 'Use using-declarations instead.') - - # Detect variable-length arrays. - match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) - if (match and match.group(2) != 'return' and match.group(2) != 'delete' and - match.group(3).find(']') == -1): - # Split the size using space and arithmetic operators as delimiters. - # If any of the resulting tokens are not compile time constants then - # report the error. - tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) - is_const = True - skip_next = False - for tok in tokens: - if skip_next: - skip_next = False - continue - - if Search(r'sizeof\(.+\)', tok): continue - if Search(r'arraysize\(\w+\)', tok): continue - - tok = tok.lstrip('(') - tok = tok.rstrip(')') - if not tok: continue - if Match(r'\d+', tok): continue - if Match(r'0[xX][0-9a-fA-F]+', tok): continue - if Match(r'k[A-Z0-9]\w*', tok): continue - if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue - if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue - # A catch all for tricky sizeof cases, including 'sizeof expression', - # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' - # requires skipping the next token because we split on ' ' and '*'. - if tok.startswith('sizeof'): - skip_next = True - continue - is_const = False - break - if not is_const: - error(filename, linenum, 'runtime/arrays', 1, - 'Do not use variable-length arrays. 
Use an appropriately named ' - "('k' followed by CamelCase) compile-time constant for the size.") - - # Check for use of unnamed namespaces in header files. Registration - # macros are typically OK, so we allow use of "namespace {" on lines - # that end with backslashes. - if (IsHeaderExtension(file_extension) - and Search(r'\bnamespace\s*{', line) - and line[-1] != '\\'): - error(filename, linenum, 'build/namespaces_headers', 4, - 'Do not use unnamed namespaces in header files. See ' - 'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' - ' for more information.') - - -def CheckGlobalStatic(filename, clean_lines, linenum, error): - """Check for unsafe global or static objects. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Match two lines at a time to support multiline declarations - if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): - line += clean_lines.elided[linenum + 1].strip() - - # Check for people declaring static/global STL strings at the top level. - # This is dangerous because the C++ language does not guarantee that - # globals with constructors are initialized before the first access, and - # also because globals can be destroyed when some threads are still running. - # TODO(unknown): Generalize this to also find static unique_ptr instances. - # TODO(unknown): File bugs for clang-tidy to find these. - match = Match( - r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +' - r'([a-zA-Z0-9_:]+)\b(.*)', - line) - - # Remove false positives: - # - String pointers (as opposed to values). - # string *pointer - # const string *pointer - # string const *pointer - # string *const pointer - # - # - Functions and template specializations. - # string Function(... - # string Class::Method(... 
- # - # - Operators. These are matched separately because operator names - # cross non-word boundaries, and trying to match both operators - # and functions at the same time would decrease accuracy of - # matching identifiers. - # string Class::operator*() - if (match and - not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and - not Search(r'\boperator\W', line) and - not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(4))): - if Search(r'\bconst\b', line): - error(filename, linenum, 'runtime/string', 4, - 'For a static/global string constant, use a C style string ' - 'instead: "%schar%s %s[]".' % - (match.group(1), match.group(2) or '', match.group(3))) - else: - error(filename, linenum, 'runtime/string', 4, - 'Static/global string variables are not permitted.') - - if (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or - Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line)): - error(filename, linenum, 'runtime/init', 4, - 'You seem to be initializing a member variable with itself.') - - -def CheckPrintf(filename, clean_lines, linenum, error): - """Check for printf related issues. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # When snprintf is used, the second argument shouldn't be a literal. - match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) - if match and match.group(2) != '0': - # If 2nd arg is zero, snprintf is used to calculate size. - error(filename, linenum, 'runtime/printf', 3, - 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' - 'to snprintf.' % (match.group(1), match.group(2))) - - # Check if some verboten C functions are being used. - if Search(r'\bsprintf\s*\(', line): - error(filename, linenum, 'runtime/printf', 5, - 'Never use sprintf. 
Use snprintf instead.') - match = Search(r'\b(strcpy|strcat)\s*\(', line) - if match: - error(filename, linenum, 'runtime/printf', 4, - 'Almost always, snprintf is better than %s' % match.group(1)) - - -def IsDerivedFunction(clean_lines, linenum): - """Check if current line contains an inherited function. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - Returns: - True if current line contains a function with "override" - virt-specifier. - """ - # Scan back a few lines for start of current function - for i in xrange(linenum, max(-1, linenum - 10), -1): - match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) - if match: - # Look for "override" after the matching closing parenthesis - line, _, closing_paren = CloseExpression( - clean_lines, i, len(match.group(1))) - return (closing_paren >= 0 and - Search(r'\boverride\b', line[closing_paren:])) - return False - - -def IsOutOfLineMethodDefinition(clean_lines, linenum): - """Check if current line contains an out-of-line method definition. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - Returns: - True if current line contains an out-of-line method definition. - """ - # Scan back a few lines for start of current function - for i in xrange(linenum, max(-1, linenum - 10), -1): - if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): - return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None - return False - - -def IsInitializerList(clean_lines, linenum): - """Check if current line is inside constructor initializer list. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - Returns: - True if current line appears to be inside constructor initializer - list, False otherwise. 
- """ - for i in xrange(linenum, 1, -1): - line = clean_lines.elided[i] - if i == linenum: - remove_function_body = Match(r'^(.*)\{\s*$', line) - if remove_function_body: - line = remove_function_body.group(1) - - if Search(r'\s:\s*\w+[({]', line): - # A lone colon tend to indicate the start of a constructor - # initializer list. It could also be a ternary operator, which - # also tend to appear in constructor initializer lists as - # opposed to parameter lists. - return True - if Search(r'\}\s*,\s*$', line): - # A closing brace followed by a comma is probably the end of a - # brace-initialized member in constructor initializer list. - return True - if Search(r'[{};]\s*$', line): - # Found one of the following: - # - A closing brace or semicolon, probably the end of the previous - # function. - # - An opening brace, probably the start of current class or namespace. - # - # Current line is probably not inside an initializer list since - # we saw one of those things without seeing the starting colon. - return False - - # Got to the beginning of the file without seeing the start of - # constructor initializer list. - return False - - -def CheckForNonConstReference(filename, clean_lines, linenum, - nesting_state, error): - """Check for non-const references. - - Separate from CheckLanguage since it scans backwards from current - line, instead of scanning forward. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - # Do nothing if there is no '&' on current line. 
- line = clean_lines.elided[linenum] - if '&' not in line: - return - - # If a function is inherited, current function doesn't have much of - # a choice, so any non-const references should not be blamed on - # derived function. - if IsDerivedFunction(clean_lines, linenum): - return - - # Don't warn on out-of-line method definitions, as we would warn on the - # in-line declaration, if it isn't marked with 'override'. - if IsOutOfLineMethodDefinition(clean_lines, linenum): - return - - # Long type names may be broken across multiple lines, usually in one - # of these forms: - # LongType - # ::LongTypeContinued &identifier - # LongType:: - # LongTypeContinued &identifier - # LongType< - # ...>::LongTypeContinued &identifier - # - # If we detected a type split across two lines, join the previous - # line to current line so that we can match const references - # accordingly. - # - # Note that this only scans back one line, since scanning back - # arbitrary number of lines would be expensive. If you have a type - # that spans more than 2 lines, please use a typedef. - if linenum > 1: - previous = None - if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line): - # previous_line\n + ::current_line - previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$', - clean_lines.elided[linenum - 1]) - elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line): - # previous_line::\n + current_line - previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$', - clean_lines.elided[linenum - 1]) - if previous: - line = previous.group(1) + line.lstrip() - else: - # Check for templated parameter that is split across multiple lines - endpos = line.rfind('>') - if endpos > -1: - (_, startline, startpos) = ReverseCloseExpression( - clean_lines, linenum, endpos) - if startpos > -1 and startline < linenum: - # Found the matching < on an earlier line, collect all - # pieces up to current line. 
- line = '' - for i in xrange(startline, linenum + 1): - line += clean_lines.elided[i].strip() - - # Check for non-const references in function parameters. A single '&' may - # found in the following places: - # inside expression: binary & for bitwise AND - # inside expression: unary & for taking the address of something - # inside declarators: reference parameter - # We will exclude the first two cases by checking that we are not inside a - # function body, including one that was just introduced by a trailing '{'. - # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. - if (nesting_state.previous_stack_top and - not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or - isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): - # Not at toplevel, not within a class, and not within a namespace - return - - # Avoid initializer lists. We only need to scan back from the - # current line for something that starts with ':'. - # - # We don't need to check the current line, since the '&' would - # appear inside the second set of parentheses on the current line as - # opposed to the first set. - if linenum > 0: - for i in xrange(linenum - 1, max(0, linenum - 10), -1): - previous_line = clean_lines.elided[i] - if not Search(r'[),]\s*$', previous_line): - break - if Match(r'^\s*:\s+\S', previous_line): - return - - # Avoid preprocessors - if Search(r'\\\s*$', line): - return - - # Avoid constructor initializer lists - if IsInitializerList(clean_lines, linenum): - return - - # We allow non-const references in a few standard places, like functions - # called "swap()" or iostream operators like "<<" or ">>". Do not check - # those function parameters. - # - # We also accept & in static_assert, which looks like a function but - # it's actually a declaration expression. 
- allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|' - r'operator\s*[<>][<>]|' - r'static_assert|COMPILE_ASSERT' - r')\s*\(') - if Search(allowed_functions, line): - return - elif not Search(r'\S+\([^)]*$', line): - # Don't see an allowed function on this line. Actually we - # didn't see any function name on this line, so this is likely a - # multi-line parameter list. Try a bit harder to catch this case. - for i in xrange(2): - if (linenum > i and - Search(allowed_functions, clean_lines.elided[linenum - i - 1])): - return - - decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body - for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): - if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and - not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)): - error(filename, linenum, 'runtime/references', 2, - 'Is this a non-const reference? ' - 'If so, make const or use a pointer: ' + - ReplaceAll(' *<', '<', parameter)) - - -def CheckCasts(filename, clean_lines, linenum, error): - """Various cast related checks. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Check to see if they're using an conversion function cast. - # I just try to capture the most common basic types, though there are more. - # Parameterless conversion functions, such as bool(), are allowed as they are - # probably a member operator declaration or default constructor. 
- match = Search( - r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b' - r'(int|float|double|bool|char|int32|uint32|int64|uint64)' - r'(\([^)].*)', line) - expecting_function = ExpectingFunctionArgs(clean_lines, linenum) - if match and not expecting_function: - matched_type = match.group(2) - - # matched_new_or_template is used to silence two false positives: - # - New operators - # - Template arguments with function types - # - # For template arguments, we match on types immediately following - # an opening bracket without any spaces. This is a fast way to - # silence the common case where the function type is the first - # template argument. False negative with less-than comparison is - # avoided because those operators are usually followed by a space. - # - # function // bracket + no space = false positive - # value < double(42) // bracket + space = true positive - matched_new_or_template = match.group(1) - - # Avoid arrays by looking for brackets that come after the closing - # parenthesis. - if Match(r'\([^()]+\)\s*\[', match.group(3)): - return - - # Other things to ignore: - # - Function pointers - # - Casts to pointer types - # - Placement new - # - Alias declarations - matched_funcptr = match.group(3) - if (matched_new_or_template is None and - not (matched_funcptr and - (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', - matched_funcptr) or - matched_funcptr.startswith('(*)'))) and - not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and - not Search(r'new\(\S+\)\s*' + matched_type, line)): - error(filename, linenum, 'readability/casting', 4, - 'Using deprecated casting style. ' - 'Use static_cast<%s>(...) instead' % - matched_type) - - if not expecting_function: - CheckCStyleCast(filename, clean_lines, linenum, 'static_cast', - r'\((int|float|double|bool|char|u?int(16|32|64)|size_t)\)', error) - - # This doesn't catch all cases. Consider (const char * const)"hello". 
- # - # (char *) "foo" should always be a const_cast (reinterpret_cast won't - # compile). - if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast', - r'\((char\s?\*+\s?)\)\s*"', error): - pass - else: - # Check pointer casts for other than string constants - CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast', - r'\((\w+\s?\*+\s?)\)', error) - - # In addition, we look for people taking the address of a cast. This - # is dangerous -- casts can assign to temporaries, so the pointer doesn't - # point where you think. - # - # Some non-identifier character is required before the '&' for the - # expression to be recognized as a cast. These are casts: - # expression = &static_cast(temporary()); - # function(&(int*)(temporary())); - # - # This is not a cast: - # reference_type&(int* function_param); - match = Search( - r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|' - r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line) - if match: - # Try a better error message when the & is bound to something - # dereferenced by the casted pointer, as opposed to the casted - # pointer itself. - parenthesis_error = False - match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line) - if match: - _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1))) - if x1 >= 0 and clean_lines.elided[y1][x1] == '(': - _, y2, x2 = CloseExpression(clean_lines, y1, x1) - if x2 >= 0: - extended_line = clean_lines.elided[y2][x2:] - if y2 < clean_lines.NumLines() - 1: - extended_line += clean_lines.elided[y2 + 1] - if Match(r'\s*(?:->|\[)', extended_line): - parenthesis_error = True - - if parenthesis_error: - error(filename, linenum, 'readability/casting', 4, - ('Are you taking an address of something dereferenced ' - 'from a cast? Wrapping the dereferenced expression in ' - 'parentheses will make the binding more obvious')) - else: - error(filename, linenum, 'runtime/casting', 4, - ('Are you taking an address of a cast? 
' - 'This is dangerous: could be a temp var. ' - 'Take the address before doing the cast, rather than after')) - - -def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): - """Checks for a C-style cast by looking for the pattern. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - cast_type: The string for the C++ cast to recommend. This is either - reinterpret_cast, static_cast, or const_cast, depending. - pattern: The regular expression used to find C-style casts. - error: The function to call with any errors found. - - Returns: - True if an error was emitted. - False otherwise. - """ - line = clean_lines.elided[linenum] - match = Search(pattern, line) - if not match: - return False - - # Exclude lines with keywords that tend to look like casts - context = line[0:match.start(1) - 1] - if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context): - return False - - # Try expanding current context to see if we one level of - # parentheses inside a macro. - if linenum > 0: - for i in xrange(linenum - 1, max(0, linenum - 5), -1): - context = clean_lines.elided[i] + context - if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context): - return False - - # operator++(int) and operator--(int) - if (context.endswith(' operator++') or context.endswith(' operator--') or - context.endswith('::operator++') or context.endswith('::operator--')): - return False - - # A single unnamed argument for a function tends to look like old style cast. - # If we see those, don't issue warnings for deprecated casts. - remainder = line[match.end(0):] - if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)', - remainder): - return False - - # At this point, all that should be left is actual casts. - error(filename, linenum, 'readability/casting', 4, - 'Using C-style cast. Use %s<%s>(...) 
instead' % - (cast_type, match.group(1))) - - return True - - -def ExpectingFunctionArgs(clean_lines, linenum): - """Checks whether where function type arguments are expected. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - - Returns: - True if the line at 'linenum' is inside something that expects arguments - of function types. - """ - line = clean_lines.elided[linenum] - return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or - (linenum >= 2 and - (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', - clean_lines.elided[linenum - 1]) or - Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', - clean_lines.elided[linenum - 2]) or - Search(r'\bstd::m?function\s*\<\s*$', - clean_lines.elided[linenum - 1])))) - - -_HEADERS_CONTAINING_TEMPLATES = ( - ('', ('deque',)), - ('', ('unary_function', 'binary_function', - 'plus', 'minus', 'multiplies', 'divides', 'modulus', - 'negate', - 'equal_to', 'not_equal_to', 'greater', 'less', - 'greater_equal', 'less_equal', - 'logical_and', 'logical_or', 'logical_not', - 'unary_negate', 'not1', 'binary_negate', 'not2', - 'bind1st', 'bind2nd', - 'pointer_to_unary_function', - 'pointer_to_binary_function', - 'ptr_fun', - 'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t', - 'mem_fun_ref_t', - 'const_mem_fun_t', 'const_mem_fun1_t', - 'const_mem_fun_ref_t', 'const_mem_fun1_ref_t', - 'mem_fun_ref', - )), - ('', ('numeric_limits',)), - ('', ('list',)), - ('', ('multimap',)), - ('', ('allocator', 'make_shared', 'make_unique', 'shared_ptr', - 'unique_ptr', 'weak_ptr')), - ('', ('queue', 'priority_queue',)), - ('', ('multiset',)), - ('', ('stack',)), - ('', ('char_traits', 'basic_string',)), - ('', ('tuple',)), - ('', ('unordered_map', 'unordered_multimap')), - ('', ('unordered_set', 'unordered_multiset')), - ('', ('pair',)), - ('', ('vector',)), - - # gcc extensions. 
- # Note: std::hash is their hash, ::hash is our hash - ('', ('hash_map', 'hash_multimap',)), - ('', ('hash_set', 'hash_multiset',)), - ('', ('slist',)), - ) - -_HEADERS_MAYBE_TEMPLATES = ( - ('', ('copy', 'max', 'min', 'min_element', 'sort', - 'transform', - )), - ('', ('forward', 'make_pair', 'move', 'swap')), - ) - -_RE_PATTERN_STRING = re.compile(r'\bstring\b') - -_re_pattern_headers_maybe_templates = [] -for _header, _templates in _HEADERS_MAYBE_TEMPLATES: - for _template in _templates: - # Match max(..., ...), max(..., ...), but not foo->max, foo.max or - # 'type::max()'. - _re_pattern_headers_maybe_templates.append( - (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), - _template, - _header)) -# Match set, but not foo->set, foo.set -_re_pattern_headers_maybe_templates.append( - (re.compile(r'[^>.]\bset\s*\<'), - 'set<>', - '')) -# Match 'map var' and 'std::map(...)', but not 'map(...)'' -_re_pattern_headers_maybe_templates.append( - (re.compile(r'(std\b::\bmap\s*\<)|(^(std\b::\b)map\b\(\s*\<)'), - 'map<>', - '')) - -# Other scripts may reach in and modify this pattern. -_re_pattern_templates = [] -for _header, _templates in _HEADERS_CONTAINING_TEMPLATES: - for _template in _templates: - _re_pattern_templates.append( - (re.compile(r'(\<|\b)' + _template + r'\s*\<'), - _template + '<>', - _header)) - - -def FilesBelongToSameModule(filename_cc, filename_h): - """Check if these two filenames belong to the same module. - - The concept of a 'module' here is a as follows: - foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the - same 'module' if they are in the same directory. - some/path/public/xyzzy and some/path/internal/xyzzy are also considered - to belong to the same module here. - - If the filename_cc contains a longer path than the filename_h, for example, - '/absolute/path/to/base/sysinfo.cc', and this file would include - 'base/sysinfo.h', this function also produces the prefix needed to open the - header. 
This is used by the caller of this function to more robustly open the - header file. We don't have access to the real include paths in this context, - so we need this guesswork here. - - Known bugs: tools/base/bar.cc and base/bar.h belong to the same module - according to this implementation. Because of this, this function gives - some false positives. This should be sufficiently rare in practice. - - Args: - filename_cc: is the path for the source (e.g. .cc) file - filename_h: is the path for the header path - - Returns: - Tuple with a bool and a string: - bool: True if filename_cc and filename_h belong to the same module. - string: the additional prefix needed to open the header file. - """ - fileinfo_cc = FileInfo(filename_cc) - if not fileinfo_cc.Extension().lstrip('.') in GetNonHeaderExtensions(): - return (False, '') - - fileinfo_h = FileInfo(filename_h) - if not IsHeaderExtension(fileinfo_h.Extension().lstrip('.')): - return (False, '') - - filename_cc = filename_cc[:-(len(fileinfo_cc.Extension()))] - matched_test_suffix = Search(_TEST_FILE_SUFFIX, fileinfo_cc.BaseName()) - if matched_test_suffix: - filename_cc = filename_cc[:-len(matched_test_suffix.group(1))] - - filename_cc = filename_cc.replace('/public/', '/') - filename_cc = filename_cc.replace('/internal/', '/') - - filename_h = filename_h[:-(len(fileinfo_h.Extension()))] - if filename_h.endswith('-inl'): - filename_h = filename_h[:-len('-inl')] - filename_h = filename_h.replace('/public/', '/') - filename_h = filename_h.replace('/internal/', '/') - - files_belong_to_same_module = filename_cc.endswith(filename_h) - common_path = '' - if files_belong_to_same_module: - common_path = filename_cc[:-len(filename_h)] - return files_belong_to_same_module, common_path - - -def UpdateIncludeState(filename, include_dict, io=codecs): - """Fill up the include_dict with new includes found from the file. - - Args: - filename: the name of the header to read. 
- include_dict: a dictionary in which the headers are inserted. - io: The io factory to use to read the file. Provided for testability. - - Returns: - True if a header was successfully added. False otherwise. - """ - headerfile = None - try: - with io.open(filename, 'r', 'utf8', 'replace') as headerfile: - linenum = 0 - for line in headerfile: - linenum += 1 - clean_line = CleanseComments(line) - match = _RE_PATTERN_INCLUDE.search(clean_line) - if match: - include = match.group(2) - include_dict.setdefault(include, linenum) - return True - except IOError: - return False - - - -def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, - io=codecs): - """Reports for missing stl includes. - - This function will output warnings to make sure you are including the headers - necessary for the stl containers and functions that you use. We only give one - reason to include a header. For example, if you use both equal_to<> and - less<> in a .h file, only one (the latter in the file) of these will be - reported as a reason to include the . - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - include_state: An _IncludeState instance. - error: The function to call with any errors found. - io: The IO factory to use to read the header file. Provided for unittest - injection. - """ - required = {} # A map of header name to linenumber and the template entity. - # Example of required: { '': (1219, 'less<>') } - - for linenum in xrange(clean_lines.NumLines()): - line = clean_lines.elided[linenum] - if not line or line[0] == '#': - continue - - # String is special -- it is a non-templatized type in STL. - matched = _RE_PATTERN_STRING.search(line) - if matched: - # Don't warn about strings in non-STL namespaces: - # (We check only the first match per line; good enough.) 
- prefix = line[:matched.start()] - if prefix.endswith('std::') or not prefix.endswith('::'): - required[''] = (linenum, 'string') - - for pattern, template, header in _re_pattern_headers_maybe_templates: - if pattern.search(line): - required[header] = (linenum, template) - - # The following function is just a speed up, no semantics are changed. - if not '<' in line: # Reduces the cpu time usage by skipping lines. - continue - - for pattern, template, header in _re_pattern_templates: - matched = pattern.search(line) - if matched: - # Don't warn about IWYU in non-STL namespaces: - # (We check only the first match per line; good enough.) - prefix = line[:matched.start()] - if prefix.endswith('std::') or not prefix.endswith('::'): - required[header] = (linenum, template) - - # The policy is that if you #include something in foo.h you don't need to - # include it again in foo.cc. Here, we will look at possible includes. - # Let's flatten the include_state include_list and copy it into a dictionary. - include_dict = dict([item for sublist in include_state.include_list - for item in sublist]) - - # Did we find the header for this file (if any) and successfully load it? - header_found = False - - # Use the absolute path so that matching works properly. - abs_filename = FileInfo(filename).FullName() - - # For Emacs's flymake. - # If cpplint is invoked from Emacs's flymake, a temporary file is generated - # by flymake and that file name might end with '_flymake.cc'. In that case, - # restore original file name here so that the corresponding header file can be - # found. - # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h' - # instead of 'foo_flymake.h' - abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) - - # include_dict is modified during iteration, so we iterate over a copy of - # the keys. 
- header_keys = list(include_dict.keys()) - for header in header_keys: - (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) - fullpath = common_path + header - if same_module and UpdateIncludeState(fullpath, include_dict, io): - header_found = True - - # If we can't find the header file for a .cc, assume it's because we don't - # know where to look. In that case we'll give up as we're not sure they - # didn't include it in the .h file. - # TODO(unknown): Do a better job of finding .h files so we are confident that - # not having the .h file means there isn't one. - if not header_found: - for extension in GetNonHeaderExtensions(): - if filename.endswith('.' + extension): - return - - # All the lines have been processed, report the errors found. - for required_header_unstripped in sorted(required, key=required.__getitem__): - template = required[required_header_unstripped][1] - if required_header_unstripped.strip('<>"') not in include_dict: - error(filename, required[required_header_unstripped][0], - 'build/include_what_you_use', 4, - 'Add #include ' + required_header_unstripped + ' for ' + template) - - -_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<') - - -def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): - """Check that make_pair's template arguments are deduced. - - G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are - specified explicitly, and such use isn't intended in any case. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. 
- """ - line = clean_lines.elided[linenum] - match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) - if match: - error(filename, linenum, 'build/explicit_make_pair', - 4, # 4 = high confidence - 'For C++11-compatibility, omit template arguments from make_pair' - ' OR use pair directly OR if appropriate, construct a pair directly') - - -def CheckRedundantVirtual(filename, clean_lines, linenum, error): - """Check if line contains a redundant "virtual" function-specifier. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - # Look for "virtual" on current line. - line = clean_lines.elided[linenum] - virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line) - if not virtual: return - - # Ignore "virtual" keywords that are near access-specifiers. These - # are only used in class base-specifier and do not apply to member - # functions. - if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or - Match(r'^\s+(public|protected|private)\b', virtual.group(3))): - return - - # Ignore the "virtual" keyword from virtual base classes. Usually - # there is a column on the same line in these cases (virtual base - # classes are rare in google3 because multiple inheritance is rare). - if Match(r'^.*[^:]:[^:].*$', line): return - - # Look for the next opening parenthesis. This is the start of the - # parameter list (possibly on the next line shortly after virtual). - # TODO(unknown): doesn't work if there are virtual functions with - # decltype() or other things that use parentheses, but csearch suggests - # that this is rare. 
- end_col = -1 - end_line = -1 - start_col = len(virtual.group(2)) - for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())): - line = clean_lines.elided[start_line][start_col:] - parameter_list = Match(r'^([^(]*)\(', line) - if parameter_list: - # Match parentheses to find the end of the parameter list - (_, end_line, end_col) = CloseExpression( - clean_lines, start_line, start_col + len(parameter_list.group(1))) - break - start_col = 0 - - if end_col < 0: - return # Couldn't find end of parameter list, give up - - # Look for "override" or "final" after the parameter list - # (possibly on the next few lines). - for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())): - line = clean_lines.elided[i][end_col:] - match = Search(r'\b(override|final)\b', line) - if match: - error(filename, linenum, 'readability/inheritance', 4, - ('"virtual" is redundant since function is ' - 'already declared as "%s"' % match.group(1))) - - # Set end_col to check whole lines after we are done with the - # first line. - end_col = 0 - if Search(r'[^\w]\s*$', line): - break - - -def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): - """Check if line contains a redundant "override" or "final" virt-specifier. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - # Look for closing parenthesis nearby. We need one to confirm where - # the declarator ends and where the virt-specifier starts to avoid - # false positives. 
- line = clean_lines.elided[linenum] - declarator_end = line.rfind(')') - if declarator_end >= 0: - fragment = line[declarator_end:] - else: - if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: - fragment = line - else: - return - - # Check that at most one of "override" or "final" is present, not both - if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): - error(filename, linenum, 'readability/inheritance', 4, - ('"override" is redundant since function is ' - 'already declared as "final"')) - - - - -# Returns true if we are at a new block, and it is directly -# inside of a namespace. -def IsBlockInNameSpace(nesting_state, is_forward_declaration): - """Checks that the new block is directly in a namespace. - - Args: - nesting_state: The _NestingState object that contains info about our state. - is_forward_declaration: If the class is a forward declared class. - Returns: - Whether or not the new block is directly in a namespace. - """ - if is_forward_declaration: - return len(nesting_state.stack) >= 1 and ( - isinstance(nesting_state.stack[-1], _NamespaceInfo)) - - - return (len(nesting_state.stack) > 1 and - nesting_state.stack[-1].check_namespace_indentation and - isinstance(nesting_state.stack[-2], _NamespaceInfo)) - - -def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, - raw_lines_no_comments, linenum): - """This method determines if we should apply our namespace indentation check. - - Args: - nesting_state: The current nesting state. - is_namespace_indent_item: If we just put a new class on the stack, True. - If the top of the stack is not a class, or we did not recently - add the class, False. - raw_lines_no_comments: The lines without the comments. - linenum: The current line number we are processing. - - Returns: - True if we should apply our namespace indentation check. Currently, it - only works for classes and namespaces inside of a namespace. 
- """ - - is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, - linenum) - - if not (is_namespace_indent_item or is_forward_declaration): - return False - - # If we are in a macro, we do not want to check the namespace indentation. - if IsMacroDefinition(raw_lines_no_comments, linenum): - return False - - return IsBlockInNameSpace(nesting_state, is_forward_declaration) - - -# Call this method if the line is directly inside of a namespace. -# If the line above is blank (excluding comments) or the start of -# an inner namespace, it cannot be indented. -def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum, - error): - line = raw_lines_no_comments[linenum] - if Match(r'^\s+', line): - error(filename, linenum, 'runtime/indentation_namespace', 4, - 'Do not indent within a namespace') - - -def ProcessLine(filename, file_extension, clean_lines, line, - include_state, function_state, nesting_state, error, - extra_check_functions=None): - """Processes a single line in the file. - - Args: - filename: Filename of the file that is being processed. - file_extension: The extension (dot not included) of the file. - clean_lines: An array of strings, each representing a line of the file, - with comments stripped. - line: Number of line being processed. - include_state: An _IncludeState instance in which the headers are inserted. - function_state: A _FunctionState instance which counts function lines, etc. - nesting_state: A NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: A callable to which errors are reported, which takes 4 arguments: - filename, line number, error level, and message - extra_check_functions: An array of additional check functions that will be - run on each source line. 
Each function takes 4 - arguments: filename, clean_lines, line, error - """ - raw_lines = clean_lines.raw_lines - ParseNolintSuppressions(filename, raw_lines[line], line, error) - nesting_state.Update(filename, clean_lines, line, error) - CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, - error) - if nesting_state.InAsmBlock(): return - CheckForFunctionLengths(filename, clean_lines, line, function_state, error) - CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) - CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) - CheckLanguage(filename, clean_lines, line, file_extension, include_state, - nesting_state, error) - CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) - CheckForNonStandardConstructs(filename, clean_lines, line, - nesting_state, error) - CheckVlogArguments(filename, clean_lines, line, error) - CheckPosixThreading(filename, clean_lines, line, error) - CheckInvalidIncrement(filename, clean_lines, line, error) - CheckMakePairUsesDeduction(filename, clean_lines, line, error) - CheckRedundantVirtual(filename, clean_lines, line, error) - CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) - if extra_check_functions: - for check_fn in extra_check_functions: - check_fn(filename, clean_lines, line, error) - -def FlagCxx11Features(filename, clean_lines, linenum, error): - """Flag those c++11 features that we only allow in certain places. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) - - # Flag unapproved C++ TR1 headers. 
- if include and include.group(1).startswith('tr1/'): - error(filename, linenum, 'build/c++tr1', 5, - ('C++ TR1 headers such as <%s> are unapproved.') % include.group(1)) - - # Flag unapproved C++11 headers. - if include and include.group(1) in ('cfenv', - 'condition_variable', - 'fenv.h', - 'future', - 'mutex', - 'thread', - 'chrono', - 'ratio', - 'regex', - 'system_error', - ): - error(filename, linenum, 'build/c++11', 5, - ('<%s> is an unapproved C++11 header.') % include.group(1)) - - # The only place where we need to worry about C++11 keywords and library - # features in preprocessor directives is in macro definitions. - if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return - - # These are classes and free functions. The classes are always - # mentioned as std::*, but we only catch the free functions if - # they're not found by ADL. They're alphabetical by header. - for top_name in ( - # type_traits - 'alignment_of', - 'aligned_union', - ): - if Search(r'\bstd::%s\b' % top_name, line): - error(filename, linenum, 'build/c++11', 5, - ('std::%s is an unapproved C++11 class or function. Send c-style ' - 'an example of where it would make your code more readable, and ' - 'they may let you use it.') % top_name) - - -def FlagCxx14Features(filename, clean_lines, linenum, error): - """Flag those C++14 features that we restrict. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) - - # Flag unapproved C++14 headers. 
- if include and include.group(1) in ('scoped_allocator', 'shared_mutex'): - error(filename, linenum, 'build/c++14', 5, - ('<%s> is an unapproved C++14 header.') % include.group(1)) - - -def ProcessFileData(filename, file_extension, lines, error, - extra_check_functions=None): - """Performs lint checks and reports any errors to the given error function. - - Args: - filename: Filename of the file that is being processed. - file_extension: The extension (dot not included) of the file. - lines: An array of strings, each representing a line of the file, with the - last element being empty if the file is terminated with a newline. - error: A callable to which errors are reported, which takes 4 arguments: - filename, line number, error level, and message - extra_check_functions: An array of additional check functions that will be - run on each source line. Each function takes 4 - arguments: filename, clean_lines, line, error - """ - lines = (['// marker so line numbers and indices both start at 1'] + lines + - ['// marker so line numbers end in a known way']) - - include_state = _IncludeState() - function_state = _FunctionState() - nesting_state = NestingState() - - ResetNolintSuppressions() - - CheckForCopyright(filename, lines, error) - ProcessGlobalSuppressions(lines) - RemoveMultiLineComments(filename, lines, error) - clean_lines = CleansedLines(lines) - - if IsHeaderExtension(file_extension): - CheckForHeaderGuard(filename, clean_lines, error) - - for line in xrange(clean_lines.NumLines()): - ProcessLine(filename, file_extension, clean_lines, line, - include_state, function_state, nesting_state, error, - extra_check_functions) - FlagCxx11Features(filename, clean_lines, line, error) - nesting_state.CheckCompletedBlocks(filename, error) - - CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) - - # Check that the .cc file has included its header if it exists. 
- if _IsSourceExtension(file_extension): - CheckHeaderFileIncluded(filename, include_state, error) - - # We check here rather than inside ProcessLine so that we see raw - # lines rather than "cleaned" lines. - CheckForBadCharacters(filename, lines, error) - - CheckForNewlineAtEOF(filename, lines, error) - -def ProcessConfigOverrides(filename): - """ Loads the configuration files and processes the config overrides. - - Args: - filename: The name of the file being processed by the linter. - - Returns: - False if the current |filename| should not be processed further. - """ - - abs_filename = os.path.abspath(filename) - cfg_filters = [] - keep_looking = True - while keep_looking: - abs_path, base_name = os.path.split(abs_filename) - if not base_name: - break # Reached the root directory. - - cfg_file = os.path.join(abs_path, "CPPLINT.cfg") - abs_filename = abs_path - if not os.path.isfile(cfg_file): - continue - - try: - with codecs.open(cfg_file, 'r', 'utf8', 'replace') as file_handle: - for line in file_handle: - line, _, _ = line.partition('#') # Remove comments. - if not line.strip(): - continue - - name, _, val = line.partition('=') - name = name.strip() - val = val.strip() - if name == 'set noparent': - keep_looking = False - elif name == 'filter': - cfg_filters.append(val) - elif name == 'exclude_files': - # When matching exclude_files pattern, use the base_name of - # the current file name or the directory name we are processing. - # For example, if we are checking for lint errors in /foo/bar/baz.cc - # and we found the .cfg file at /foo/CPPLINT.cfg, then the config - # file's "exclude_files" filter is meant to be checked against "bar" - # and not "baz" nor "bar/baz.cc". - if base_name: - pattern = re.compile(val) - if pattern.match(base_name): - if _cpplint_state.quiet: - # Suppress "Ignoring file" warning when using --quiet. - return False - _cpplint_state.PrintInfo('Ignoring "%s": file excluded by "%s". 
' - 'File path component "%s" matches ' - 'pattern "%s"\n' % - (filename, cfg_file, base_name, val)) - return False - elif name == 'linelength': - global _line_length - try: - _line_length = int(val) - except ValueError: - _cpplint_state.PrintError('Line length must be numeric.') - elif name == 'extensions': - ProcessExtensionsOption(val) - elif name == 'root': - global _root - # root directories are specified relative to CPPLINT.cfg dir. - _root = os.path.join(os.path.dirname(cfg_file), val) - elif name == 'headers': - ProcessHppHeadersOption(val) - elif name == 'includeorder': - ProcessIncludeOrderOption(val) - else: - _cpplint_state.PrintError( - 'Invalid configuration option (%s) in file %s\n' % - (name, cfg_file)) - - except IOError: - _cpplint_state.PrintError( - "Skipping config file '%s': Can't open for reading\n" % cfg_file) - keep_looking = False - - # Apply all the accumulated filters in reverse order (top-level directory - # config options having the least priority). - for cfg_filter in reversed(cfg_filters): - _AddFilters(cfg_filter) - - return True - - -def ProcessFile(filename, vlevel, extra_check_functions=None): - """Does google-lint on a single file. - - Args: - filename: The name of the file to parse. - - vlevel: The level of errors to report. Every error of confidence - >= verbose_level will be reported. 0 is a good default. - - extra_check_functions: An array of additional check functions that will be - run on each source line. Each function takes 4 - arguments: filename, clean_lines, line, error - """ - - _SetVerboseLevel(vlevel) - _BackupFilters() - old_errors = _cpplint_state.error_count - - if not ProcessConfigOverrides(filename): - _RestoreFilters() - return - - lf_lines = [] - crlf_lines = [] - try: - # Support the UNIX convention of using "-" for stdin. 
Note that - # we are not opening the file with universal newline support - # (which codecs doesn't support anyway), so the resulting lines do - # contain trailing '\r' characters if we are reading a file that - # has CRLF endings. - # If after the split a trailing '\r' is present, it is removed - # below. - if filename == '-': - lines = codecs.StreamReaderWriter(sys.stdin, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace').read().split('\n') - else: - with codecs.open(filename, 'r', 'utf8', 'replace') as target_file: - lines = target_file.read().split('\n') - - # Remove trailing '\r'. - # The -1 accounts for the extra trailing blank line we get from split() - for linenum in range(len(lines) - 1): - if lines[linenum].endswith('\r'): - lines[linenum] = lines[linenum].rstrip('\r') - crlf_lines.append(linenum + 1) - else: - lf_lines.append(linenum + 1) - - except IOError: - _cpplint_state.PrintError( - "Skipping input '%s': Can't open for reading\n" % filename) - _RestoreFilters() - return - - # Note, if no dot is found, this will give the entire filename as the ext. - file_extension = filename[filename.rfind('.') + 1:] - - # When reading from stdin, the extension is unknown, so no cpplint tests - # should rely on the extension. - if filename != '-' and file_extension not in GetAllExtensions(): - _cpplint_state.PrintError('Ignoring %s; not a valid file name ' - '(%s)\n' % (filename, ', '.join(GetAllExtensions()))) - else: - ProcessFileData(filename, file_extension, lines, Error, - extra_check_functions) - - # If end-of-line sequences are a mix of LF and CR-LF, issue - # warnings on the lines with CR. - # - # Don't issue any warnings if all lines are uniformly LF or CR-LF, - # since critique can handle these just fine, and the style guide - # doesn't dictate a particular end of line sequence. 
- # - # We can't depend on os.linesep to determine what the desired - # end-of-line sequence should be, since that will return the - # server-side end-of-line sequence. - if lf_lines and crlf_lines: - # Warn on every line with CR. An alternative approach might be to - # check whether the file is mostly CRLF or just LF, and warn on the - # minority, we bias toward LF here since most tools prefer LF. - for linenum in crlf_lines: - Error(filename, linenum, 'whitespace/newline', 1, - 'Unexpected \\r (^M) found; better to use only \\n') - - # Suppress printing anything if --quiet was passed unless the error - # count has increased after processing this file. - if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count: - _cpplint_state.PrintInfo('Done processing %s\n' % filename) - _RestoreFilters() - - -def PrintUsage(message): - """Prints a brief usage string and exits, optionally with an error message. - - Args: - message: The optional error message. - """ - sys.stderr.write(_USAGE % (sorted(list(GetAllExtensions())), - ','.join(sorted(list(GetAllExtensions()))), - sorted(GetHeaderExtensions()), - ','.join(sorted(GetHeaderExtensions())))) - - if message: - sys.exit('\nFATAL ERROR: ' + message) - else: - sys.exit(0) - -def PrintVersion(): - sys.stdout.write('Cpplint fork (https://github.com/cpplint/cpplint)\n') - sys.stdout.write('cpplint ' + __VERSION__ + '\n') - sys.stdout.write('Python ' + sys.version + '\n') - sys.exit(0) - -def PrintCategories(): - """Prints a list of all the error-categories used by error messages. - - These are the categories used to filter messages via --filter. - """ - sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) - sys.exit(0) - - -def ParseArguments(args): - """Parses the command line arguments. - - This may set the output format and verbosity level as side-effects. - - Args: - args: The command line arguments: - - Returns: - The list of filenames to lint. 
- """ - try: - (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=', - 'v=', - 'version', - 'counting=', - 'filter=', - 'root=', - 'repository=', - 'linelength=', - 'extensions=', - 'exclude=', - 'recursive', - 'headers=', - 'includeorder=', - 'quiet']) - except getopt.GetoptError: - PrintUsage('Invalid arguments.') - - verbosity = _VerboseLevel() - output_format = _OutputFormat() - filters = '' - quiet = _Quiet() - counting_style = '' - recursive = False - - for (opt, val) in opts: - if opt == '--help': - PrintUsage(None) - if opt == '--version': - PrintVersion() - elif opt == '--output': - if val not in ('emacs', 'vs7', 'eclipse', 'junit', 'sed', 'gsed'): - PrintUsage('The only allowed output formats are emacs, vs7, eclipse ' - 'sed, gsed and junit.') - output_format = val - elif opt == '--quiet': - quiet = True - elif opt == '--verbose' or opt == '--v': - verbosity = int(val) - elif opt == '--filter': - filters = val - if not filters: - PrintCategories() - elif opt == '--counting': - if val not in ('total', 'toplevel', 'detailed'): - PrintUsage('Valid counting options are total, toplevel, and detailed') - counting_style = val - elif opt == '--root': - global _root - _root = val - elif opt == '--repository': - global _repository - _repository = val - elif opt == '--linelength': - global _line_length - try: - _line_length = int(val) - except ValueError: - PrintUsage('Line length must be digits.') - elif opt == '--exclude': - global _excludes - if not _excludes: - _excludes = set() - _excludes.update(glob.glob(val)) - elif opt == '--extensions': - ProcessExtensionsOption(val) - elif opt == '--headers': - ProcessHppHeadersOption(val) - elif opt == '--recursive': - recursive = True - elif opt == '--includeorder': - ProcessIncludeOrderOption(val) - - if not filenames: - PrintUsage('No files were specified.') - - if recursive: - filenames = _ExpandDirectories(filenames) - - if _excludes: - filenames = _FilterExcludedFiles(filenames) - - 
_SetOutputFormat(output_format) - _SetQuiet(quiet) - _SetVerboseLevel(verbosity) - _SetFilters(filters) - _SetCountingStyle(counting_style) - - filenames.sort() - return filenames - -def _ExpandDirectories(filenames): - """Searches a list of filenames and replaces directories in the list with - all files descending from those directories. Files with extensions not in - the valid extensions list are excluded. - - Args: - filenames: A list of files or directories - - Returns: - A list of all files that are members of filenames or descended from a - directory in filenames - """ - expanded = set() - for filename in filenames: - if not os.path.isdir(filename): - expanded.add(filename) - continue - - for root, _, files in os.walk(filename): - for loopfile in files: - fullname = os.path.join(root, loopfile) - if fullname.startswith('.' + os.path.sep): - fullname = fullname[len('.' + os.path.sep):] - expanded.add(fullname) - - filtered = [] - for filename in expanded: - if os.path.splitext(filename)[1][1:] in GetAllExtensions(): - filtered.append(filename) - return filtered - -def _FilterExcludedFiles(fnames): - """Filters out files listed in the --exclude command line switch. File paths - in the switch are evaluated relative to the current working directory - """ - exclude_paths = [os.path.abspath(f) for f in _excludes] - # because globbing does not work recursively, exclude all subpath of all excluded entries - return [f for f in fnames - if not any(e for e in exclude_paths - if _IsParentOrSame(e, os.path.abspath(f)))] - -def _IsParentOrSame(parent, child): - """Return true if child is subdirectory of parent. - Assumes both paths are absolute and don't contain symlinks. 
- """ - parent = os.path.normpath(parent) - child = os.path.normpath(child) - if parent == child: - return True - - prefix = os.path.commonprefix([parent, child]) - if prefix != parent: - return False - # Note: os.path.commonprefix operates on character basis, so - # take extra care of situations like '/foo/ba' and '/foo/bar/baz' - child_suffix = child[len(prefix):] - child_suffix = child_suffix.lstrip(os.sep) - return child == os.path.join(prefix, child_suffix) - -def main(): - filenames = ParseArguments(sys.argv[1:]) - backup_err = sys.stderr - try: - # Change stderr to write with replacement characters so we don't die - # if we try to print something containing non-ASCII characters. - sys.stderr = codecs.StreamReader(sys.stderr, 'replace') - - _cpplint_state.ResetErrorCounts() - for filename in filenames: - ProcessFile(filename, _cpplint_state.verbose_level) - # If --quiet is passed, suppress printing error count unless there are errors. - if not _cpplint_state.quiet or _cpplint_state.error_count > 0: - _cpplint_state.PrintErrorCounts() - - if _cpplint_state.output_format == 'junit': - sys.stderr.write(_cpplint_state.FormatJUnitXML()) - - finally: - sys.stderr = backup_err - - sys.exit(_cpplint_state.error_count > 0) - - -if __name__ == '__main__': - main() diff --git a/cpp/build-support/fuzzing/pack_corpus.py b/cpp/build-support/fuzzing/pack_corpus.py index 2064fed608d..94d9a88b387 100755 --- a/cpp/build-support/fuzzing/pack_corpus.py +++ b/cpp/build-support/fuzzing/pack_corpus.py @@ -27,19 +27,20 @@ def process_dir(corpus_dir, zip_output): - seen = set() + seen_hashes = {} for child in corpus_dir.iterdir(): if not child.is_file(): - raise IOError("Not a file: {0}".format(child)) + raise IOError(f"Not a file: {child}") with child.open('rb') as f: data = f.read() arcname = hashlib.sha1(data).hexdigest() - if arcname in seen: - raise ValueError("Duplicate hash: {0} (in file {1})" - .format(arcname, child)) + if arcname in seen_hashes: + raise ValueError( + 
f"Duplicate hash: {arcname} (in file {child}), " + f"already seen in file {seen_hashes[arcname]}") zip_output.writestr(str(arcname), data) - seen.add(arcname) + seen_hashes[arcname] = child def main(corpus_dir, zip_output_name): @@ -49,6 +50,6 @@ def main(corpus_dir, zip_output_name): if __name__ == "__main__": if len(sys.argv) != 3: - print("Usage: {0} ".format(sys.argv[0])) + print(f"Usage: {sys.argv[0]} ") sys.exit(1) main(sys.argv[1], sys.argv[2]) diff --git a/cpp/build-support/iwyu/iwyu-filter.awk b/cpp/build-support/iwyu/iwyu-filter.awk deleted file mode 100644 index 943ab115c25..00000000000 --- a/cpp/build-support/iwyu/iwyu-filter.awk +++ /dev/null @@ -1,96 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# -# This is an awk script to process output from the include-what-you-use (IWYU) -# tool. As of now, IWYU is of alpha quality and it gives many incorrect -# recommendations -- obviously invalid or leading to compilation breakage. -# Most of those can be silenced using appropriate IWYU pragmas, but it's not -# the case for the auto-generated files. 
-# -# Also, it's possible to address invalid recommendation using mappings: -# https://github.com/include-what-you-use/include-what-you-use/blob/master/docs/IWYUMappings.md -# -# Usage: -# 1. Run the CMake with -DCMAKE_CXX_INCLUDE_WHAT_YOU_USE= -# -# The path to the IWYU binary should be absolute. The path to the binary -# and the command-line options should be separated by semicolon -# (that's for feeding it into CMake list variables). -# -# E.g., from the build directory (line breaks are just for readability): -# -# CC=../../thirdparty/clang-toolchain/bin/clang -# CXX=../../thirdparty/clang-toolchain/bin/clang++ -# IWYU="`pwd`../../thirdparty/clang-toolchain/bin/include-what-you-use;\ -# -Xiwyu;--mapping_file=`pwd`../../build-support/iwyu/mappings/map.imp" -# -# ../../build-support/enable_devtoolset.sh \ -# env CC=$CC CXX=$CXX \ -# ../../thirdparty/installed/common/bin/cmake \ -# -DCMAKE_CXX_INCLUDE_WHAT_YOU_USE=\"$IWYU\" \ -# ../.. -# -# NOTE: -# Since the arrow code has some 'ifdef NDEBUG' directives, it's possible -# that IWYU would produce different results if run against release, not -# debug build. However, we plan to use the tool only with debug builds. -# -# 2. Run make, separating the output from the IWYU tool into a separate file -# (it's possible to use piping the output from the tool to the script -# but having a file is good for future reference, if necessary): -# -# make -j$(nproc) 2>/tmp/iwyu.log -# -# 3. Process the output from the IWYU tool using the script: -# -# awk -f ../../build-support/iwyu/iwyu-filter.awk /tmp/iwyu.log -# - -BEGIN { - # This is the list of the files for which the suggestions from IWYU are - # ignored. Eventually, this list should become empty as soon as all the valid - # suggestions are addressed and invalid ones are taken care either by proper - # IWYU pragmas or adding special mappings (e.g. like boost mappings). 
- # muted["relative/path/to/file"] - muted["arrow/util/bit-util-test.cc"] - muted["arrow/util/rle-encoding-test.cc"] - muted["arrow/vendored"] - muted["include/hdfs.h"] - muted["arrow/visitor.h"] -} - -# mute all suggestions for the auto-generated files -/.*\.(pb|proxy|service)\.(cc|h) should (add|remove) these lines:/, /^$/ { - next -} - -# mute suggestions for the explicitly specified files -/.* should (add|remove) these lines:/ { - do_print = 1 - for (path in muted) { - if (index($0, path)) { - do_print = 0 - break - } - } -} -/^$/ { - if (do_print) print - do_print = 0 -} -{ if (do_print) print } diff --git a/cpp/build-support/iwyu/iwyu.sh b/cpp/build-support/iwyu/iwyu.sh deleted file mode 100755 index 58ffce0c353..00000000000 --- a/cpp/build-support/iwyu/iwyu.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -set -uo pipefail - -ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd) - -IWYU_LOG=$(mktemp -t arrow-cpp-iwyu.XXXXXX) -trap "rm -f $IWYU_LOG" EXIT - -IWYU_MAPPINGS_PATH="$ROOT/cpp/build-support/iwyu/mappings" -IWYU_ARGS="\ - --mapping_file=$IWYU_MAPPINGS_PATH/boost-all.imp \ - --mapping_file=$IWYU_MAPPINGS_PATH/boost-all-private.imp \ - --mapping_file=$IWYU_MAPPINGS_PATH/boost-extra.imp \ - --mapping_file=$IWYU_MAPPINGS_PATH/gflags.imp \ - --mapping_file=$IWYU_MAPPINGS_PATH/glog.imp \ - --mapping_file=$IWYU_MAPPINGS_PATH/gmock.imp \ - --mapping_file=$IWYU_MAPPINGS_PATH/gtest.imp \ - --mapping_file=$IWYU_MAPPINGS_PATH/arrow-misc.imp" - -set -e - -affected_files() { - pushd $ROOT > /dev/null - local commit=$($ROOT/cpp/build-support/get-upstream-commit.sh) - git diff --name-only $commit | awk '/\.(c|cc|h)$/' - popd > /dev/null -} - -# Show the IWYU version. Also causes the script to fail if iwyu is not in your -# PATH -include-what-you-use --version - -if [[ "${1:-}" == "all" ]]; then - ${PYTHON:-python3} $ROOT/cpp/build-support/iwyu/iwyu_tool.py -p ${IWYU_COMPILATION_DATABASE_PATH:-.} \ - -- $IWYU_ARGS | awk -f $ROOT/cpp/build-support/iwyu/iwyu-filter.awk -elif [[ "${1:-}" == "match" ]]; then - ALL_FILES= - IWYU_FILE_LIST= - for path in $(find $ROOT/cpp/src -type f | awk '/\.(c|cc|h)$/'); do - if [[ $path =~ $2 ]]; then - IWYU_FILE_LIST="$IWYU_FILE_LIST $path" - fi - done - - echo "Running IWYU on $IWYU_FILE_LIST" - ${PYTHON:-python3} $ROOT/cpp/build-support/iwyu/iwyu_tool.py \ - -p ${IWYU_COMPILATION_DATABASE_PATH:-.} $IWYU_FILE_LIST -- \ - $IWYU_ARGS | awk -f $ROOT/cpp/build-support/iwyu/iwyu-filter.awk -else - # Build the list of updated files which are of IWYU interest. - file_list_tmp=$(affected_files) - if [ -z "$file_list_tmp" ]; then - exit 0 - fi - - # Adjust the path for every element in the list. The iwyu_tool.py normalizes - # paths (via realpath) to match the records from the compilation database. 
- IWYU_FILE_LIST= - for p in $file_list_tmp; do - IWYU_FILE_LIST="$IWYU_FILE_LIST $ROOT/$p" - done - - ${PYTHON:-python3} $ROOT/cpp/build-support/iwyu/iwyu_tool.py \ - -p ${IWYU_COMPILATION_DATABASE_PATH:-.} $IWYU_FILE_LIST -- \ - $IWYU_ARGS | awk -f $ROOT/cpp/build-support/iwyu/iwyu-filter.awk > $IWYU_LOG -fi - -if [ -s "$IWYU_LOG" ]; then - # The output is not empty: the changelist needs correction. - cat $IWYU_LOG 1>&2 - exit 1 -fi diff --git a/cpp/build-support/iwyu/iwyu_tool.py b/cpp/build-support/iwyu/iwyu_tool.py deleted file mode 100755 index 1429e0c0ee3..00000000000 --- a/cpp/build-support/iwyu/iwyu_tool.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python - -# This file has been imported into the apache source tree from -# the IWYU source tree as of version 0.8 -# https://github.com/include-what-you-use/include-what-you-use/blob/master/iwyu_tool.py -# and corresponding license has been added: -# https://github.com/include-what-you-use/include-what-you-use/blob/master/LICENSE.TXT -# -# ============================================================================== -# LLVM Release License -# ============================================================================== -# University of Illinois/NCSA -# Open Source License -# -# Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. -# All rights reserved. 
-# -# Developed by: -# -# LLVM Team -# -# University of Illinois at Urbana-Champaign -# -# http://llvm.org -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal with -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do -# so, subject to the following conditions: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimers. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimers in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the names of the LLVM Team, University of Illinois at -# Urbana-Champaign, nor the names of its contributors may be used to -# endorse or promote products derived from this Software without specific -# prior written permission. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE -# SOFTWARE. - -""" Driver to consume a Clang compilation database and invoke IWYU. - -Example usage with CMake: - - # Unix systems - $ mkdir build && cd build - $ CC="clang" CXX="clang++" cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ... - $ iwyu_tool.py -p . 
- - # Windows systems - $ mkdir build && cd build - $ cmake -DCMAKE_CXX_COMPILER="%VCINSTALLDIR%/bin/cl.exe" \ - -DCMAKE_C_COMPILER="%VCINSTALLDIR%/VC/bin/cl.exe" \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ - -G Ninja ... - $ python iwyu_tool.py -p . - -See iwyu_tool.py -h for more details on command-line arguments. -""" - -import os -import sys -import json -import argparse -import subprocess -import re - -import logging - -logging.basicConfig(filename='iwyu.log') -LOGGER = logging.getLogger("iwyu") - - -def iwyu_formatter(output): - """ Process iwyu's output, basically a no-op. """ - print('\n'.join(output)) - - -CORRECT_RE = re.compile(r'^\((.*?) has correct #includes/fwd-decls\)$') -SHOULD_ADD_RE = re.compile(r'^(.*?) should add these lines:$') -SHOULD_REMOVE_RE = re.compile(r'^(.*?) should remove these lines:$') -FULL_LIST_RE = re.compile(r'The full include-list for (.*?):$') -END_RE = re.compile(r'^---$') -LINES_RE = re.compile(r'^- (.*?) // lines ([0-9]+)-[0-9]+$') - - -GENERAL, ADD, REMOVE, LIST = range(4) - - -def clang_formatter(output): - """ Process iwyu's output into something clang-like. 
""" - state = (GENERAL, None) - for line in output: - match = CORRECT_RE.match(line) - if match: - print('%s:1:1: note: #includes/fwd-decls are correct', match.groups(1)) - continue - match = SHOULD_ADD_RE.match(line) - if match: - state = (ADD, match.group(1)) - continue - match = SHOULD_REMOVE_RE.match(line) - if match: - state = (REMOVE, match.group(1)) - continue - match = FULL_LIST_RE.match(line) - if match: - state = (LIST, match.group(1)) - elif END_RE.match(line): - state = (GENERAL, None) - elif not line.strip(): - continue - elif state[0] == GENERAL: - print(line) - elif state[0] == ADD: - print('%s:1:1: error: add the following line', state[1]) - print(line) - elif state[0] == REMOVE: - match = LINES_RE.match(line) - line_no = match.group(2) if match else '1' - print('%s:%s:1: error: remove the following line', state[1], line_no) - print(match.group(1)) - - -DEFAULT_FORMAT = 'iwyu' -FORMATTERS = { - 'iwyu': iwyu_formatter, - 'clang': clang_formatter -} - - -def get_output(cwd, command): - """ Run the given command and return its output as a string. """ - process = subprocess.Popen(command, - cwd=cwd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - return process.communicate()[0].decode("utf-8").splitlines() - - -def run_iwyu(cwd, compile_command, iwyu_args, verbose, formatter): - """ Rewrite compile_command to an IWYU command, and run it. """ - compiler, _, args = compile_command.partition(' ') - if compiler.endswith('cl.exe'): - # If the compiler name is cl.exe, let IWYU be cl-compatible - clang_args = ['--driver-mode=cl'] - else: - clang_args = [] - - iwyu_args = ['-Xiwyu ' + a for a in iwyu_args] - command = ['include-what-you-use'] + clang_args + iwyu_args - command = '%s %s' % (' '.join(command), args.strip()) - - if verbose: - print('%s:', command) - - formatter(get_output(cwd, command)) - - -def main(compilation_db_path, source_files, verbose, formatter, iwyu_args): - """ Entry point. 
""" - # Canonicalize compilation database path - if os.path.isdir(compilation_db_path): - compilation_db_path = os.path.join(compilation_db_path, - 'compile_commands.json') - - compilation_db_path = os.path.realpath(compilation_db_path) - if not os.path.isfile(compilation_db_path): - print('ERROR: No such file or directory: \'%s\'', compilation_db_path) - return 1 - - # Read compilation db from disk - with open(compilation_db_path, 'r') as fileobj: - compilation_db = json.load(fileobj) - - # expand symlinks - for entry in compilation_db: - entry['file'] = os.path.realpath(entry['file']) - - # Cross-reference source files with compilation database - source_files = [os.path.realpath(s) for s in source_files] - if not source_files: - # No source files specified, analyze entire compilation database - entries = compilation_db - else: - # Source files specified, analyze the ones appearing in compilation db, - # warn for the rest. - entries = [] - for source in source_files: - matches = [e for e in compilation_db if e['file'] == source] - if matches: - entries.extend(matches) - else: - print("{} not in compilation database".format(source)) - # TODO: As long as there is no complete compilation database available this check cannot be performed - pass - #print('WARNING: \'%s\' not found in compilation database.', source) - - # Run analysis - try: - for entry in entries: - cwd, compile_command = entry['directory'], entry['command'] - run_iwyu(cwd, compile_command, iwyu_args, verbose, formatter) - except OSError as why: - print('ERROR: Failed to launch include-what-you-use: %s', why) - return 1 - - return 0 - - -def _bootstrap(): - """ Parse arguments and dispatch to main(). """ - # This hackery is necessary to add the forwarded IWYU args to the - # usage and help strings. - def customize_usage(parser): - """ Rewrite the parser's format_usage. 
""" - original_format_usage = parser.format_usage - parser.format_usage = lambda: original_format_usage().rstrip() + \ - ' -- []' + os.linesep - - def customize_help(parser): - """ Rewrite the parser's format_help. """ - original_format_help = parser.format_help - - def custom_help(): - """ Customized help string, calls the adjusted format_usage. """ - helpmsg = original_format_help() - helplines = helpmsg.splitlines() - helplines[0] = parser.format_usage().rstrip() - return os.linesep.join(helplines) + os.linesep - - parser.format_help = custom_help - - # Parse arguments - parser = argparse.ArgumentParser( - description='Include-what-you-use compilation database driver.', - epilog='Assumes include-what-you-use is available on the PATH.') - customize_usage(parser) - customize_help(parser) - - parser.add_argument('-v', '--verbose', action='store_true', - help='Print IWYU commands') - parser.add_argument('-o', '--output-format', type=str, - choices=FORMATTERS.keys(), default=DEFAULT_FORMAT, - help='Output format (default: %s)' % DEFAULT_FORMAT) - parser.add_argument('-p', metavar='', required=True, - help='Compilation database path', dest='dbpath') - parser.add_argument('source', nargs='*', - help='Zero or more source files to run IWYU on. ' - 'Defaults to all in compilation database.') - - def partition_args(argv): - """ Split around '--' into driver args and IWYU args. 
""" - try: - double_dash = argv.index('--') - return argv[:double_dash], argv[double_dash+1:] - except ValueError: - return argv, [] - argv, iwyu_args = partition_args(sys.argv[1:]) - args = parser.parse_args(argv) - - sys.exit(main(args.dbpath, args.source, args.verbose, - FORMATTERS[args.output_format], iwyu_args)) - - -if __name__ == '__main__': - _bootstrap() diff --git a/cpp/build-support/iwyu/mappings/arrow-misc.imp b/cpp/build-support/iwyu/mappings/arrow-misc.imp deleted file mode 100644 index 6f144f1f34e..00000000000 --- a/cpp/build-support/iwyu/mappings/arrow-misc.imp +++ /dev/null @@ -1,61 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-[ - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", public, "", public ] }, - { include: ["", public, "", public ] }, - { include: ["", public, "", public ] }, - { symbol: ["bool", private, "", public ] }, - { symbol: ["false", private, "", public ] }, - { symbol: ["true", private, "", public ] }, - { symbol: ["int8_t", private, "", public ] }, - { symbol: ["int16_t", private, "", public ] }, - { symbol: ["int32_t", private, "", public ] }, - { symbol: ["int64_t", private, "", public ] }, - { symbol: ["uint8_t", private, "", public ] }, - { symbol: ["uint16_t", private, "", public ] }, - { symbol: ["uint32_t", private, "", public ] }, - { symbol: ["uint64_t", private, "", public ] }, - { symbol: ["size_t", private, "", public ] }, - { symbol: ["variant", private, "\"arrow/compute/kernel.h\"", public ] }, - { symbol: ["default_memory_pool", private, "\"arrow/type_fwd.h\"", public ] }, - { symbol: ["make_shared", private, "", public ] }, - { symbol: ["shared_ptr", private, "", public ] }, - { symbol: ["_Node_const_iterator", private, "", public ] }, - { symbol: ["unordered_map<>::mapped_type", private, "", public ] }, - { symbol: ["std::copy", private, "", public ] }, - { symbol: ["std::move", private, "", public ] }, - { symbol: ["std::transform", private, "", public ] }, - { symbol: ["pair", private, "", public ] }, - { 
symbol: ["errno", private, "", public ] }, - { symbol: ["posix_memalign", private, "", public ] } -] diff --git a/cpp/build-support/iwyu/mappings/boost-all-private.imp b/cpp/build-support/iwyu/mappings/boost-all-private.imp deleted file mode 100644 index 133eef11375..00000000000 --- a/cpp/build-support/iwyu/mappings/boost-all-private.imp +++ /dev/null @@ -1,4166 +0,0 @@ -# This file has been imported into the arrow source tree from -# the IWYU source tree as of version 0.8 -# https://github.com/include-what-you-use/include-what-you-use/blob/master/boost-all-private.imp -# and corresponding license has been added: -# https://github.com/include-what-you-use/include-what-you-use/blob/master/LICENSE.TXT -# -# ============================================================================== -# LLVM Release License -# ============================================================================== -# University of Illinois/NCSA -# Open Source License -# -# Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. -# All rights reserved. -# -# Developed by: -# -# LLVM Team -# -# University of Illinois at Urbana-Champaign -# -# http://llvm.org -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal with -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do -# so, subject to the following conditions: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimers. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimers in the -# documentation and/or other materials provided with the distribution. 
-# -# * Neither the names of the LLVM Team, University of Illinois at -# Urbana-Champaign, nor the names of its contributors may be used to -# endorse or promote products derived from this Software without specific -# prior written permission. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE -# SOFTWARE. - -[ -#grep -r '^ *# *include' boost/ | grep -e "boost/[^:]*/detail/.*hp*:" -e "boost/[^:]*/impl/.*hp*:" | grep -e "\:.*/detail/" -e "\:.*/impl/" | perl -nle 'm/^([^:]+).*["<]([^>]+)[">]/ && print qq@ { include: ["<$2>", private, "<$1>", private ] },@' | grep -e \\[\"\", private, "", private ] }, -# { include: ["", private, "", private ] }, -# -# { include: ["", private, "", private ] }, -# { include: ["", private, "", private ] }, -# { include: ["", private, "", private ] }, -# { include: ["", private, "", private ] }, - - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { 
include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", 
private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", 
private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] }, - { include: ["", private, "", private ] } -] diff --git a/cpp/build-support/iwyu/mappings/boost-all.imp b/cpp/build-support/iwyu/mappings/boost-all.imp deleted file mode 100644 index 7c48acaf341..00000000000 --- a/cpp/build-support/iwyu/mappings/boost-all.imp +++ /dev/null @@ -1,5679 +0,0 @@ -# This file has been imported into the apache source tree from -# the IWYU source tree as of version 0.8 -# https://github.com/include-what-you-use/include-what-you-use/blob/master/boost-all.imp -# and corresponding license has been added: -# https://github.com/include-what-you-use/include-what-you-use/blob/master/LICENSE.TXT -# -# ============================================================================== -# LLVM Release License -# ============================================================================== -# University of Illinois/NCSA -# Open Source License -# -# Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. -# All rights reserved. -# -# Developed by: -# -# LLVM Team -# -# University of Illinois at Urbana-Champaign -# -# http://llvm.org -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal with -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do -# so, subject to the following conditions: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimers. 
-# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimers in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the names of the LLVM Team, University of Illinois at -# Urbana-Champaign, nor the names of its contributors may be used to -# endorse or promote products derived from this Software without specific -# prior written permission. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE -# SOFTWARE. - -[ -# cd /usr/include && grep -r --exclude-dir={detail,impl} '^ *# *include' boost/ | perl -nle 'm/^([^:]+).*["<]([^>]+)[">]/ && print qq@ { include: ["<$2>", private, "<$1>", public ] },@' | grep -e \/detail\/ -e \/impl\/ | grep -e \\[\"\", private, "", public ] }, -{ include: ["@", private, "", public ] }, -{ include: ["@", private, "", public ] }, -{ include: ["@", private, "", public ] }, -#manually delete $ sed '/workarounds*\.hpp/d' -i boost-all.imp -#also good idea to remove all lines referring to folders above (e.g., sed '/\/format\//d' -i boost-all.imp) -#programmatically include: - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: 
["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", 
public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { 
include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", 
private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] 
}, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] } -] diff --git a/cpp/build-support/iwyu/mappings/boost-extra.imp b/cpp/build-support/iwyu/mappings/boost-extra.imp deleted file mode 100644 index aba1e419168..00000000000 --- a/cpp/build-support/iwyu/mappings/boost-extra.imp +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -[ - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] }, - { include: ["", private, "", public ] } -] diff --git a/cpp/build-support/iwyu/mappings/gflags.imp b/cpp/build-support/iwyu/mappings/gflags.imp deleted file mode 100644 index 46ce63d1e71..00000000000 --- a/cpp/build-support/iwyu/mappings/gflags.imp +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-[ - # confuses the IWYU tool because of the 'using ' - { symbol: [ "fLS::clstring", private, "", public ] } -] diff --git a/cpp/build-support/iwyu/mappings/glog.imp b/cpp/build-support/iwyu/mappings/glog.imp deleted file mode 100644 index 08c5e3529bc..00000000000 --- a/cpp/build-support/iwyu/mappings/glog.imp +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -[ - { symbol: [ "LOG", private, "", public ] }, - { symbol: [ "VLOG", private, "", public ] }, - { symbol: [ "CHECK_EQ", private, "", public ] }, - { symbol: [ "CHECK_NE", private, "", public ] }, - { symbol: [ "CHECK_LT", private, "", public ] }, - { symbol: [ "CHECK_GE", private, "", public ] }, - { symbol: [ "CHECK_GT", private, "", public ] }, - { symbol: [ "ErrnoLogMessage", private, "", public ] }, - { symbol: [ "COMPACT_GOOGLE_LOG_0", private, "", public ] } -] diff --git a/cpp/build-support/iwyu/mappings/gmock.imp b/cpp/build-support/iwyu/mappings/gmock.imp deleted file mode 100644 index 76e7cafddde..00000000000 --- a/cpp/build-support/iwyu/mappings/gmock.imp +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -#include -#include - -[ - { include: [ "", private, "", public ] }, - { include: [ "", private, "", public ] } -] \ No newline at end of file diff --git a/cpp/build-support/iwyu/mappings/gtest.imp b/cpp/build-support/iwyu/mappings/gtest.imp deleted file mode 100644 index a54165027e7..00000000000 --- a/cpp/build-support/iwyu/mappings/gtest.imp +++ /dev/null @@ -1,26 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-[ - { include: [ "", private, "", public ] }, - { include: [ "", private, "", public ] }, - { include: [ "", private, "", public ] }, - { include: [ "", private, "", public ] }, - { include: [ "", private, "", public ] }, - { include: [ "", private, "", public ] }, - { include: [ "", private, "", public ] }, - { include: [ "", private, "", public ] } -] diff --git a/cpp/build-support/lint_cpp_cli.py b/cpp/build-support/lint_cpp_cli.py deleted file mode 100755 index 47abd53fe92..00000000000 --- a/cpp/build-support/lint_cpp_cli.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import argparse -import re -import os - -parser = argparse.ArgumentParser( - description="Check for illegal headers for C++/CLI applications") -parser.add_argument("source_path", - help="Path to source code") -arguments = parser.parse_args() - - -_STRIP_COMMENT_REGEX = re.compile('(.+)?(?=//)') -_NULLPTR_REGEX = re.compile(r'.*\bnullptr\b.*') -_RETURN_NOT_OK_REGEX = re.compile(r'.*\sRETURN_NOT_OK.*') -_ASSIGN_OR_RAISE_REGEX = re.compile(r'.*\sASSIGN_OR_RAISE.*') -_DCHECK_REGEX = re.compile(r'.*\sDCHECK.*') - - -def _paths(paths): - return [p.strip().replace('/', os.path.sep) for p in paths.splitlines()] - - -def _strip_comments(line): - m = _STRIP_COMMENT_REGEX.match(line) - if not m: - return line - else: - return m.group(0) - - -def lint_file(path): - fail_rules = [ - # rule, error message, rule-specific exclusions list - (lambda x: '' in x, 'Uses ', []), - (lambda x: '' in x, 'Uses ', []), - (lambda x: re.match(_NULLPTR_REGEX, x), 'Uses nullptr', []), - (lambda x: re.match(_RETURN_NOT_OK_REGEX, x), - 'Use ARROW_RETURN_NOT_OK in header files', _paths('''\ - arrow/status.h - arrow/python/util''')), - (lambda x: re.match(_ASSIGN_OR_RAISE_REGEX, x), - 'Use ARROW_ASSIGN_OR_RAISE in header files', []), - (lambda x: re.match(_DCHECK_REGEX, x), - 'Use ARROW_DCHECK in header files', _paths('''\ - arrow/util/logging.h''')) - - ] - - with open(path) as f: - for i, line in enumerate(f): - stripped_line = _strip_comments(line) - for rule, why, rule_exclusions in fail_rules: - if any([True for excl in rule_exclusions if excl in path]): - continue - - if rule(stripped_line): - yield path, why, i, line - - -EXCLUSIONS = _paths('''\ - arrow/arrow-config.cmake - arrow/python/iterators.h - arrow/util/hashing.h - arrow/util/macros.h - arrow/util/parallel.h - arrow/vendored - arrow/visitor_inline.h - gandiva/cache.h - gandiva/jni - jni/ - test - internal - _generated''') - - -def lint_files(): - for dirpath, _, filenames in os.walk(arguments.source_path): - for filename in 
filenames: - full_path = os.path.join(dirpath, filename) - - exclude = False - for exclusion in EXCLUSIONS: - if exclusion in full_path: - exclude = True - break - - if exclude: - continue - - # Lint file name, except for pkg-config templates - if not filename.endswith('.pc.in'): - if '-' in filename: - why = ("Please use underscores, not hyphens, " - "in source file names") - yield full_path, why, 0, full_path - - # Only run on header files - if filename.endswith('.h'): - for _ in lint_file(full_path): - yield _ - - -if __name__ == '__main__': - failures = list(lint_files()) - for path, why, i, line in failures: - print('File {0} failed C++/CLI lint check: {1}\n' - 'Line {2}: {3}'.format(path, why, i + 1, line)) - if failures: - exit(1) diff --git a/cpp/build-support/lint_exclusions.txt b/cpp/build-support/lint_exclusions.txt deleted file mode 100644 index aa57db72ce9..00000000000 --- a/cpp/build-support/lint_exclusions.txt +++ /dev/null @@ -1,13 +0,0 @@ -*.grpc.fb.* -*.pb.* -*RcppExports.cpp* -*_generated* -*arrowExports.cpp* -*parquet_types.* -*pyarrow_api.h -*pyarrow_lib.h -*python/config.h -*python/platform.h -*thirdparty/* -*vendored/* -*windows_compatibility.h diff --git a/cpp/build-support/lintutils.py b/cpp/build-support/lintutils.py deleted file mode 100644 index 2386eb2e6af..00000000000 --- a/cpp/build-support/lintutils.py +++ /dev/null @@ -1,109 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import multiprocessing as mp -import os -from fnmatch import fnmatch -from subprocess import Popen - - -def chunk(seq, n): - """ - divide a sequence into equal sized chunks - (the last chunk may be smaller, but won't be empty) - """ - chunks = [] - some = [] - for element in seq: - if len(some) == n: - chunks.append(some) - some = [] - some.append(element) - if len(some) > 0: - chunks.append(some) - return chunks - - -def dechunk(chunks): - "flatten chunks into a single list" - seq = [] - for chunk in chunks: - seq.extend(chunk) - return seq - - -def run_parallel(cmds, **kwargs): - """ - Run each of cmds (with shared **kwargs) using subprocess.Popen - then wait for all of them to complete. 
- Runs batches of multiprocessing.cpu_count() * 2 from cmds - returns a list of tuples containing each process' - returncode, stdout, stderr - """ - complete = [] - for cmds_batch in chunk(cmds, mp.cpu_count() * 2): - procs_batch = [Popen(cmd, **kwargs) for cmd in cmds_batch] - for proc in procs_batch: - stdout, stderr = proc.communicate() - complete.append((proc.returncode, stdout, stderr)) - return complete - - -_source_extensions = ''' -.h -.cc -.cpp -'''.split() - - -def get_sources(source_dir, exclude_globs=[]): - sources = [] - for directory, subdirs, basenames in os.walk(source_dir): - for path in [os.path.join(directory, basename) - for basename in basenames]: - # filter out non-source files - if os.path.splitext(path)[1] not in _source_extensions: - continue - - path = os.path.abspath(path) - - # filter out files that match the globs in the globs file - if any([fnmatch(path, glob) for glob in exclude_globs]): - continue - - sources.append(path) - return sources - - -def stdout_pathcolonline(completed_process, filenames): - """ - given a completed process which may have reported some files as problematic - by printing the path name followed by ':' then a line number, examine - stdout and return the set of actually reported file names - """ - returncode, stdout, stderr = completed_process - bfilenames = set() - for filename in filenames: - bfilenames.add(filename.encode('utf-8') + b':') - problem_files = set() - for line in stdout.splitlines(): - for filename in bfilenames: - if line.startswith(filename): - problem_files.add(filename.decode('utf-8')) - bfilenames.remove(filename) - break - return problem_files, stdout diff --git a/cpp/build-support/lsan-suppressions.txt b/cpp/build-support/lsan-suppressions.txt index a8918e10d94..dbcddb29110 100644 --- a/cpp/build-support/lsan-suppressions.txt +++ b/cpp/build-support/lsan-suppressions.txt @@ -26,3 +26,4 @@ leak:CRYPTO_zalloc # without LSAN_OPTIONS=fast_unwind_on_malloc=0:malloc_context_size=100 
leak:opentelemetry::v1::context::ThreadLocalContextStorage::GetStack leak:opentelemetry::v1::context::ThreadLocalContextStorage::Stack::Resize +leak:std::make_shared diff --git a/cpp/build-support/run-infer.sh b/cpp/build-support/run-infer.sh deleted file mode 100755 index 7d185343706..00000000000 --- a/cpp/build-support/run-infer.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# -# Runs infer in the given directory -# Arguments: -# $1 - Path to the infer binary -# $2 - Path to the compile_commands.json to use -# $3 - Apply infer step (1=capture, 2=analyze, 3=report) -# -INFER=$1 -shift -COMPILE_COMMANDS=$1 -shift -APPLY_STEP=$1 -shift - -if [ "$APPLY_STEP" == "1" ]; then - $INFER capture --compilation-database $COMPILE_COMMANDS - echo "" - echo "Run 'make infer-analyze' next." -elif [ "$APPLY_STEP" == "2" ]; then - # infer's analyze step can take a very long time to complete - $INFER analyze - echo "" - echo "Run 'make infer-report' next." - echo "See: http://fbinfer.com/docs/steps-for-ci.html" -elif [ "$APPLY_STEP" == "3" ]; then - $INFER report --issues-csv ./infer-out/report.csv 1> /dev/null - $INFER report --issues-txt ./infer-out/report.txt 1> /dev/null - $INFER report --issues-json ./infer-out/report.json 1> /dev/null - echo "" - echo "Reports (report.txt, report.csv, report.json) can be found in the infer-out subdirectory." 
-else - echo "" - echo "See: http://fbinfer.com/docs/steps-for-ci.html" -fi diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 55e3fe09807..3e3034a3c86 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -75,10 +75,10 @@ function setup_sanitizers() { UBSAN_OPTIONS="$UBSAN_OPTIONS suppressions=$ROOT/build-support/ubsan-suppressions.txt" export UBSAN_OPTIONS - # Enable leak detection even under LLVM 3.4, where it was disabled by default. - # This flag only takes effect when running an ASAN build. - # ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" - # export ASAN_OPTIONS + # Set up suppressions for AddressSanitizer + ASAN_OPTIONS="$ASAN_OPTIONS suppressions=$ROOT/build-support/asan-suppressions.txt" + ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1" + export ASAN_OPTIONS # Set up suppressions for LeakSanitizer LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$ROOT/build-support/lsan-suppressions.txt" diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py deleted file mode 100755 index 96487251d00..00000000000 --- a/cpp/build-support/run_clang_format.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import print_function -import lintutils -from subprocess import PIPE -import argparse -import difflib -import multiprocessing as mp -import sys -from functools import partial - - -# examine the output of clang-format and if changes are -# present assemble a (unified)patch of the difference -def _check_one_file(filename, formatted): - with open(filename, "rb") as reader: - original = reader.read() - - if formatted != original: - # Run the equivalent of diff -u - diff = list(difflib.unified_diff( - original.decode('utf8').splitlines(True), - formatted.decode('utf8').splitlines(True), - fromfile=filename, - tofile="{} (after clang format)".format( - filename))) - else: - diff = None - - return filename, diff - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Runs clang-format on all of the source " - "files. If --fix is specified enforce format by " - "modifying in place, otherwise compare the output " - "with the existing file and output any necessary " - "changes as a patch in unified diff format") - parser.add_argument("--clang_format_binary", - required=True, - help="Path to the clang-format binary") - parser.add_argument("--exclude_globs", - help="Filename containing globs for files " - "that should be excluded from the checks") - parser.add_argument("--source_dir", - required=True, - action="append", - help="Root directory of the source code") - parser.add_argument("--fix", default=False, - action="store_true", - help="If specified, will re-format the source " - "code instead of comparing the re-formatted " - "output, defaults to %(default)s") - parser.add_argument("--quiet", default=False, - action="store_true", - help="If specified, only print errors") - arguments = parser.parse_args() - - exclude_globs = [] - if arguments.exclude_globs: - with open(arguments.exclude_globs) as f: - 
exclude_globs.extend(line.strip() for line in f) - - formatted_filenames = [] - for source_dir in arguments.source_dir: - for path in lintutils.get_sources(source_dir, exclude_globs): - formatted_filenames.append(str(path)) - - if arguments.fix: - if not arguments.quiet: - print("\n".join(map(lambda x: "Formatting {}".format(x), - formatted_filenames))) - - # Break clang-format invocations into chunks: each invocation formats - # 16 files. Wait for all processes to complete - results = lintutils.run_parallel([ - [arguments.clang_format_binary, "-i"] + some - for some in lintutils.chunk(formatted_filenames, 16) - ]) - for returncode, stdout, stderr in results: - # if any clang-format reported a parse error, bubble it - if returncode != 0: - sys.exit(returncode) - - else: - # run an instance of clang-format for each source file in parallel, - # then wait for all processes to complete - results = lintutils.run_parallel([ - [arguments.clang_format_binary, filename] - for filename in formatted_filenames - ], stdout=PIPE, stderr=PIPE) - - checker_args = [] - for filename, res in zip(formatted_filenames, results): - # if any clang-format reported a parse error, bubble it - returncode, stdout, stderr = res - if returncode != 0: - print(stderr) - sys.exit(returncode) - checker_args.append((filename, stdout)) - - error = False - pool = mp.Pool() - try: - # check the output from each invocation of clang-format in parallel - for filename, diff in pool.starmap(_check_one_file, checker_args): - if not arguments.quiet: - print("Checking {}".format(filename)) - if diff: - print("{} had clang-format style issues".format(filename)) - # Print out the diff to stderr - error = True - # pad with a newline - print(file=sys.stderr) - sys.stderr.writelines(diff) - except Exception: - error = True - raise - finally: - pool.terminate() - pool.join() - sys.exit(1 if error else 0) diff --git a/cpp/build-support/run_clang_tidy.py b/cpp/build-support/run_clang_tidy.py deleted file mode 100755 
index 863c5bd70ab..00000000000 --- a/cpp/build-support/run_clang_tidy.py +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import print_function -import argparse -import multiprocessing as mp -import lintutils -from subprocess import PIPE -import sys -from functools import partial - - -def _get_chunk_key(filenames): - # lists are not hashable so key on the first filename in a chunk - return filenames[0] - - -# clang-tidy outputs complaints in '/path:line_number: complaint' format, -# so we can scan its output to get a list of files to fix -def _check_some_files(completed_processes, filenames): - result = completed_processes[_get_chunk_key(filenames)] - return lintutils.stdout_pathcolonline(result, filenames) - - -def _check_all(cmd, filenames): - # each clang-tidy instance will process 16 files - chunks = lintutils.chunk(filenames, 16) - cmds = [cmd + some for some in chunks] - results = lintutils.run_parallel(cmds, stderr=PIPE, stdout=PIPE) - error = False - # record completed processes (keyed by the first filename in the input - # chunk) for lookup in _check_some_files - completed_processes = { - _get_chunk_key(some): result - for some, result in zip(chunks, 
results) - } - checker = partial(_check_some_files, completed_processes) - pool = mp.Pool() - try: - # check output of completed clang-tidy invocations in parallel - for problem_files, stdout in pool.imap(checker, chunks): - if problem_files: - msg = "clang-tidy suggested fixes for {}" - print("\n".join(map(msg.format, problem_files))) - error = True - except Exception: - error = True - raise - finally: - pool.terminate() - pool.join() - - if error: - sys.exit(1) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Runs clang-tidy on all ") - parser.add_argument("--clang_tidy_binary", - required=True, - help="Path to the clang-tidy binary") - parser.add_argument("--exclude_globs", - help="Filename containing globs for files " - "that should be excluded from the checks") - parser.add_argument("--compile_commands", - required=True, - help="compile_commands.json to pass clang-tidy") - parser.add_argument("--source_dir", - required=True, - action="append", - help="Root directory of the source code") - parser.add_argument("--fix", default=False, - action="store_true", - help="If specified, will attempt to fix the " - "source code instead of recommending fixes, " - "defaults to %(default)s") - parser.add_argument("--quiet", default=False, - action="store_true", - help="If specified, only print errors") - arguments = parser.parse_args() - - exclude_globs = [] - if arguments.exclude_globs: - for line in open(arguments.exclude_globs): - exclude_globs.append(line.strip()) - - linted_filenames = [] - for source_dir in arguments.source_dir: - for path in lintutils.get_sources(source_dir, exclude_globs): - linted_filenames.append(path) - - if not arguments.quiet: - msg = 'Tidying {}' if arguments.fix else 'Checking {}' - print("\n".join(map(msg.format, linted_filenames))) - - cmd = [ - arguments.clang_tidy_binary, - '-p', - arguments.compile_commands - ] - if arguments.fix: - cmd.append('-fix') - results = lintutils.run_parallel( - [cmd + some for 
some in lintutils.chunk(linted_filenames, 16)]) - for returncode, stdout, stderr in results: - if returncode != 0: - sys.exit(returncode) - - else: - _check_all(cmd, linted_filenames) diff --git a/cpp/build-support/run_cpplint.py b/cpp/build-support/run_cpplint.py deleted file mode 100755 index a81acf2eb2f..00000000000 --- a/cpp/build-support/run_cpplint.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from __future__ import print_function -import lintutils -from subprocess import PIPE, STDOUT -import argparse -import multiprocessing as mp -import sys -import platform -from functools import partial - - -def _get_chunk_key(filenames): - # lists are not hashable so key on the first filename in a chunk - return filenames[0] - - -def _check_some_files(completed_processes, filenames): - # cpplint outputs complaints in '/path:line_number: complaint' format, - # so we can scan its output to get a list of files to fix - result = completed_processes[_get_chunk_key(filenames)] - return lintutils.stdout_pathcolonline(result, filenames) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Runs cpplint on all of the source files.") - parser.add_argument("--cpplint_binary", - required=True, - help="Path to the cpplint binary") - parser.add_argument("--exclude_globs", - help="Filename containing globs for files " - "that should be excluded from the checks") - parser.add_argument("--source_dir", - required=True, - action="append", - help="Root directory of the source code") - parser.add_argument("--quiet", default=False, - action="store_true", - help="If specified, only print errors") - arguments = parser.parse_args() - - exclude_globs = [] - if arguments.exclude_globs: - with open(arguments.exclude_globs) as f: - exclude_globs.extend(line.strip() for line in f) - - linted_filenames = [] - for source_dir in arguments.source_dir: - for path in lintutils.get_sources(source_dir, exclude_globs): - linted_filenames.append(str(path)) - - cmd = [ - arguments.cpplint_binary, - '--verbose=2', - ] - if (arguments.cpplint_binary.endswith('.py') and - platform.system() == 'Windows'): - # Windows doesn't support executable scripts; execute with - # sys.executable - cmd.insert(0, sys.executable) - if arguments.quiet: - cmd.append('--quiet') - else: - print("\n".join(map(lambda x: "Linting {}".format(x), - linted_filenames))) - - # lint files in chunks: each 
invocation of cpplint will process 16 files - chunks = lintutils.chunk(linted_filenames, 16) - cmds = [cmd + some for some in chunks] - results = lintutils.run_parallel(cmds, stdout=PIPE, stderr=STDOUT) - - error = False - # record completed processes (keyed by the first filename in the input - # chunk) for lookup in _check_some_files - completed_processes = { - _get_chunk_key(filenames): result - for filenames, result in zip(chunks, results) - } - checker = partial(_check_some_files, completed_processes) - pool = mp.Pool() - try: - # scan the outputs of various cpplint invocations in parallel to - # distill a list of problematic files - for problem_files, stdout in pool.imap(checker, chunks): - if problem_files: - if isinstance(stdout, bytes): - stdout = stdout.decode('utf8') - print(stdout, file=sys.stderr) - error = True - except Exception: - error = True - raise - finally: - pool.terminate() - pool.join() - - sys.exit(1 if error else 0) diff --git a/cpp/build-support/tsan-suppressions.txt b/cpp/build-support/tsan-suppressions.txt index ce897c85911..fc4a2f6ee1d 100644 --- a/cpp/build-support/tsan-suppressions.txt +++ b/cpp/build-support/tsan-suppressions.txt @@ -17,3 +17,6 @@ # Thread leak in CUDA thread:libcuda.so + +# False-positives in OpenTelemetry because of non-instrumented code. 
+race:^opentelemetry \ No newline at end of file diff --git a/cpp/build-support/update-flatbuffers.sh b/cpp/build-support/update-flatbuffers.sh index a27d947a4b0..6738f81a560 100755 --- a/cpp/build-support/update-flatbuffers.sh +++ b/cpp/build-support/update-flatbuffers.sh @@ -34,7 +34,3 @@ FILES=($(find $FORMAT_DIR -name '*.fbs')) FILES+=("$SOURCE_DIR/arrow/ipc/feather.fbs") $FLATC -o "$OUT_DIR" "${FILES[@]}" - -# Skyhook flatbuffers -$FLATC -o "$SOURCE_DIR/skyhook/protocol" \ - "$SOURCE_DIR/skyhook/protocol/ScanRequest.fbs" diff --git a/cpp/cmake_modules/AWSSDKVariables.cmake b/cpp/cmake_modules/AWSSDKVariables.cmake deleted file mode 100644 index 729790dd0f8..00000000000 --- a/cpp/cmake_modules/AWSSDKVariables.cmake +++ /dev/null @@ -1,388 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# Generated by: -# $ cpp/cmake_modules/aws_sdk_cpp_generate_variables.sh 1.10.55 - -set(AWSSDK_UNUSED_DIRECTORIES - .github - AndroidSDKTesting - CI - Docs - android-build - android-unified-tests - aws-cpp-sdk-AWSMigrationHub - aws-cpp-sdk-access-management - aws-cpp-sdk-accessanalyzer - aws-cpp-sdk-account - aws-cpp-sdk-acm - aws-cpp-sdk-acm-pca - aws-cpp-sdk-alexaforbusiness - aws-cpp-sdk-amp - aws-cpp-sdk-amplify - aws-cpp-sdk-amplifybackend - aws-cpp-sdk-amplifyuibuilder - aws-cpp-sdk-apigateway - aws-cpp-sdk-apigatewaymanagementapi - aws-cpp-sdk-apigatewayv2 - aws-cpp-sdk-appconfig - aws-cpp-sdk-appconfigdata - aws-cpp-sdk-appflow - aws-cpp-sdk-appintegrations - aws-cpp-sdk-application-autoscaling - aws-cpp-sdk-application-insights - aws-cpp-sdk-applicationcostprofiler - aws-cpp-sdk-appmesh - aws-cpp-sdk-apprunner - aws-cpp-sdk-appstream - aws-cpp-sdk-appsync - aws-cpp-sdk-arc-zonal-shift - aws-cpp-sdk-athena - aws-cpp-sdk-auditmanager - aws-cpp-sdk-autoscaling - aws-cpp-sdk-autoscaling-plans - aws-cpp-sdk-awstransfer - aws-cpp-sdk-backup - aws-cpp-sdk-backup-gateway - aws-cpp-sdk-backupstorage - aws-cpp-sdk-batch - aws-cpp-sdk-billingconductor - aws-cpp-sdk-braket - aws-cpp-sdk-budgets - aws-cpp-sdk-ce - aws-cpp-sdk-chime - aws-cpp-sdk-chime-sdk-identity - aws-cpp-sdk-chime-sdk-media-pipelines - aws-cpp-sdk-chime-sdk-meetings - aws-cpp-sdk-chime-sdk-messaging - aws-cpp-sdk-chime-sdk-voice - aws-cpp-sdk-cleanrooms - aws-cpp-sdk-cloud9 - aws-cpp-sdk-cloudcontrol - aws-cpp-sdk-clouddirectory - aws-cpp-sdk-cloudformation - aws-cpp-sdk-cloudfront - aws-cpp-sdk-cloudfront-integration-tests - aws-cpp-sdk-cloudhsm - aws-cpp-sdk-cloudhsmv2 - aws-cpp-sdk-cloudsearch - aws-cpp-sdk-cloudsearchdomain - aws-cpp-sdk-cloudtrail - aws-cpp-sdk-codeartifact - aws-cpp-sdk-codebuild - aws-cpp-sdk-codecatalyst - aws-cpp-sdk-codecommit - aws-cpp-sdk-codedeploy - aws-cpp-sdk-codeguru-reviewer - aws-cpp-sdk-codeguruprofiler - aws-cpp-sdk-codepipeline - aws-cpp-sdk-codestar - 
aws-cpp-sdk-codestar-connections - aws-cpp-sdk-codestar-notifications - aws-cpp-sdk-cognito-idp - aws-cpp-sdk-cognito-sync - aws-cpp-sdk-cognitoidentity-integration-tests - aws-cpp-sdk-comprehend - aws-cpp-sdk-comprehendmedical - aws-cpp-sdk-compute-optimizer - aws-cpp-sdk-connect - aws-cpp-sdk-connect-contact-lens - aws-cpp-sdk-connectcampaigns - aws-cpp-sdk-connectcases - aws-cpp-sdk-connectparticipant - aws-cpp-sdk-controltower - aws-cpp-sdk-cur - aws-cpp-sdk-custom-service-integration-tests - aws-cpp-sdk-customer-profiles - aws-cpp-sdk-databrew - aws-cpp-sdk-dataexchange - aws-cpp-sdk-datapipeline - aws-cpp-sdk-datasync - aws-cpp-sdk-dax - aws-cpp-sdk-detective - aws-cpp-sdk-devicefarm - aws-cpp-sdk-devops-guru - aws-cpp-sdk-directconnect - aws-cpp-sdk-discovery - aws-cpp-sdk-dlm - aws-cpp-sdk-dms - aws-cpp-sdk-docdb - aws-cpp-sdk-docdb-elastic - aws-cpp-sdk-drs - aws-cpp-sdk-ds - aws-cpp-sdk-dynamodb - aws-cpp-sdk-dynamodb-integration-tests - aws-cpp-sdk-dynamodbstreams - aws-cpp-sdk-ebs - aws-cpp-sdk-ec2 - aws-cpp-sdk-ec2-instance-connect - aws-cpp-sdk-ec2-integration-tests - aws-cpp-sdk-ecr - aws-cpp-sdk-ecr-public - aws-cpp-sdk-ecs - aws-cpp-sdk-eks - aws-cpp-sdk-elastic-inference - aws-cpp-sdk-elasticache - aws-cpp-sdk-elasticbeanstalk - aws-cpp-sdk-elasticfilesystem - aws-cpp-sdk-elasticfilesystem-integration-tests - aws-cpp-sdk-elasticloadbalancing - aws-cpp-sdk-elasticloadbalancingv2 - aws-cpp-sdk-elasticmapreduce - aws-cpp-sdk-elastictranscoder - aws-cpp-sdk-email - aws-cpp-sdk-emr-containers - aws-cpp-sdk-emr-serverless - aws-cpp-sdk-es - aws-cpp-sdk-eventbridge - aws-cpp-sdk-eventbridge-tests - aws-cpp-sdk-events - aws-cpp-sdk-evidently - aws-cpp-sdk-finspace - aws-cpp-sdk-finspace-data - aws-cpp-sdk-firehose - aws-cpp-sdk-fis - aws-cpp-sdk-fms - aws-cpp-sdk-forecast - aws-cpp-sdk-forecastquery - aws-cpp-sdk-frauddetector - aws-cpp-sdk-fsx - aws-cpp-sdk-gamelift - aws-cpp-sdk-gamesparks - aws-cpp-sdk-glacier - aws-cpp-sdk-globalaccelerator - 
aws-cpp-sdk-glue - aws-cpp-sdk-grafana - aws-cpp-sdk-greengrass - aws-cpp-sdk-greengrassv2 - aws-cpp-sdk-groundstation - aws-cpp-sdk-guardduty - aws-cpp-sdk-health - aws-cpp-sdk-healthlake - aws-cpp-sdk-honeycode - aws-cpp-sdk-iam - aws-cpp-sdk-identitystore - aws-cpp-sdk-imagebuilder - aws-cpp-sdk-importexport - aws-cpp-sdk-inspector - aws-cpp-sdk-inspector2 - aws-cpp-sdk-iot - aws-cpp-sdk-iot-data - aws-cpp-sdk-iot-jobs-data - aws-cpp-sdk-iot-roborunner - aws-cpp-sdk-iot1click-devices - aws-cpp-sdk-iot1click-projects - aws-cpp-sdk-iotanalytics - aws-cpp-sdk-iotdeviceadvisor - aws-cpp-sdk-iotevents - aws-cpp-sdk-iotevents-data - aws-cpp-sdk-iotfleethub - aws-cpp-sdk-iotfleetwise - aws-cpp-sdk-iotsecuretunneling - aws-cpp-sdk-iotsitewise - aws-cpp-sdk-iotthingsgraph - aws-cpp-sdk-iottwinmaker - aws-cpp-sdk-iotwireless - aws-cpp-sdk-ivs - aws-cpp-sdk-ivschat - aws-cpp-sdk-kafka - aws-cpp-sdk-kafkaconnect - aws-cpp-sdk-kendra - aws-cpp-sdk-kendra-ranking - aws-cpp-sdk-keyspaces - aws-cpp-sdk-kinesis - aws-cpp-sdk-kinesis-integration-tests - aws-cpp-sdk-kinesis-video-archived-media - aws-cpp-sdk-kinesis-video-media - aws-cpp-sdk-kinesis-video-signaling - aws-cpp-sdk-kinesis-video-webrtc-storage - aws-cpp-sdk-kinesisanalytics - aws-cpp-sdk-kinesisanalyticsv2 - aws-cpp-sdk-kinesisvideo - aws-cpp-sdk-kms - aws-cpp-sdk-lakeformation - aws-cpp-sdk-lambda - aws-cpp-sdk-lambda-integration-tests - aws-cpp-sdk-lex - aws-cpp-sdk-lex-models - aws-cpp-sdk-lexv2-models - aws-cpp-sdk-lexv2-runtime - aws-cpp-sdk-license-manager - aws-cpp-sdk-license-manager-linux-subscriptions - aws-cpp-sdk-license-manager-user-subscriptions - aws-cpp-sdk-lightsail - aws-cpp-sdk-location - aws-cpp-sdk-logs - aws-cpp-sdk-logs-integration-tests - aws-cpp-sdk-lookoutequipment - aws-cpp-sdk-lookoutmetrics - aws-cpp-sdk-lookoutvision - aws-cpp-sdk-m2 - aws-cpp-sdk-machinelearning - aws-cpp-sdk-macie - aws-cpp-sdk-macie2 - aws-cpp-sdk-managedblockchain - aws-cpp-sdk-marketplace-catalog - 
aws-cpp-sdk-marketplace-entitlement - aws-cpp-sdk-marketplacecommerceanalytics - aws-cpp-sdk-mediaconnect - aws-cpp-sdk-mediaconvert - aws-cpp-sdk-medialive - aws-cpp-sdk-mediapackage - aws-cpp-sdk-mediapackage-vod - aws-cpp-sdk-mediastore - aws-cpp-sdk-mediastore-data - aws-cpp-sdk-mediastore-data-integration-tests - aws-cpp-sdk-mediatailor - aws-cpp-sdk-memorydb - aws-cpp-sdk-meteringmarketplace - aws-cpp-sdk-mgn - aws-cpp-sdk-migration-hub-refactor-spaces - aws-cpp-sdk-migrationhub-config - aws-cpp-sdk-migrationhuborchestrator - aws-cpp-sdk-migrationhubstrategy - aws-cpp-sdk-mobile - aws-cpp-sdk-monitoring - aws-cpp-sdk-mq - aws-cpp-sdk-mturk-requester - aws-cpp-sdk-mwaa - aws-cpp-sdk-neptune - aws-cpp-sdk-network-firewall - aws-cpp-sdk-networkmanager - aws-cpp-sdk-nimble - aws-cpp-sdk-oam - aws-cpp-sdk-omics - aws-cpp-sdk-opensearch - aws-cpp-sdk-opensearchserverless - aws-cpp-sdk-opsworks - aws-cpp-sdk-opsworkscm - aws-cpp-sdk-organizations - aws-cpp-sdk-outposts - aws-cpp-sdk-panorama - aws-cpp-sdk-personalize - aws-cpp-sdk-personalize-events - aws-cpp-sdk-personalize-runtime - aws-cpp-sdk-pi - aws-cpp-sdk-pinpoint - aws-cpp-sdk-pinpoint-email - aws-cpp-sdk-pinpoint-sms-voice-v2 - aws-cpp-sdk-pipes - aws-cpp-sdk-polly - aws-cpp-sdk-polly-sample - aws-cpp-sdk-pricing - aws-cpp-sdk-privatenetworks - aws-cpp-sdk-proton - aws-cpp-sdk-qldb - aws-cpp-sdk-qldb-session - aws-cpp-sdk-queues - aws-cpp-sdk-quicksight - aws-cpp-sdk-ram - aws-cpp-sdk-rbin - aws-cpp-sdk-rds - aws-cpp-sdk-rds-data - aws-cpp-sdk-rds-integration-tests - aws-cpp-sdk-redshift - aws-cpp-sdk-redshift-data - aws-cpp-sdk-redshift-integration-tests - aws-cpp-sdk-redshift-serverless - aws-cpp-sdk-rekognition - aws-cpp-sdk-resiliencehub - aws-cpp-sdk-resource-explorer-2 - aws-cpp-sdk-resource-groups - aws-cpp-sdk-resourcegroupstaggingapi - aws-cpp-sdk-robomaker - aws-cpp-sdk-rolesanywhere - aws-cpp-sdk-route53 - aws-cpp-sdk-route53-recovery-cluster - aws-cpp-sdk-route53-recovery-control-config - 
aws-cpp-sdk-route53-recovery-readiness - aws-cpp-sdk-route53domains - aws-cpp-sdk-route53resolver - aws-cpp-sdk-rum - aws-cpp-sdk-sagemaker - aws-cpp-sdk-sagemaker-a2i-runtime - aws-cpp-sdk-sagemaker-edge - aws-cpp-sdk-sagemaker-featurestore-runtime - aws-cpp-sdk-sagemaker-geospatial - aws-cpp-sdk-sagemaker-metrics - aws-cpp-sdk-sagemaker-runtime - aws-cpp-sdk-savingsplans - aws-cpp-sdk-scheduler - aws-cpp-sdk-schemas - aws-cpp-sdk-sdb - aws-cpp-sdk-secretsmanager - aws-cpp-sdk-securityhub - aws-cpp-sdk-securitylake - aws-cpp-sdk-serverlessrepo - aws-cpp-sdk-service-quotas - aws-cpp-sdk-servicecatalog - aws-cpp-sdk-servicecatalog-appregistry - aws-cpp-sdk-servicediscovery - aws-cpp-sdk-sesv2 - aws-cpp-sdk-shield - aws-cpp-sdk-signer - aws-cpp-sdk-simspaceweaver - aws-cpp-sdk-sms - aws-cpp-sdk-sms-voice - aws-cpp-sdk-snow-device-management - aws-cpp-sdk-snowball - aws-cpp-sdk-sns - aws-cpp-sdk-sqs - aws-cpp-sdk-sqs-integration-tests - aws-cpp-sdk-ssm - aws-cpp-sdk-ssm-contacts - aws-cpp-sdk-ssm-incidents - aws-cpp-sdk-ssm-sap - aws-cpp-sdk-sso - aws-cpp-sdk-sso-admin - aws-cpp-sdk-sso-oidc - aws-cpp-sdk-states - aws-cpp-sdk-storagegateway - aws-cpp-sdk-support - aws-cpp-sdk-support-app - aws-cpp-sdk-swf - aws-cpp-sdk-synthetics - aws-cpp-sdk-text-to-speech - aws-cpp-sdk-text-to-speech-tests - aws-cpp-sdk-textract - aws-cpp-sdk-timestream-query - aws-cpp-sdk-timestream-write - aws-cpp-sdk-transcribe - aws-cpp-sdk-transcribestreaming - aws-cpp-sdk-transcribestreaming-integration-tests - aws-cpp-sdk-translate - aws-cpp-sdk-voice-id - aws-cpp-sdk-waf - aws-cpp-sdk-waf-regional - aws-cpp-sdk-wafv2 - aws-cpp-sdk-wellarchitected - aws-cpp-sdk-wisdom - aws-cpp-sdk-workdocs - aws-cpp-sdk-worklink - aws-cpp-sdk-workmail - aws-cpp-sdk-workmailmessageflow - aws-cpp-sdk-workspaces - aws-cpp-sdk-workspaces-web - aws-cpp-sdk-xray - code-generation - crt - doc_crosslinks - doc_crosslinks_new - doxygen - generated - scripts - testing-resources) diff --git 
a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 90839cb4462..db760400f7c 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -65,12 +65,6 @@ function(add_thirdparty_lib LIB_NAME LIB_TYPE LIB) endif() endfunction() -function(REUSE_PRECOMPILED_HEADER_LIB TARGET_NAME LIB_NAME) - if(ARROW_USE_PRECOMPILED_HEADERS) - target_precompile_headers(${TARGET_NAME} REUSE_FROM ${LIB_NAME}) - endif() -endfunction() - # Based on MIT-licensed # https://gist.github.com/cristianadam/ef920342939a89fae3e8a85ca9459b49 function(arrow_create_merged_static_lib output_target) @@ -169,7 +163,7 @@ function(arrow_create_merged_static_lib output_target) message(STATUS "Creating bundled static library target ${output_target} at ${output_lib_path}" ) - add_library(${output_target} STATIC IMPORTED) + add_library(${output_target} STATIC IMPORTED GLOBAL) set_target_properties(${output_target} PROPERTIES IMPORTED_LOCATION ${output_lib_path}) add_dependencies(${output_target} ${output_target}_merge) endfunction() @@ -203,11 +197,9 @@ function(ADD_ARROW_LIB LIB_NAME) INSTALL_LIBRARY_DIR INSTALL_RUNTIME_DIR PKG_CONFIG_NAME - PRECOMPILED_HEADER_LIB SHARED_LINK_FLAGS) set(multi_value_args SOURCES - PRECOMPILED_HEADERS OUTPUTS STATIC_LINK_LIBS SHARED_LINK_LIBS @@ -273,12 +265,7 @@ function(ADD_ARROW_LIB LIB_NAME) if(ARG_DEFINITIONS) target_compile_definitions(${LIB_NAME}_objlib PRIVATE ${ARG_DEFINITIONS}) endif() - if(ARG_PRECOMPILED_HEADER_LIB) - reuse_precompiled_header_lib(${LIB_NAME}_objlib ${ARG_PRECOMPILED_HEADER_LIB}) - endif() - if(ARG_PRECOMPILED_HEADERS AND ARROW_USE_PRECOMPILED_HEADERS) - target_precompile_headers(${LIB_NAME}_objlib PRIVATE ${ARG_PRECOMPILED_HEADERS}) - endif() + target_compile_options(${LIB_NAME}_objlib PRIVATE ${ARROW_LIBRARIES_ONLY_CXX_FLAGS}) set(LIB_DEPS $) set(EXTRA_DEPS) @@ -305,7 +292,6 @@ function(ADD_ARROW_LIB LIB_NAME) endif() else() # Prepare arguments for separate compilation of static and shared 
libs below - # TODO: add PCH directives set(LIB_DEPS ${ARG_SOURCES}) set(EXTRA_DEPS ${ARG_DEPENDENCIES}) endif() @@ -341,10 +327,7 @@ function(ADD_ARROW_LIB LIB_NAME) if(ARG_DEFINITIONS) target_compile_definitions(${LIB_NAME}_shared PRIVATE ${ARG_DEFINITIONS}) endif() - - if(ARG_PRECOMPILED_HEADER_LIB) - reuse_precompiled_header_lib(${LIB_NAME}_shared ${ARG_PRECOMPILED_HEADER_LIB}) - endif() + target_compile_options(${LIB_NAME}_shared PRIVATE ${ARROW_LIBRARIES_ONLY_CXX_FLAGS}) if(ARG_OUTPUTS) list(APPEND ${ARG_OUTPUTS} ${LIB_NAME}_shared) @@ -435,10 +418,7 @@ function(ADD_ARROW_LIB LIB_NAME) if(ARG_DEFINITIONS) target_compile_definitions(${LIB_NAME}_static PRIVATE ${ARG_DEFINITIONS}) endif() - - if(ARG_PRECOMPILED_HEADER_LIB) - reuse_precompiled_header_lib(${LIB_NAME}_static ${ARG_PRECOMPILED_HEADER_LIB}) - endif() + target_compile_options(${LIB_NAME}_static PRIVATE ${ARROW_LIBRARIES_ONLY_CXX_FLAGS}) if(ARG_OUTPUTS) list(APPEND ${ARG_OUTPUTS} ${LIB_NAME}_static) @@ -539,13 +519,13 @@ endfunction() # group names must exist function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(options) - set(one_value_args) + set(one_value_args PREFIX) set(multi_value_args EXTRA_LINK_LIBS STATIC_LINK_LIBS DEPENDENCIES SOURCES - PREFIX + EXTRA_SOURCES LABELS) cmake_parse_arguments(ARG "${options}" @@ -565,10 +545,16 @@ function(ADD_BENCHMARK REL_BENCHMARK_NAME) set(BENCHMARK_NAME "${ARG_PREFIX}-${BENCHMARK_NAME}") endif() + set(SOURCES "") + + if(ARG_EXTRA_SOURCES) + list(APPEND SOURCES ${ARG_EXTRA_SOURCES}) + endif() + if(ARG_SOURCES) - set(SOURCES ${ARG_SOURCES}) + list(APPEND SOURCES ${ARG_SOURCES}) else() - set(SOURCES "${REL_BENCHMARK_NAME}.cc") + list(APPEND SOURCES "${REL_BENCHMARK_NAME}.cc") endif() # Make sure the executable name contains only hyphens, not underscores @@ -576,7 +562,7 @@ function(ADD_BENCHMARK REL_BENCHMARK_NAME) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc) # This benchmark has a corresponding .cc file, set it up as an executable. 
- set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") + set(BENCHMARK_PATH "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${BENCHMARK_NAME}") add_executable(${BENCHMARK_NAME} ${SOURCES}) if(ARG_STATIC_LINK_LIBS) @@ -605,7 +591,8 @@ function(ADD_BENCHMARK REL_BENCHMARK_NAME) PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH TRUE INSTALL_RPATH - "$ENV{CONDA_PREFIX}/lib;${EXECUTABLE_OUTPUT_PATH}") + "$ENV{CONDA_PREFIX}/lib;${CMAKE_RUNTIME_OUTPUT_DIRECTORY}" + ) endif() # Add test as dependency of relevant label targets @@ -668,10 +655,9 @@ endfunction() # names must exist function(ADD_TEST_CASE REL_TEST_NAME) set(options NO_VALGRIND ENABLED) - set(one_value_args PRECOMPILED_HEADER_LIB) + set(one_value_args PREFIX) set(multi_value_args SOURCES - PRECOMPILED_HEADERS STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES @@ -679,7 +665,6 @@ function(ADD_TEST_CASE REL_TEST_NAME) LABELS EXTRA_LABELS TEST_ARGUMENTS - PREFIX DEFINITIONS) cmake_parse_arguments(ARG "${options}" @@ -708,7 +693,7 @@ function(ADD_TEST_CASE REL_TEST_NAME) # Make sure the executable name contains only hyphens, not underscores string(REPLACE "_" "-" TEST_NAME ${TEST_NAME}) - set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}") + set(TEST_PATH "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TEST_NAME}") add_executable(${TEST_NAME} ${SOURCES}) # With OSX and conda, we need to set the correct RPATH so that dependencies @@ -721,7 +706,8 @@ function(ADD_TEST_CASE REL_TEST_NAME) PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH TRUE INSTALL_RPATH - "${EXECUTABLE_OUTPUT_PATH};$ENV{CONDA_PREFIX}/lib") + "${CMAKE_RUNTIME_OUTPUT_DIRECTORY};$ENV{CONDA_PREFIX}/lib" + ) endif() # Ensure using bundled GoogleTest when we use bundled GoogleTest. 
@@ -736,14 +722,6 @@ function(ADD_TEST_CASE REL_TEST_NAME) target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_TEST_LINK_LIBS}) endif() - if(ARG_PRECOMPILED_HEADER_LIB) - reuse_precompiled_header_lib(${TEST_NAME} ${ARG_PRECOMPILED_HEADER_LIB}) - endif() - - if(ARG_PRECOMPILED_HEADERS AND ARROW_USE_PRECOMPILED_HEADERS) - target_precompile_headers(${TEST_NAME} PRIVATE ${ARG_PRECOMPILED_HEADERS}) - endif() - if(ARG_EXTRA_LINK_LIBS) target_link_libraries(${TEST_NAME} PRIVATE ${ARG_EXTRA_LINK_LIBS}) endif() @@ -835,13 +813,8 @@ endfunction() # create test executable foo-bar-example function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) set(options) - set(one_value_args) - set(multi_value_args - EXTRA_INCLUDES - EXTRA_LINK_LIBS - EXTRA_SOURCES - DEPENDENCIES - PREFIX) + set(one_value_args PREFIX) + set(multi_value_args EXTRA_INCLUDES EXTRA_LINK_LIBS EXTRA_SOURCES DEPENDENCIES) cmake_parse_arguments(ARG "${options}" "${one_value_args}" @@ -865,7 +838,7 @@ function(ADD_ARROW_EXAMPLE REL_EXAMPLE_NAME) if(EXISTS ${CMAKE_SOURCE_DIR}/examples/arrow/${REL_EXAMPLE_NAME}.cc) # This example has a corresponding .cc file, set it up as an executable. - set(EXAMPLE_PATH "${EXECUTABLE_OUTPUT_PATH}/${EXAMPLE_NAME}") + set(EXAMPLE_PATH "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${EXAMPLE_NAME}") add_executable(${EXAMPLE_NAME} "${REL_EXAMPLE_NAME}.cc" ${ARG_EXTRA_SOURCES}) target_link_libraries(${EXAMPLE_NAME} ${ARROW_EXAMPLE_LINK_LIBS}) add_dependencies(runexample ${EXAMPLE_NAME}) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 16360929a9b..4f0981ef1c1 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -107,6 +107,10 @@ macro(tsort_bool_option_dependencies) endmacro() macro(resolve_option_dependencies) + # Arrow Flight SQL ODBC is available only for Windows for now. 
+ if(NOT MSVC_TOOLCHAIN) + set(ARROW_FLIGHT_SQL_ODBC OFF) + endif() if(MSVC_TOOLCHAIN) set(ARROW_USE_GLOG OFF) endif() @@ -172,9 +176,6 @@ takes precedence over ccache if a storage backend is configured" ON) define_option(ARROW_USE_MOLD "Use mold for linking on Linux (if available)" OFF) - define_option(ARROW_USE_PRECOMPILED_HEADERS "Use precompiled headers when compiling" - OFF) - define_option_string(ARROW_SIMD_LEVEL "Compile-time SIMD optimization level" "DEFAULT" # default to SSE4_2 on x86, NEON on Arm, NONE otherwise @@ -212,7 +213,7 @@ takes precedence over ccache if a storage backend is configured" ON) define_option(ARROW_ENABLE_THREADING "Enable threading in Arrow core" ON) #---------------------------------------------------------------------- - set_option_category("Test and benchmark") + set_option_category("Tests and benchmarks") define_option(ARROW_BUILD_EXAMPLES "Build the Arrow examples" OFF) @@ -258,22 +259,25 @@ takes precedence over ccache if a storage backend is configured" ON) "shared" "static") - define_option(ARROW_FUZZING - "Build Arrow Fuzzing executables" + define_option(ARROW_BUILD_FUZZING_UTILITIES + "Build command line utilities for fuzzing" OFF DEPENDS ARROW_TESTING - ARROW_WITH_BROTLI) + ARROW_WITH_BROTLI + ARROW_WITH_LZ4 + ARROW_WITH_ZSTD) + + define_option(ARROW_FUZZING + "Build Arrow fuzz targets" + OFF + DEPENDS + ARROW_BUILD_FUZZING_UTILITIES) define_option(ARROW_LARGE_MEMORY_TESTS "Enable unit tests which use large memory" OFF) #---------------------------------------------------------------------- - set_option_category("Lint") - - define_option(ARROW_ONLY_LINT "Only define the lint and check-format targets" OFF) - - define_option(ARROW_VERBOSE_LINT - "If off, 'quiet' flags will be passed to linting tools" OFF) + set_option_category("Coverage") define_option(ARROW_GENERATE_COVERAGE "Build with C++ code coverage enabled" OFF) @@ -305,7 +309,7 @@ takes precedence over ccache if a storage backend is configured" ON) DEPENDS 
ARROW_FILESYSTEM) - define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF) + define_option(ARROW_BUILD_UTILITIES "Build Arrow command line utilities" OFF) define_option(ARROW_COMPUTE "Build all Arrow Compute kernels" OFF) @@ -338,6 +342,13 @@ takes precedence over ccache if a storage backend is configured" ON) DEPENDS ARROW_FLIGHT) + define_option(ARROW_FLIGHT_SQL_ODBC + "Build the Arrow Flight SQL ODBC extension" + OFF + DEPENDS + ARROW_FLIGHT_SQL + ARROW_COMPUTE) + define_option(ARROW_GANDIVA "Build the Gandiva libraries" OFF @@ -404,15 +415,6 @@ takes precedence over ccache if a storage backend is configured" ON) DEPENDS ARROW_S3) - define_option(ARROW_SKYHOOK - "Build the Skyhook libraries" - OFF - DEPENDS - ARROW_DATASET - ARROW_PARQUET - ARROW_WITH_LZ4 - ARROW_WITH_SNAPPY) - define_option(ARROW_SUBSTRAIT "Build the Arrow Substrait Consumer Module" OFF @@ -593,10 +595,6 @@ takes precedence over ccache if a storage backend is configured" ON) #---------------------------------------------------------------------- set_option_category("Parquet") - define_option(PARQUET_MINIMAL_DEPENDENCY - "Depend only on Thirdparty headers to build libparquet.;\ -Always OFF if building binaries" OFF) - define_option(PARQUET_BUILD_EXECUTABLES "Build the Parquet executable CLI tools. Requires static libraries to be built." 
OFF) diff --git a/cpp/cmake_modules/FindBrotliAlt.cmake b/cpp/cmake_modules/FindBrotliAlt.cmake index 3c90329be96..aa749566486 100644 --- a/cpp/cmake_modules/FindBrotliAlt.cmake +++ b/cpp/cmake_modules/FindBrotliAlt.cmake @@ -21,7 +21,7 @@ if(BrotliAlt_FOUND) return() endif() -if(ARROW_PACKAGE_KIND STREQUAL "vcpkg" OR ARROW_PACKAGE_KIND STREQUAL "conan") +if(ARROW_VCPKG OR ARROW_PACKAGE_KIND STREQUAL "conan") set(find_package_args "") if(BrotliAlt_FIND_VERSION) list(APPEND find_package_args ${BrotliAlt_FIND_VERSION}) @@ -32,14 +32,14 @@ if(ARROW_PACKAGE_KIND STREQUAL "vcpkg" OR ARROW_PACKAGE_KIND STREQUAL "conan") if(BrotliAlt_FIND_REQUIRED) list(APPEND find_package_args REQUIRED) endif() - if(ARROW_PACKAGE_KIND STREQUAL "vcpkg") + if(ARROW_VCPKG) find_package(BrotliAlt NAMES unofficial-brotli ${find_package_args}) else() find_package(BrotliAlt NAMES brotli ${find_package_args}) endif() set(Brotli_FOUND ${BrotliAlt_FOUND}) if(BrotliAlt_FOUND) - if(ARROW_PACKAGE_KIND STREQUAL "vcpkg") + if(ARROW_VCPKG) add_library(Brotli::brotlicommon ALIAS unofficial::brotli::brotlicommon) add_library(Brotli::brotlienc ALIAS unofficial::brotli::brotlienc) add_library(Brotli::brotlidec ALIAS unofficial::brotli::brotlidec) diff --git a/cpp/cmake_modules/FindSnappyAlt.cmake b/cpp/cmake_modules/FindSnappyAlt.cmake index 4d313400647..d0a06f0997a 100644 --- a/cpp/cmake_modules/FindSnappyAlt.cmake +++ b/cpp/cmake_modules/FindSnappyAlt.cmake @@ -19,6 +19,18 @@ if(SnappyAlt_FOUND) return() endif() +if(ARROW_SNAPPY_USE_SHARED) + if(TARGET Snappy::snappy) + set(Snappy_TARGET Snappy::snappy) + set(SnappyAlt_FOUND TRUE) + return() + elseif(TARGET Snappy::snappy-static) + set(Snappy_TARGET Snappy::snappy-static) + set(SnappyAlt_FOUND TRUE) + return() + endif() +endif() + set(find_package_args) if(SnappyAlt_FIND_VERSION) list(APPEND find_package_args ${SnappyAlt_FIND_VERSION}) diff --git a/cpp/cmake_modules/Findutf8proc.cmake b/cpp/cmake_modules/Findutf8proc.cmake index 9721f76f063..75d459d0ec7 
100644 --- a/cpp/cmake_modules/Findutf8proc.cmake +++ b/cpp/cmake_modules/Findutf8proc.cmake @@ -19,7 +19,7 @@ if(utf8proc_FOUND) return() endif() -if(ARROW_PACKAGE_KIND STREQUAL "vcpkg" OR VCPKG_TOOLCHAIN) +if(ARROW_VCPKG) set(find_package_args "") if(utf8proc_FIND_VERSION) list(APPEND find_package_args ${utf8proc_FIND_VERSION}) diff --git a/cpp/cmake_modules/GandivaAddBitcode.cmake b/cpp/cmake_modules/GandivaAddBitcode.cmake index 98847f8a186..6b5e5b3e60c 100644 --- a/cpp/cmake_modules/GandivaAddBitcode.cmake +++ b/cpp/cmake_modules/GandivaAddBitcode.cmake @@ -71,5 +71,5 @@ function(gandiva_add_bitcode SOURCE) endif() add_custom_command(OUTPUT ${BC_FILE} COMMAND ${PRECOMPILE_COMMAND} - DEPENDS ${SOURCE_FILE}) + DEPENDS ${SOURCE}) endfunction() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index fdb28b540e2..afc0446a780 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -49,7 +49,11 @@ endif() if(ARROW_CPU_FLAG STREQUAL "x86") # x86/amd64 compiler flags, msvc/gcc/clang if(MSVC) - set(ARROW_SSE4_2_FLAG "") + set(ARROW_SSE4_2_FLAG "/arch:SSE4.2") + # These definitions are needed for xsimd to consider the corresponding instruction + # sets available, but they are not set by MSVC (unlike other compilers). + # See https://github.com/AcademySoftwareFoundation/OpenImageIO/issues/4265 + add_definitions(-D__SSE2__ -D__SSE4_1__ -D__SSE4_2__) set(ARROW_AVX2_FLAG "/arch:AVX2") # MSVC has no specific flag for BMI2, it seems to be enabled with AVX2 set(ARROW_BMI2_FLAG "/arch:AVX2") @@ -152,11 +156,12 @@ set(CMAKE_CXX_EXTENSIONS OFF) # shared libraries set(CMAKE_POSITION_INDEPENDENT_CODE ${ARROW_POSITION_INDEPENDENT_CODE}) -string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) - set(UNKNOWN_COMPILER_MESSAGE "Unknown compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +# Compiler flags used when building Arrow libraries (but not tests, utilities, etc.) 
+set(ARROW_LIBRARIES_ONLY_CXX_FLAGS) + # compiler flags that are common across debug/release builds if(WIN32) # TODO(wesm): Change usages of C runtime functions that MSVC says are @@ -265,7 +270,7 @@ endif() # `RELEASE`, then it will default to `PRODUCTION`. The goal of defaulting to # `CHECKIN` is to avoid friction with long response time from CI. if(NOT BUILD_WARNING_LEVEL) - if("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + if("${UPPERCASE_BUILD_TYPE}" STREQUAL "RELEASE") set(BUILD_WARNING_LEVEL PRODUCTION) else() set(BUILD_WARNING_LEVEL CHECKIN) @@ -294,8 +299,9 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /wd4365") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /wd4267") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /wd4838") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL - "Clang") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "IBMClang") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wextra") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdocumentation") @@ -319,6 +325,9 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wimplicit-fallthrough") string(APPEND CXX_ONLY_FLAGS " -Wredundant-move") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wunused-result") + # Flag non-static functions that don't have corresponding declaration in a .h file. + # Only for Arrow libraries, since this is not a problem in tests or utilities. 
+ list(APPEND ARROW_LIBRARIES_ONLY_CXX_FLAGS "-Wmissing-declarations") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") if(WIN32) @@ -602,7 +611,7 @@ if(NOT WIN32 AND NOT APPLE) if(MUST_USE_GOLD) message(STATUS "Using hard-wired gold linker (version ${GOLD_VERSION})") if(ARROW_BUGGY_GOLD) - if("${ARROW_LINK}" STREQUAL "d" AND "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + if("${ARROW_LINK}" STREQUAL "d" AND "${UPPERCASE_BUILD_TYPE}" STREQUAL "RELEASE") message(SEND_ERROR "Configured to use buggy gold with dynamic linking " "in a RELEASE build") endif() @@ -808,7 +817,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(CMAKE_SHARED_LINKER_FLAGS "-sSIDE_MODULE=1 ${ARROW_EMSCRIPTEN_LINKER_FLAGS}") if(ARROW_TESTING) # flags for building test executables for use in node - if("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + if("${UPPERCASE_BUILD_TYPE}" STREQUAL "RELEASE") set(CMAKE_EXE_LINKER_FLAGS "${ARROW_EMSCRIPTEN_LINKER_FLAGS} -sALLOW_MEMORY_GROWTH -lnodefs.js -lnoderawfs.js --pre-js ${BUILD_SUPPORT_DIR}/emscripten-test-init.js" ) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 1af347914fa..7b8cef5fb5e 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -374,16 +374,13 @@ target_include_directories(arrow::flatbuffers # ---------------------------------------------------------------------- # Some EP's require other EP's -if(PARQUET_REQUIRE_ENCRYPTION) - set(ARROW_JSON ON) -endif() - if(ARROW_WITH_OPENTELEMETRY) set(ARROW_WITH_NLOHMANN_JSON ON) set(ARROW_WITH_PROTOBUF ON) endif() if(ARROW_PARQUET) + set(ARROW_WITH_RAPIDJSON ON) set(ARROW_WITH_THRIFT ON) endif() @@ -410,7 +407,7 @@ if(ARROW_AZURE) set(ARROW_WITH_AZURE_SDK ON) endif() -if(ARROW_JSON) +if(ARROW_JSON OR ARROW_FLIGHT_SQL_ODBC) set(ARROW_WITH_RAPIDJSON ON) endif() @@ -601,17 +598,8 @@ endif() if(DEFINED ENV{ARROW_BOOST_URL}) set(BOOST_SOURCE_URL 
"$ENV{ARROW_BOOST_URL}") else() - string(REPLACE "." "_" ARROW_BOOST_BUILD_VERSION_UNDERSCORES - ${ARROW_BOOST_BUILD_VERSION}) set_urls(BOOST_SOURCE_URL - # These are trimmed boost bundles we maintain. - # See cpp/build-support/trim-boost.sh - # FIXME(ARROW-6407) automate uploading this archive to ensure it reflects - # our currently used packages and doesn't fall out of sync with - # ${ARROW_BOOST_BUILD_VERSION_UNDERSCORES} - "${THIRDPARTY_MIRROR_URL}/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" - "https://boostorg.jfrog.io/artifactory/main/release/${ARROW_BOOST_BUILD_VERSION}/source/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" - "https://sourceforge.net/projects/boost/files/boost/${ARROW_BOOST_BUILD_VERSION}/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" + "https://github.com/boostorg/boost/releases/download/boost-${ARROW_BOOST_BUILD_VERSION}/boost-${ARROW_BOOST_BUILD_VERSION}-cmake.tar.gz" ) endif() @@ -1017,7 +1005,7 @@ endif() set(MAKE_BUILD_ARGS "-j${NPROC}") include(FetchContent) -set(FC_DECLARE_COMMON_OPTIONS) +set(FC_DECLARE_COMMON_OPTIONS SYSTEM) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.28) list(APPEND FC_DECLARE_COMMON_OPTIONS EXCLUDE_FROM_ALL TRUE) endif() @@ -1025,13 +1013,26 @@ endif() macro(prepare_fetchcontent) set(BUILD_SHARED_LIBS OFF) set(BUILD_STATIC_LIBS ON) - set(CMAKE_COMPILE_WARNING_AS_ERROR FALSE) - set(CMAKE_EXPORT_NO_PACKAGE_REGISTRY TRUE) + set(BUILD_TESTING OFF) + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "") + set(CMAKE_COMPILE_WARNING_AS_ERROR OFF) + set(CMAKE_EXPORT_NO_PACKAGE_REGISTRY ON) + set(CMAKE_EXPORT_PACKAGE_REGISTRY OFF) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "") set(CMAKE_MACOSX_RPATH ${ARROW_INSTALL_NAME_RPATH}) # We set CMAKE_POLICY_VERSION_MINIMUM temporarily due to failures with CMake 4 # We should remove it once we have updated the dependencies: # https://github.com/apache/arrow/issues/45985 set(CMAKE_POLICY_VERSION_MINIMUM 3.5) + # Use "NEW" for CMP0077 by default. 
+ # + # https://cmake.org/cmake/help/latest/policy/CMP0077.html + # + # option() honors normal variables. + set(CMAKE_POLICY_DEFAULT_CMP0077 + NEW + CACHE STRING "") + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "") if(MSVC) string(REPLACE "/WX" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") @@ -1053,119 +1054,132 @@ endif() # ---------------------------------------------------------------------- # Add Boost dependencies (code adapted from Apache Kudu) -macro(build_boost) - set(BOOST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/boost_ep-prefix/src/boost_ep") +function(build_boost) + list(APPEND CMAKE_MESSAGE_INDENT "Boost: ") + message(STATUS "Building from source") - # This is needed by the thrift_ep build - set(BOOST_ROOT ${BOOST_PREFIX}) - set(Boost_INCLUDE_DIR "${BOOST_PREFIX}") + fetchcontent_declare(boost + ${FC_DECLARE_COMMON_OPTIONS} OVERRIDE_FIND_PACKAGE + URL ${BOOST_SOURCE_URL} + URL_HASH "SHA256=${ARROW_BOOST_BUILD_SHA256_CHECKSUM}") - if(ARROW_BOOST_REQUIRE_LIBRARY) - set(BOOST_LIB_DIR "${BOOST_PREFIX}/stage/lib") - set(BOOST_BUILD_LINK "static") - if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(BOOST_BUILD_VARIANT "debug") - else() - set(BOOST_BUILD_VARIANT "release") + prepare_fetchcontent() + set(BOOST_ENABLE_COMPATIBILITY_TARGETS ON) + set(BOOST_EXCLUDE_LIBRARIES) + set(BOOST_INCLUDE_LIBRARIES + ${ARROW_BOOST_COMPONENTS} + ${ARROW_BOOST_OPTIONAL_COMPONENTS} + algorithm + crc + numeric/conversion + scope_exit + throw_exception + tokenizer) + if(ARROW_TESTING + OR ARROW_GANDIVA + OR (NOT ARROW_USE_NATIVE_INT128)) + set(ARROW_BOOST_NEED_MULTIPRECISION TRUE) + else() + set(ARROW_BOOST_NEED_MULTIPRECISION FALSE) + endif() + if(ARROW_ENABLE_THREADING) + if(ARROW_WITH_THRIFT OR (ARROW_FLIGHT_SQL_ODBC AND MSVC)) + list(APPEND BOOST_INCLUDE_LIBRARIES locale) endif() - if(MSVC) - set(BOOST_CONFIGURE_COMMAND ".\\\\bootstrap.bat") - else() - set(BOOST_CONFIGURE_COMMAND "./bootstrap.sh") + if(ARROW_BOOST_NEED_MULTIPRECISION) + list(APPEND BOOST_INCLUDE_LIBRARIES 
multiprecision) endif() + list(APPEND BOOST_INCLUDE_LIBRARIES thread) + else() + list(APPEND + BOOST_EXCLUDE_LIBRARIES + asio + container + date_time + lexical_cast + locale + lockfree + math + thread) + endif() + if(ARROW_WITH_THRIFT) + list(APPEND BOOST_INCLUDE_LIBRARIES uuid) + else() + list(APPEND BOOST_EXCLUDE_LIBRARIES uuid) + endif() + set(BOOST_SKIP_INSTALL_RULES ON) + if(NOT ARROW_ENABLE_THREADING) + set(BOOST_UUID_LINK_LIBATOMIC OFF) + endif() + if(MSVC) + string(APPEND CMAKE_C_FLAGS " /EHsc") + string(APPEND CMAKE_CXX_FLAGS " /EHsc") + else() + # This is for https://github.com/boostorg/container/issues/305 + string(APPEND CMAKE_C_FLAGS " -Wno-strict-prototypes") + endif() + set(CMAKE_UNITY_BUILD OFF) - set(BOOST_BUILD_WITH_LIBRARIES "filesystem" "system") - string(REPLACE ";" "," BOOST_CONFIGURE_LIBRARIES "${BOOST_BUILD_WITH_LIBRARIES}") - list(APPEND BOOST_CONFIGURE_COMMAND "--prefix=${BOOST_PREFIX}" - "--with-libraries=${BOOST_CONFIGURE_LIBRARIES}") - set(BOOST_BUILD_COMMAND "./b2" "-j${NPROC}" "link=${BOOST_BUILD_LINK}" - "variant=${BOOST_BUILD_VARIANT}") - if(MSVC) - string(REGEX REPLACE "([0-9])$" ".\\1" BOOST_TOOLSET_MSVC_VERSION - ${MSVC_TOOLSET_VERSION}) - list(APPEND BOOST_BUILD_COMMAND "toolset=msvc-${BOOST_TOOLSET_MSVC_VERSION}") - set(BOOST_BUILD_WITH_LIBRARIES_MSVC) - foreach(_BOOST_LIB ${BOOST_BUILD_WITH_LIBRARIES}) - list(APPEND BOOST_BUILD_WITH_LIBRARIES_MSVC "--with-${_BOOST_LIB}") - endforeach() - list(APPEND BOOST_BUILD_COMMAND ${BOOST_BUILD_WITH_LIBRARIES_MSVC}) - else() - list(APPEND BOOST_BUILD_COMMAND "cxxflags=-fPIC") - endif() + fetchcontent_makeavailable(boost) - if(MSVC) - string(REGEX - REPLACE "^([0-9]+)\\.([0-9]+)\\.[0-9]+$" "\\1_\\2" - ARROW_BOOST_BUILD_VERSION_NO_MICRO_UNDERSCORE - ${ARROW_BOOST_BUILD_VERSION}) - set(BOOST_LIBRARY_SUFFIX "-vc${MSVC_TOOLSET_VERSION}-mt") - if(BOOST_BUILD_VARIANT STREQUAL "debug") - set(BOOST_LIBRARY_SUFFIX "${BOOST_LIBRARY_SUFFIX}-gd") - endif() - set(BOOST_LIBRARY_SUFFIX - 
"${BOOST_LIBRARY_SUFFIX}-x64-${ARROW_BOOST_BUILD_VERSION_NO_MICRO_UNDERSCORE}") + set(boost_include_dirs) + foreach(library ${BOOST_INCLUDE_LIBRARIES}) + # boost_numeric/conversion -> + # boost_numeric_conversion + string(REPLACE "/" "_" target_name "boost_${library}") + target_link_libraries(${target_name} INTERFACE Boost::disable_autolinking) + list(APPEND boost_include_dirs + $) + endforeach() + target_link_libraries(boost_headers + INTERFACE Boost::algorithm + Boost::crc + Boost::numeric_conversion + Boost::scope_exit + Boost::throw_exception + Boost::tokenizer) + target_compile_definitions(boost_mpl INTERFACE "BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS") + + if(ARROW_BOOST_NEED_MULTIPRECISION) + if(ARROW_ENABLE_THREADING) + target_link_libraries(boost_headers INTERFACE Boost::multiprecision) else() - set(BOOST_LIBRARY_SUFFIX "") - endif() - set(BOOST_STATIC_SYSTEM_LIBRARY - "${BOOST_LIB_DIR}/libboost_system${BOOST_LIBRARY_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set(BOOST_STATIC_FILESYSTEM_LIBRARY - "${BOOST_LIB_DIR}/libboost_filesystem${BOOST_LIBRARY_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set(BOOST_SYSTEM_LIBRARY boost_system_static) - set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) - set(BOOST_BUILD_PRODUCTS ${BOOST_STATIC_SYSTEM_LIBRARY} - ${BOOST_STATIC_FILESYSTEM_LIBRARY}) - - add_thirdparty_lib(Boost::system - STATIC - "${BOOST_STATIC_SYSTEM_LIBRARY}" - INCLUDE_DIRECTORIES - "${Boost_INCLUDE_DIR}") - add_thirdparty_lib(Boost::filesystem - STATIC - "${BOOST_STATIC_FILESYSTEM_LIBRARY}" - INCLUDE_DIRECTORIES - "${Boost_INCLUDE_DIR}") - - externalproject_add(boost_ep - ${EP_COMMON_OPTIONS} - URL ${BOOST_SOURCE_URL} - URL_HASH "SHA256=${ARROW_BOOST_BUILD_SHA256_CHECKSUM}" - BUILD_BYPRODUCTS ${BOOST_BUILD_PRODUCTS} - BUILD_IN_SOURCE 1 - CONFIGURE_COMMAND ${BOOST_CONFIGURE_COMMAND} - BUILD_COMMAND ${BOOST_BUILD_COMMAND} - INSTALL_COMMAND "") - add_dependencies(Boost::system boost_ep) - add_dependencies(Boost::filesystem boost_ep) - else() - 
externalproject_add(boost_ep - ${EP_COMMON_OPTIONS} - BUILD_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - URL ${BOOST_SOURCE_URL} - URL_HASH "SHA256=${ARROW_BOOST_BUILD_SHA256_CHECKSUM}") - endif() - add_library(Boost::headers INTERFACE IMPORTED) - target_include_directories(Boost::headers INTERFACE "${Boost_INCLUDE_DIR}") - add_dependencies(Boost::headers boost_ep) - # If Boost is found but one of system or filesystem components aren't found, - # Boost::disable_autolinking and Boost::dynamic_linking are already defined. - if(NOT TARGET Boost::disable_autolinking) - add_library(Boost::disable_autolinking INTERFACE IMPORTED) - if(WIN32) - target_compile_definitions(Boost::disable_autolinking INTERFACE "BOOST_ALL_NO_LIB") + # We want to use Boost.multiprecision as standalone mode + # without threading because non-standalone mode requires + # threading. We can't use BOOST_MP_STANDALONE CMake variable for + # this with Boost CMake build. So we create our CMake target for + # it. + add_library(arrow::Boost::multiprecision INTERFACE IMPORTED) + target_include_directories(arrow::Boost::multiprecision + INTERFACE "${boost_SOURCE_DIR}/libs/multiprecision/include" + ) + target_compile_definitions(arrow::Boost::multiprecision + INTERFACE BOOST_MP_STANDALONE=1) + target_link_libraries(boost_headers INTERFACE arrow::Boost::multiprecision) endif() endif() - if(NOT TARGET Boost::dynamic_linking) - # This doesn't add BOOST_ALL_DYN_LINK because bundled Boost is a static library. - add_library(Boost::dynamic_linking INTERFACE IMPORTED) + if(ARROW_WITH_THRIFT) + if(ARROW_ENABLE_THREADING) + add_library(arrow::Boost::locale ALIAS boost_locale) + else() + # Apache Parquet depends on Apache Thrift. + # Apache Thrift uses Boost.locale but it only uses header files. + # So we can use this for building Apache Thrift. 
+ add_library(arrow::Boost::locale INTERFACE IMPORTED) + target_include_directories(arrow::Boost::locale + INTERFACE "${boost_SOURCE_DIR}/libs/locale/include") + endif() endif() - set(BOOST_VENDORED TRUE) -endmacro() + + set(Boost_INCLUDE_DIRS + ${boost_include_dirs} + PARENT_SCOPE) + set(BOOST_VENDORED + TRUE + PARENT_SCOPE) + + list(POP_BACK CMAKE_MESSAGE_INDENT) +endfunction() if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 15) @@ -1182,7 +1196,16 @@ set(Boost_USE_MULTITHREADED ON) if(MSVC AND ARROW_USE_STATIC_CRT) set(Boost_USE_STATIC_RUNTIME ON) endif() +# CMake 3.25.0 has 1.80 and older versions. set(Boost_ADDITIONAL_VERSIONS + "1.88.0" + "1.88" + "1.87.0" + "1.87" + "1.86.0" + "1.86" + "1.85.0" + "1.85" "1.84.0" "1.84" "1.83.0" @@ -1190,49 +1213,7 @@ set(Boost_ADDITIONAL_VERSIONS "1.82.0" "1.82" "1.81.0" - "1.81" - "1.80.0" - "1.80" - "1.79.0" - "1.79" - "1.78.0" - "1.78" - "1.77.0" - "1.77" - "1.76.0" - "1.76" - "1.75.0" - "1.75" - "1.74.0" - "1.74" - "1.73.0" - "1.73" - "1.72.0" - "1.72" - "1.71.0" - "1.71" - "1.70.0" - "1.70" - "1.69.0" - "1.69" - "1.68.0" - "1.68" - "1.67.0" - "1.67" - "1.66.0" - "1.66" - "1.65.0" - "1.65" - "1.64.0" - "1.64" - "1.63.0" - "1.63" - "1.62.0" - "1.61" - "1.61.0" - "1.62" - "1.60.0" - "1.60") + "1.81") # Compilers that don't support int128_t have a compile-time # (header-only) dependency on Boost for int128_t. 
@@ -1261,6 +1242,7 @@ endif() if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS OR (ARROW_FLIGHT AND (ARROW_TESTING OR ARROW_BUILD_BENCHMARKS)) + OR ARROW_FLIGHT_SQL_ODBC OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS) OR (ARROW_TESTING AND ARROW_BUILD_SHARED)) set(ARROW_USE_BOOST TRUE) @@ -1287,7 +1269,12 @@ if(ARROW_USE_BOOST) endif() if(ARROW_BOOST_REQUIRE_LIBRARY) set(ARROW_BOOST_COMPONENTS filesystem system) - set(ARROW_BOOST_OPTIONAL_COMPONENTS process) + if(ARROW_FLIGHT_SQL_ODBC AND MSVC) + list(APPEND ARROW_BOOST_COMPONENTS locale) + endif() + if(ARROW_ENABLE_THREADING) + set(ARROW_BOOST_OPTIONAL_COMPONENTS process) + endif() else() set(ARROW_BOOST_COMPONENTS) set(ARROW_BOOST_OPTIONAL_COMPONENTS) @@ -1307,62 +1294,71 @@ if(ARROW_USE_BOOST) unset(BUILD_SHARED_LIBS_KEEP) endif() - foreach(BOOST_LIBRARY Boost::headers Boost::filesystem Boost::system) - if(NOT TARGET ${BOOST_LIBRARY}) - continue() - endif() - target_link_libraries(${BOOST_LIBRARY} INTERFACE Boost::disable_autolinking) - if(ARROW_BOOST_USE_SHARED) - target_link_libraries(${BOOST_LIBRARY} INTERFACE Boost::dynamic_linking) - endif() - endforeach() + if(NOT BOOST_VENDORED) + foreach(BOOST_COMPONENT ${ARROW_BOOST_COMPONENTS} ${ARROW_BOOST_OPTIONAL_COMPONENTS}) + set(BOOST_LIBRARY Boost::${BOOST_COMPONENT}) + if(NOT TARGET ${BOOST_LIBRARY}) + continue() + endif() + target_link_libraries(${BOOST_LIBRARY} INTERFACE Boost::disable_autolinking) + if(ARROW_BOOST_USE_SHARED) + target_link_libraries(${BOOST_LIBRARY} INTERFACE Boost::dynamic_linking) + endif() + endforeach() + endif() - set(BOOST_PROCESS_HAVE_V2 FALSE) - if(TARGET Boost::process) - # Boost >= 1.86 - target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V1") - target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") - set(BOOST_PROCESS_HAVE_V2 TRUE) - else() - # Boost < 1.86 - add_library(Boost::process INTERFACE IMPORTED) - if(TARGET Boost::filesystem) - target_link_libraries(Boost::process INTERFACE 
Boost::filesystem) - endif() - if(TARGET Boost::system) - target_link_libraries(Boost::process INTERFACE Boost::system) - endif() - if(TARGET Boost::headers) - target_link_libraries(Boost::process INTERFACE Boost::headers) - endif() - if(Boost_VERSION VERSION_GREATER_EQUAL 1.80) - target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") + if(ARROW_ENABLE_THREADING) + set(BOOST_PROCESS_HAVE_V2 FALSE) + if(TARGET Boost::process) + # Boost >= 1.86 + add_library(arrow::Boost::process INTERFACE IMPORTED) + target_link_libraries(arrow::Boost::process INTERFACE Boost::process) + target_compile_definitions(arrow::Boost::process INTERFACE "BOOST_PROCESS_HAVE_V1") + target_compile_definitions(arrow::Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") set(BOOST_PROCESS_HAVE_V2 TRUE) - # Boost < 1.86 has a bug that - # boost::process::v2::process_environment::on_setup() isn't - # defined. We need to build Boost Process source to define it. - # - # See also: - # https://github.com/boostorg/process/issues/312 - target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_NEED_SOURCE") - if(WIN32) - target_link_libraries(Boost::process INTERFACE bcrypt ntdll) + else() + # Boost < 1.86 + add_library(arrow::Boost::process INTERFACE IMPORTED) + if(TARGET Boost::filesystem) + target_link_libraries(arrow::Boost::process INTERFACE Boost::filesystem) + endif() + if(TARGET Boost::system) + target_link_libraries(arrow::Boost::process INTERFACE Boost::system) + endif() + if(TARGET Boost::headers) + target_link_libraries(arrow::Boost::process INTERFACE Boost::headers) + endif() + if(Boost_VERSION VERSION_GREATER_EQUAL 1.80) + target_compile_definitions(arrow::Boost::process + INTERFACE "BOOST_PROCESS_HAVE_V2") + set(BOOST_PROCESS_HAVE_V2 TRUE) + # Boost < 1.86 has a bug that + # boost::process::v2::process_environment::on_setup() isn't + # defined. We need to build Boost Process source to define it. 
+ # + # See also: + # https://github.com/boostorg/process/issues/312 + target_compile_definitions(arrow::Boost::process + INTERFACE "BOOST_PROCESS_NEED_SOURCE") + if(WIN32) + target_link_libraries(arrow::Boost::process INTERFACE bcrypt ntdll) + endif() endif() endif() - endif() - if(BOOST_PROCESS_HAVE_V2 - AND # We can't use v2 API on Windows because v2 API doesn't support - # process group[1] and GCS testbench uses multiple processes[2]. - # - # [1] https://github.com/boostorg/process/issues/259 - # [2] https://github.com/googleapis/storage-testbench/issues/669 - (NOT WIN32) - AND # We can't use v2 API with musl libc with Boost Process < 1.86 - # because Boost Process < 1.86 doesn't support musl libc[3]. - # - # [3] https://github.com/boostorg/process/commit/aea22dbf6be1695ceb42367590b6ca34d9433500 - (NOT (ARROW_WITH_MUSL AND (Boost_VERSION VERSION_LESS 1.86)))) - target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_USE_V2") + if(BOOST_PROCESS_HAVE_V2 + AND # We can't use v2 API on Windows because v2 API doesn't support + # process group[1] and GCS testbench uses multiple processes[2]. + # + # [1] https://github.com/boostorg/process/issues/259 + # [2] https://github.com/googleapis/storage-testbench/issues/669 + (NOT WIN32) + AND # We can't use v2 API with musl libc with Boost Process < 1.86 + # because Boost Process < 1.86 doesn't support musl libc[3]. 
+ # + # [3] https://github.com/boostorg/process/commit/aea22dbf6be1695ceb42367590b6ca34d9433500 + (NOT (ARROW_WITH_MUSL AND (Boost_VERSION VERSION_LESS 1.86)))) + target_compile_definitions(arrow::Boost::process INTERFACE "BOOST_PROCESS_USE_V2") + endif() endif() message(STATUS "Boost include dir: ${Boost_INCLUDE_DIRS}") @@ -1415,15 +1411,6 @@ macro(build_snappy) ) endforeach() - if(APPLE AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) - # On macOS 10.13 we need to explicitly add to avoid a missing include error - # This can be removed once CRAN no longer checks on macOS 10.13 - find_program(PATCH patch REQUIRED) - set(SNAPPY_PATCH_COMMAND ${PATCH} -p1 -i ${CMAKE_CURRENT_LIST_DIR}/snappy.diff) - else() - set(SNAPPY_PATCH_COMMAND) - endif() - if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") # ignore linker flag errors, as Snappy sets # -Werror -Wall, and Emscripten doesn't support -soname @@ -1438,7 +1425,6 @@ macro(build_snappy) INSTALL_DIR ${SNAPPY_PREFIX} URL ${SNAPPY_SOURCE_URL} URL_HASH "SHA256=${ARROW_SNAPPY_BUILD_SHA256_CHECKSUM}" - PATCH_COMMAND ${SNAPPY_PATCH_COMMAND} CMAKE_ARGS ${SNAPPY_CMAKE_ARGS} BUILD_BYPRODUCTS "${SNAPPY_STATIC_LIB}") @@ -1737,6 +1723,8 @@ if(ARROW_NEED_GFLAGS) set(GFLAGS_LIBRARIES gflags-shared) elseif(TARGET gflags_shared) set(GFLAGS_LIBRARIES gflags_shared) + elseif(TARGET gflags::gflags) + set(GFLAGS_LIBRARIES gflags::gflags) endif() endif() endif() @@ -1744,102 +1732,108 @@ endif() # ---------------------------------------------------------------------- # Thrift -macro(build_thrift) - message(STATUS "Building Apache Thrift from source") - set(THRIFT_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/thrift_ep-install") - set(THRIFT_INCLUDE_DIR "${THRIFT_PREFIX}/include") - set(THRIFT_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} - "-DCMAKE_INSTALL_PREFIX=${THRIFT_PREFIX}" - "-DCMAKE_INSTALL_RPATH=${THRIFT_PREFIX}/lib" - # Work around https://gitlab.kitware.com/cmake/cmake/issues/18865 - -DBoost_NO_BOOST_CMAKE=ON - -DBUILD_COMPILER=OFF - -DBUILD_EXAMPLES=OFF - 
-DBUILD_TUTORIALS=OFF - -DCMAKE_DEBUG_POSTFIX= - -DWITH_AS3=OFF - -DWITH_CPP=ON - -DWITH_C_GLIB=OFF - -DWITH_JAVA=OFF - -DWITH_JAVASCRIPT=OFF - -DWITH_LIBEVENT=OFF - -DWITH_NODEJS=OFF - -DWITH_PYTHON=OFF - -DWITH_QT5=OFF - -DWITH_ZLIB=OFF) - - # Thrift also uses boost. Forward important boost settings if there were ones passed. - if(DEFINED BOOST_ROOT) - list(APPEND THRIFT_CMAKE_ARGS "-DBOOST_ROOT=${BOOST_ROOT}") +function(build_thrift) + list(APPEND CMAKE_MESSAGE_INDENT "Thrift: ") + message(STATUS "Building from source") + + if(CMAKE_VERSION VERSION_LESS 3.26) + message(FATAL_ERROR "Require CMake 3.26 or later for building bundled Apache Thrift") endif() - list(APPEND - THRIFT_CMAKE_ARGS - "-DBoost_INCLUDE_DIR=$" - ) - if(DEFINED Boost_NAMESPACE) - list(APPEND THRIFT_CMAKE_ARGS "-DBoost_NAMESPACE=${Boost_NAMESPACE}") + set(THRIFT_PATCH_COMMAND) + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + find_program(PATCH patch) + if(PATCH) + list(APPEND + THRIFT_PATCH_COMMAND + ${PATCH} + -p1 + -i) + else() + find_program(GIT git) + if(GIT) + list(APPEND THRIFT_PATCH_COMMAND ${GIT} apply) + endif() + endif() + if(THRIFT_PATCH_COMMAND) + # https://github.com/apache/thrift/pull/3187 + list(APPEND THRIFT_PATCH_COMMAND ${CMAKE_CURRENT_LIST_DIR}/thrift-3187.patch) + endif() endif() + fetchcontent_declare(thrift + ${FC_DECLARE_COMMON_OPTIONS} + PATCH_COMMAND ${THRIFT_PATCH_COMMAND} + URL ${THRIFT_SOURCE_URL} + URL_HASH "SHA256=${ARROW_THRIFT_BUILD_SHA256_CHECKSUM}") + prepare_fetchcontent() + set(BUILD_COMPILER OFF) + set(BUILD_EXAMPLES OFF) + set(BUILD_TUTORIALS OFF) + set(CMAKE_UNITY_BUILD OFF) + set(WITH_AS3 OFF) + set(WITH_CPP ON) + set(WITH_C_GLIB OFF) + set(WITH_JAVA OFF) + set(WITH_JAVASCRIPT OFF) + set(WITH_LIBEVENT OFF) if(MSVC) if(ARROW_USE_STATIC_CRT) - set(THRIFT_LIB_SUFFIX "mt") - list(APPEND THRIFT_CMAKE_ARGS "-DWITH_MT=ON") + set(WITH_MT ON) else() - set(THRIFT_LIB_SUFFIX "md") - list(APPEND THRIFT_CMAKE_ARGS "-DWITH_MT=OFF") + set(WITH_MT OFF) endif() - # 
NOTE(amoeba): When you bump Thrift to >=0.21.0, change bin to lib - set(THRIFT_LIB - "${THRIFT_PREFIX}/bin/${CMAKE_IMPORT_LIBRARY_PREFIX}thrift${THRIFT_LIB_SUFFIX}${CMAKE_IMPORT_LIBRARY_SUFFIX}" - ) - else() - set(THRIFT_LIB - "${THRIFT_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}thrift${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) endif() + set(WITH_NODEJS OFF) + set(WITH_PYTHON OFF) + set(WITH_QT5 OFF) + set(WITH_ZLIB OFF) - if(BOOST_VENDORED) - set(THRIFT_DEPENDENCIES ${THRIFT_DEPENDENCIES} boost_ep) - endif() + # Apache Thrift may change CMAKE_DEBUG_POSTFIX. So we'll restore the + # original CMAKE_DEBUG_POSTFIX later. + set(CMAKE_DEBUG_POSTFIX_KEEP ${CMAKE_DEBUG_POSTFIX}) - set(THRIFT_PATCH_COMMAND) - if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 15.0) - # Thrift 0.21.0 doesn't support GCC 15. - # https://github.com/apache/arrow/issues/45096 - # https://github.com/apache/thrift/pull/3078 - find_program(PATCH patch REQUIRED) - list(APPEND - THRIFT_PATCH_COMMAND - ${PATCH} - -p1 - -i - ${CMAKE_CURRENT_LIST_DIR}/thrift-cstdint.patch) + # Remove Apache Arrow's CMAKE_MODULE_PATH to ensure using Apache + # Thrift's cmake_modules/. + # + # We can remove this once https://github.com/apache/thrift/pull/3176 + # is merged. + list(POP_FRONT CMAKE_MODULE_PATH) + fetchcontent_makeavailable(thrift) + + # Apache Thrift may change CMAKE_DEBUG_POSTFIX. So we restore + # CMAKE_DEBUG_POSTFIX. 
+ set(CMAKE_DEBUG_POSTFIX + ${CMAKE_DEBUG_POSTFIX_KEEP} + CACHE BOOL "" FORCE) + + if(CMAKE_VERSION VERSION_LESS 3.28) + set_property(DIRECTORY ${thrift_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL TRUE) endif() - externalproject_add(thrift_ep - ${EP_COMMON_OPTIONS} - URL ${THRIFT_SOURCE_URL} - URL_HASH "SHA256=${ARROW_THRIFT_BUILD_SHA256_CHECKSUM}" - BUILD_BYPRODUCTS "${THRIFT_LIB}" - CMAKE_ARGS ${THRIFT_CMAKE_ARGS} - DEPENDS ${THRIFT_DEPENDENCIES} - PATCH_COMMAND ${THRIFT_PATCH_COMMAND}) - - add_library(thrift::thrift STATIC IMPORTED) - # The include directory must exist before it is referenced by a target. - file(MAKE_DIRECTORY "${THRIFT_INCLUDE_DIR}") - set_target_properties(thrift::thrift PROPERTIES IMPORTED_LOCATION "${THRIFT_LIB}") - target_include_directories(thrift::thrift BEFORE INTERFACE "${THRIFT_INCLUDE_DIR}") - if(ARROW_USE_BOOST) - target_link_libraries(thrift::thrift INTERFACE Boost::headers) + target_include_directories(thrift + INTERFACE $ + $ + ) + if(BOOST_VENDORED) + target_link_libraries(thrift PUBLIC $) + target_link_libraries(thrift PRIVATE $) endif() - add_dependencies(thrift::thrift thrift_ep) - set(Thrift_VERSION ${ARROW_THRIFT_BUILD_VERSION}) - set(THRIFT_VENDORED TRUE) - list(APPEND ARROW_BUNDLED_STATIC_LIBS thrift::thrift) -endmacro() + add_library(thrift::thrift INTERFACE IMPORTED) + target_link_libraries(thrift::thrift INTERFACE thrift) + + set(Thrift_VERSION + ${ARROW_THRIFT_BUILD_VERSION} + PARENT_SCOPE) + set(THRIFT_VENDORED + TRUE + PARENT_SCOPE) + set(ARROW_BUNDLED_STATIC_LIBS + ${ARROW_BUNDLED_STATIC_LIBS} thrift + PARENT_SCOPE) + + list(POP_BACK CMAKE_MESSAGE_INDENT) +endfunction() if(ARROW_WITH_THRIFT) # Thrift C++ code generated by 0.13 requires 0.11 or greater @@ -2276,32 +2270,47 @@ if(ARROW_MIMALLOC) # We only use a vendored mimalloc as we want to control its build options. 
set(MIMALLOC_LIB_BASE_NAME "mimalloc") - if(WIN32) - set(MIMALLOC_LIB_BASE_NAME "${MIMALLOC_LIB_BASE_NAME}-static") - endif() if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") set(MIMALLOC_LIB_BASE_NAME "${MIMALLOC_LIB_BASE_NAME}-${LOWERCASE_BUILD_TYPE}") endif() set(MIMALLOC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/mimalloc_ep/src/mimalloc_ep") - set(MIMALLOC_INCLUDE_DIR "${MIMALLOC_PREFIX}/include/mimalloc-2.0") + set(MIMALLOC_INCLUDE_DIR "${MIMALLOC_PREFIX}/include") set(MIMALLOC_STATIC_LIB - "${MIMALLOC_PREFIX}/lib/mimalloc-2.0/${CMAKE_STATIC_LIBRARY_PREFIX}${MIMALLOC_LIB_BASE_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${MIMALLOC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${MIMALLOC_LIB_BASE_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" ) + set(MIMALLOC_C_FLAGS ${EP_C_FLAGS}) + if(MINGW) + # Workaround https://github.com/microsoft/mimalloc/issues/910 on RTools40 + set(MIMALLOC_C_FLAGS "${MIMALLOC_C_FLAGS} -DERROR_COMMITMENT_MINIMUM=635") + endif() + + set(MIMALLOC_PATCH_COMMAND "") + if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") + find_program(PATCH patch REQUIRED) + set(MIMALLOC_PATCH_COMMAND ${PATCH} -p1 -i + ${CMAKE_CURRENT_LIST_DIR}/mimalloc-1138.patch) + endif() + set(MIMALLOC_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} + "-DCMAKE_C_FLAGS=${MIMALLOC_C_FLAGS}" "-DCMAKE_INSTALL_PREFIX=${MIMALLOC_PREFIX}" + -DMI_INSTALL_TOPLEVEL=ON -DMI_OVERRIDE=OFF -DMI_LOCAL_DYNAMIC_TLS=ON -DMI_BUILD_OBJECT=OFF -DMI_BUILD_SHARED=OFF - -DMI_BUILD_TESTS=OFF) + -DMI_BUILD_TESTS=OFF + # GH-47229: Force mimalloc to generate armv8.0 binary + -DMI_NO_OPT_ARCH=ON) externalproject_add(mimalloc_ep ${EP_COMMON_OPTIONS} URL ${MIMALLOC_SOURCE_URL} URL_HASH "SHA256=${ARROW_MIMALLOC_BUILD_SHA256_CHECKSUM}" + PATCH_COMMAND ${MIMALLOC_PATCH_COMMAND} CMAKE_ARGS ${MIMALLOC_CMAKE_ARGS} BUILD_BYPRODUCTS "${MIMALLOC_STATIC_LIB}") @@ -2432,6 +2441,7 @@ if(ARROW_TESTING) set(ARROW_GTEST_GMOCK GTest::gmock) set(ARROW_GTEST_GTEST GTest::gtest) set(ARROW_GTEST_GTEST_MAIN GTest::gtest_main) + set(ARROW_GTEST_GMOCK_MAIN 
GTest::gmock_main) else() string(APPEND ARROW_TESTING_PC_CFLAGS " -I\${includedir}/arrow-gtest") string(APPEND ARROW_TESTING_PC_LIBS " -larrow_gtest") @@ -2439,6 +2449,7 @@ if(ARROW_TESTING) set(ARROW_GTEST_GMOCK arrow::GTest::gmock) set(ARROW_GTEST_GTEST arrow::GTest::gtest) set(ARROW_GTEST_GTEST_MAIN arrow::GTest::gtest_main) + set(ARROW_GTEST_GMOCK_MAIN arrow::GTest::gmock_main) endif() endif() @@ -2582,7 +2593,7 @@ if(ARROW_USE_XSIMD) IS_RUNTIME_DEPENDENCY FALSE REQUIRED_VERSION - "8.1.0") + "13.0.0") if(xsimd_SOURCE STREQUAL "BUNDLED") set(ARROW_XSIMD arrow::xsimd) @@ -2654,35 +2665,46 @@ if(ARROW_WITH_ZLIB) resolve_dependency(ZLIB PC_PACKAGE_NAMES zlib) endif() -macro(build_lz4) - message(STATUS "Building LZ4 from source") +function(build_lz4) + message(STATUS "Building LZ4 from source using FetchContent") - set(LZ4_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/lz4_ep-install") + # Set LZ4 as vendored + set(LZ4_VENDORED + TRUE + PARENT_SCOPE) - set(LZ4_STATIC_LIB - "${LZ4_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}lz4${CMAKE_STATIC_LIBRARY_SUFFIX}") + # Declare the content + fetchcontent_declare(lz4 + URL ${LZ4_SOURCE_URL} + URL_HASH "SHA256=${ARROW_LZ4_BUILD_SHA256_CHECKSUM}" + SOURCE_SUBDIR "build/cmake") - set(LZ4_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX= - -DLZ4_BUILD_CLI=OFF -DLZ4_BUILD_LEGACY_LZ4C=OFF) + # Prepare fetch content environment + prepare_fetchcontent() - # We need to copy the header in lib to directory outside of the build - externalproject_add(lz4_ep - ${EP_COMMON_OPTIONS} - CMAKE_ARGS ${LZ4_CMAKE_ARGS} - SOURCE_SUBDIR "build/cmake" - INSTALL_DIR ${LZ4_PREFIX} - URL ${LZ4_SOURCE_URL} - URL_HASH "SHA256=${ARROW_LZ4_BUILD_SHA256_CHECKSUM}" - BUILD_BYPRODUCTS ${LZ4_STATIC_LIB}) - - file(MAKE_DIRECTORY "${LZ4_PREFIX}/include") - add_library(LZ4::lz4 STATIC IMPORTED) - set_target_properties(LZ4::lz4 PROPERTIES IMPORTED_LOCATION "${LZ4_STATIC_LIB}") - target_include_directories(LZ4::lz4 BEFORE INTERFACE "${LZ4_PREFIX}/include") - 
add_dependencies(LZ4::lz4 lz4_ep) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS LZ4::lz4) -endmacro() + # Set LZ4-specific build options as cache variables + set(LZ4_BUILD_CLI + OFF + CACHE BOOL "Don't build LZ4 CLI" FORCE) + set(LZ4_BUILD_LEGACY_LZ4C + OFF + CACHE BOOL "Don't build legacy LZ4 tools" FORCE) + + # Make the dependency available - this will actually perform the download and configure + fetchcontent_makeavailable(lz4) + + # Use LZ4::lz4 as an imported library not an alias of lz4_static so other targets such as orc + # can depend on it as an external library. External libraries are ignored in + # install(TARGETS orc EXPORT orc_targets) and install(EXPORT orc_targets). + add_library(LZ4::lz4 INTERFACE IMPORTED) + target_link_libraries(LZ4::lz4 INTERFACE lz4_static) + + # Add to bundled static libs. + # We must use lz4_static (not imported target) not LZ4::lz4 (imported target). + set(ARROW_BUNDLED_STATIC_LIBS + ${ARROW_BUNDLED_STATIC_LIBS} lz4_static + PARENT_SCOPE) +endfunction() if(ARROW_WITH_LZ4) resolve_dependency(lz4 @@ -2863,7 +2885,7 @@ if(ARROW_WITH_BZ2) if(BZIP2_TYPE STREQUAL "INTERFACE_LIBRARY") # Conan string(APPEND ARROW_PC_LIBS_PRIVATE - " $>") + " $>>") else() string(APPEND ARROW_PC_LIBS_PRIVATE " $") endif() @@ -2911,7 +2933,7 @@ endmacro() if(ARROW_WITH_UTF8PROC) set(utf8proc_resolve_dependency_args utf8proc PC_PACKAGE_NAMES libutf8proc) - if(NOT VCPKG_TOOLCHAIN) + if(NOT ARROW_VCPKG) # utf8proc in vcpkg doesn't provide version information: # https://github.com/microsoft/vcpkg/issues/39176 list(APPEND utf8proc_resolve_dependency_args REQUIRED_VERSION "2.2.0") @@ -4590,105 +4612,98 @@ target_include_directories(arrow::hadoop INTERFACE "${HADOOP_HOME}/include") # Apache ORC function(build_orc) + list(APPEND CMAKE_MESSAGE_INDENT "Apache ORC: ") + message(STATUS "Building Apache ORC from source") + set(ORC_PATCHES) + if(MSVC) + # We can remove this once bundled Apache ORC is 2.2.1 or later. 
+ list(APPEND ORC_PATCHES ${CMAKE_CURRENT_LIST_DIR}/orc-2345.patch) + endif() + if(Protobuf_VERSION VERSION_GREATER_EQUAL 32.0) + # We can remove this once bundled Apache ORC is 2.2.1 or later. + list(APPEND ORC_PATCHES ${CMAKE_CURRENT_LIST_DIR}/orc-2357.patch) + endif() + if(ORC_PATCHES) + find_program(PATCH patch REQUIRED) + set(ORC_PATCH_COMMAND ${PATCH} -p1 -i ${ORC_PATCHES}) + else() + set(ORC_PATCH_COMMAND) + endif() + + if(LZ4_VENDORED) + set(ORC_LZ4_TARGET lz4_static) + set(ORC_LZ4_ROOT "${lz4_SOURCE_DIR}") + set(ORC_LZ4_INCLUDE_DIR "${lz4_SOURCE_DIR}/lib") + else() + set(ORC_LZ4_TARGET LZ4::lz4) + get_target_property(ORC_LZ4_INCLUDE_DIR ${ORC_LZ4_TARGET} + INTERFACE_INCLUDE_DIRECTORIES) + get_filename_component(ORC_LZ4_ROOT "${ORC_LZ4_INCLUDE_DIR}" DIRECTORY) + endif() + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.29) fetchcontent_declare(orc ${FC_DECLARE_COMMON_OPTIONS} + PATCH_COMMAND ${ORC_PATCH_COMMAND} URL ${ORC_SOURCE_URL} URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}") prepare_fetchcontent() set(CMAKE_UNITY_BUILD FALSE) - set(ORC_PREFER_STATIC_LZ4 - OFF - CACHE BOOL "" FORCE) - get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) - if(NOT LZ4_INCLUDE_DIR) - find_path(LZ4_INCLUDE_DIR NAMES lz4.h) - endif() - get_filename_component(LZ4_ROOT "${LZ4_INCLUDE_DIR}" DIRECTORY) - set(LZ4_HOME - "${LZ4_ROOT}" - CACHE STRING "" FORCE) - set(LZ4_LIBRARY - LZ4::lz4 - CACHE STRING "" FORCE) + set(ORC_PREFER_STATIC_LZ4 OFF) + set(LZ4_HOME "${ORC_LZ4_ROOT}") + set(LZ4_INCLUDE_DIR "${ORC_LZ4_INCLUDE_DIR}") + set(LZ4_LIBRARY ${ORC_LZ4_TARGET}) - set(ORC_PREFER_STATIC_PROTOBUF - OFF - CACHE BOOL "" FORCE) + set(ORC_PREFER_STATIC_PROTOBUF OFF) get_target_property(PROTOBUF_INCLUDE_DIR ${ARROW_PROTOBUF_LIBPROTOBUF} INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(Protobuf_ROOT "${PROTOBUF_INCLUDE_DIR}" DIRECTORY) - set(PROTOBUF_HOME - ${Protobuf_ROOT} - CACHE STRING "" FORCE) + set(PROTOBUF_HOME ${Protobuf_ROOT}) # ORC uses this. 
- target_include_directories(${ARROW_PROTOBUF_LIBPROTOC} - INTERFACE "${PROTOBUF_INCLUDE_DIR}") + if(PROTOBUF_VENDORED) + target_include_directories(${ARROW_PROTOBUF_LIBPROTOC} + INTERFACE "${PROTOBUF_INCLUDE_DIR}") + endif() set(PROTOBUF_EXECUTABLE ${ARROW_PROTOBUF_PROTOC}) set(PROTOBUF_LIBRARY ${ARROW_PROTOBUF_LIBPROTOBUF}) set(PROTOC_LIBRARY ${ARROW_PROTOBUF_LIBPROTOC}) - set(ORC_PREFER_STATIC_SNAPPY - OFF - CACHE BOOL "" FORCE) + set(ORC_PREFER_STATIC_SNAPPY OFF) get_target_property(SNAPPY_INCLUDE_DIR ${Snappy_TARGET} INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(Snappy_ROOT "${SNAPPY_INCLUDE_DIR}" DIRECTORY) - set(SNAPPY_HOME - ${Snappy_ROOT} - CACHE STRING "" FORCE) - set(SNAPPY_LIBRARY - ${Snappy_TARGET} - CACHE STRING "" FORCE) + set(SNAPPY_HOME ${Snappy_ROOT}) + set(SNAPPY_LIBRARY ${Snappy_TARGET}) - set(ORC_PREFER_STATIC_ZLIB - OFF - CACHE BOOL "" FORCE) + set(ORC_PREFER_STATIC_ZLIB OFF) get_target_property(ZLIB_INCLUDE_DIR ZLIB::ZLIB INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ZLIB_ROOT "${ZLIB_INCLUDE_DIR}" DIRECTORY) - set(ZLIB_HOME - ${ZLIB_ROOT} - CACHE STRING "" FORCE) - # From CMake 3.21 onwards the set(CACHE) command does not remove any normal - # variable of the same name from the current scope. We have to manually remove - # the variable via unset to avoid ORC not finding the ZLIB_LIBRARY. + set(ZLIB_HOME ${ZLIB_ROOT}) + # From CMake 3.21 onwards the set(CACHE) command does not remove + # any normal variable of the same name from the current scope. We + # have to manually remove the variable via unset to avoid ORC not + # finding the ZLIB_LIBRARY. 
unset(ZLIB_LIBRARY) set(ZLIB_LIBRARY ZLIB::ZLIB CACHE STRING "" FORCE) - set(ORC_PREFER_STATIC_ZSTD - OFF - CACHE BOOL "" FORCE) + set(ORC_PREFER_STATIC_ZSTD OFF) get_target_property(ZSTD_INCLUDE_DIR ${ARROW_ZSTD_LIBZSTD} INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ZSTD_ROOT "${ZSTD_INCLUDE_DIR}" DIRECTORY) - set(ZSTD_HOME - ${ZSTD_ROOT} - CACHE STRING "" FORCE) + set(ZSTD_HOME ${ZSTD_ROOT}) set(ZSTD_LIBRARY ${ARROW_ZSTD_LIBZSTD}) - set(BUILD_CPP_TESTS - OFF - CACHE BOOL "" FORCE) - set(BUILD_JAVA - OFF - CACHE BOOL "" FORCE) - set(BUILD_LIBHDFSPP - OFF - CACHE BOOL "" FORCE) - set(BUILD_TOOLS - OFF - CACHE BOOL "" FORCE) - set(INSTALL_VENDORED_LIBS - OFF - CACHE BOOL "" FORCE) - set(STOP_BUILD_ON_WARNING - OFF - CACHE BOOL "" FORCE) + set(BUILD_CPP_TESTS OFF) + set(BUILD_JAVA OFF) + set(BUILD_LIBHDFSPP OFF) + set(BUILD_TOOLS OFF) + set(INSTALL_VENDORED_LIBS OFF) + set(STOP_BUILD_ON_WARNING OFF) fetchcontent_makeavailable(orc) @@ -4712,9 +4727,6 @@ function(build_orc) INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ORC_SNAPPY_ROOT "${ORC_SNAPPY_INCLUDE_DIR}" DIRECTORY) - get_target_property(ORC_LZ4_ROOT LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) - get_filename_component(ORC_LZ4_ROOT "${ORC_LZ4_ROOT}" DIRECTORY) - get_target_property(ORC_ZSTD_ROOT ${ARROW_ZSTD_LIBZSTD} INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(ORC_ZSTD_ROOT "${ORC_ZSTD_ROOT}" DIRECTORY) @@ -4738,9 +4750,9 @@ function(build_orc) "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" "-DSNAPPY_LIBRARY=$" "-DLZ4_HOME=${ORC_LZ4_ROOT}" - "-DLZ4_LIBRARY=$" - "-DLZ4_STATIC_LIB=$" - "-DLZ4_INCLUDE_DIR=${ORC_LZ4_ROOT}/include" + "-DLZ4_LIBRARY=$" + "-DLZ4_STATIC_LIB=$" + "-DLZ4_INCLUDE_DIR=${ORC_LZ4_INCLUDE_DIR}" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" "-DZSTD_HOME=${ORC_ZSTD_ROOT}" "-DZSTD_INCLUDE_DIR=$" @@ -4754,16 +4766,17 @@ function(build_orc) externalproject_add(orc_ep ${EP_COMMON_OPTIONS} - URL ${ORC_SOURCE_URL} - URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}" BUILD_BYPRODUCTS 
${ORC_STATIC_LIB} CMAKE_ARGS ${ORC_CMAKE_ARGS} DEPENDS ${ARROW_PROTOBUF_LIBPROTOBUF} ${ARROW_PROTOBUF_PROTOC} ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET} - LZ4::lz4 - ZLIB::ZLIB) + ${ORC_LZ4_TARGET} + ZLIB::ZLIB + PATCH_COMMAND ${ORC_PATCH_COMMAND} + URL ${ORC_SOURCE_URL} + URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}") add_library(orc::orc STATIC IMPORTED) set_target_properties(orc::orc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}") target_include_directories(orc::orc BEFORE INTERFACE "${ORC_INCLUDE_DIR}") @@ -4791,6 +4804,8 @@ function(build_orc) set(ARROW_BUNDLED_STATIC_LIBS ${ARROW_BUNDLED_STATIC_LIBS} PARENT_SCOPE) + + list(POP_BACK CMAKE_MESSAGE_INDENT) endfunction() if(ARROW_ORC) @@ -4914,7 +4929,7 @@ macro(build_opentelemetry) -DWITH_OTLP_HTTP_SSL_PREVIEW=OFF -DWITH_OTLP_HTTP_SSL_TLS_PREVIEW=OFF "-DProtobuf_INCLUDE_DIR=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" - "-DProtobuf_LIBRARY=${OPENTELEMETRY_PROTOBUF_INCLUDE_DIR}" + "-DProtobuf_LIBRARY=${OPENTELEMETRY_PROTOBUF_LIBRARY}" "-DProtobuf_PROTOC_EXECUTABLE=${OPENTELEMETRY_PROTOC_EXECUTABLE}") # OpenTelemetry with OTLP enabled requires Protobuf definitions from a @@ -5039,442 +5054,230 @@ endif() # ---------------------------------------------------------------------- # AWS SDK for C++ -include(AWSSDKVariables) +function(build_awssdk) + list(APPEND CMAKE_MESSAGE_INDENT "AWS SDK for C++: ") -macro(build_awssdk) - message(STATUS "Building AWS C++ SDK from source") - set(AWSSDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/awssdk_ep-install") - set(AWSSDK_INCLUDE_DIR "${AWSSDK_PREFIX}/include") + message(STATUS "Building AWS SDK for C++ from source") - # The AWS SDK has a few warnings around shortening lengths - set(AWS_C_FLAGS "${EP_C_FLAGS}") - set(AWS_CXX_FLAGS "${EP_CXX_FLAGS}") - if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL - "Clang") - # Negate warnings that AWS SDK cannot build under - string(APPEND AWS_C_FLAGS " -Wno-error=shorten-64-to-32") - string(APPEND AWS_CXX_FLAGS " 
-Wno-error=shorten-64-to-32") - endif() - if(NOT MSVC) - string(APPEND AWS_C_FLAGS " -Wno-deprecated") - string(APPEND AWS_CXX_FLAGS " -Wno-deprecated") + # aws-c-common must be the first product because others depend on + # this. + set(AWSSDK_PRODUCTS aws-c-common) + if(LINUX) + list(APPEND AWSSDK_PRODUCTS aws-lc s2n-tls) endif() - # GH-44950: This is required to build under Rtools40 and we may be able to - # remove it if/when we no longer need to build under Rtools40 - if(WIN32 AND NOT MSVC) - string(APPEND - AWS_C_FLAGS - " -D_WIN32_WINNT=0x0601 -D__USE_MINGW_ANSI_STDIO=1 -Wno-error -Wno-error=format= -Wno-error=format-extra-args -Wno-unused-local-typedefs -Wno-unused-variable" - ) - string(APPEND - AWS_CXX_FLAGS - " -D_WIN32_WINNT=0x0601 -D__USE_MINGW_ANSI_STDIO=1 -Wno-error -Wno-error=format= -Wno-error=format-extra-args -Wno-unused-local-typedefs -Wno-unused-variable" + list(APPEND + AWSSDK_PRODUCTS + # We can't sort this in alphabetical order because some + # products depend on other products. 
+ aws-checksums + aws-c-cal + aws-c-io + aws-c-event-stream + aws-c-sdkutils + aws-c-compression + aws-c-http + aws-c-mqtt + aws-c-auth + aws-c-s3 + aws-crt-cpp + aws-sdk-cpp) + set(AWS_SDK_CPP_SOURCE_URL "${AWSSDK_SOURCE_URL}") + set(ARROW_AWS_SDK_CPP_BUILD_SHA256_CHECKSUM "${ARROW_AWSSDK_BUILD_SHA256_CHECKSUM}") + foreach(AWSSDK_PRODUCT ${AWSSDK_PRODUCTS}) + # aws-c-cal -> + # AWS-C-CAL + string(TOUPPER "${AWSSDK_PRODUCT}" BASE_VARIABLE_NAME) + # AWS-C-CAL -> + # AWS_C_CAL + string(REGEX REPLACE "-" "_" BASE_VARIABLE_NAME "${BASE_VARIABLE_NAME}") + fetchcontent_declare(${AWSSDK_PRODUCT} + ${FC_DECLARE_COMMON_OPTIONS} OVERRIDE_FIND_PACKAGE + URL ${${BASE_VARIABLE_NAME}_SOURCE_URL} + URL_HASH "SHA256=${ARROW_${BASE_VARIABLE_NAME}_BUILD_SHA256_CHECKSUM}" ) - endif() + endforeach() - set(AWSSDK_COMMON_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} - -DCMAKE_C_FLAGS=${AWS_C_FLAGS} - -DCMAKE_CXX_FLAGS=${AWS_CXX_FLAGS} - -DCPP_STANDARD=${CMAKE_CXX_STANDARD} - -DCMAKE_INSTALL_PREFIX=${AWSSDK_PREFIX} - -DCMAKE_PREFIX_PATH=${AWSSDK_PREFIX} - -DENABLE_TESTING=OFF - -DENABLE_UNITY_BUILD=ON - -DOPENSSL_CRYPTO_LIBRARY=${OPENSSL_CRYPTO_LIBRARY} - -DOPENSSL_INCLUDE_DIR=${OPENSSL_INCLUDE_DIR} - -DOPENSSL_SSL_LIBRARY=${OPENSSL_SSL_LIBRARY} - -Dcrypto_INCLUDE_DIR=${OPENSSL_INCLUDE_DIR} - -Dcrypto_LIBRARY=${OPENSSL_CRYPTO_LIBRARY}) - if(ARROW_OPENSSL_USE_SHARED) - list(APPEND AWSSDK_COMMON_CMAKE_ARGS - -Dcrypto_SHARED_LIBRARY=${OPENSSL_CRYPTO_LIBRARY}) - else() - list(APPEND AWSSDK_COMMON_CMAKE_ARGS - -Dcrypto_STATIC_LIBRARY=${OPENSSL_CRYPTO_LIBRARY}) - endif() - set(AWSSDK_CMAKE_ARGS - ${AWSSDK_COMMON_CMAKE_ARGS} - -DBUILD_DEPS=OFF - -DBUILD_ONLY=config\\$s3\\$transfer\\$identity-management\\$sts - -DMINIMIZE_SIZE=ON) - # Remove unused directories to save build directory storage. 
- # 807MB -> 31MB - set(AWSSDK_PATCH_COMMAND ${CMAKE_COMMAND} -E) - if(CMAKE_VERSION VERSION_LESS 3.17) - list(APPEND AWSSDK_PATCH_COMMAND remove_directory) - else() - list(APPEND AWSSDK_PATCH_COMMAND rm -rf) + prepare_fetchcontent() + set(BUILD_DEPS OFF) + set(BUILD_TOOL OFF) + set(CMAKE_UNITY_BUILD OFF) # Unity build causes some build errors. + set(ENABLE_TESTING OFF) + set(IN_SOURCE_BUILD ON) + set(MINIMIZE_SIZE ON) + set(USE_OPENSSL ON) + + # For aws-c-common + if(MINGW) + # PPROCESSOR_NUMBER requires Windows 7 or later. + # + # 0x0601 == _WIN32_WINNT_WIN7 + string(APPEND CMAKE_C_FLAGS " -D_WIN32_WINNT=0x0601") + string(APPEND CMAKE_CXX_FLAGS " -D_WIN32_WINNT=0x0601") endif() - list(APPEND AWSSDK_PATCH_COMMAND ${AWSSDK_UNUSED_DIRECTORIES}) - # Patch parts of the AWSSDK EP so it builds cleanly under Rtools40 - if(WIN32 AND NOT MSVC) - find_program(PATCH patch REQUIRED) - # Patch aws_c_common to build under Rtools40 - set(AWS_C_COMMON_PATCH_COMMAND ${PATCH} -p1 -i - ${CMAKE_SOURCE_DIR}/../ci/rtools/aws_c_common_ep.patch) - message(STATUS "Hello ${AWS_C_COMMON_PATCH_COMMAND}") - # aws_c_io_ep to build under Rtools40 - set(AWS_C_IO_PATCH_COMMAND ${PATCH} -p1 -i - ${CMAKE_SOURCE_DIR}/../ci/rtools/aws_c_io_ep.patch) - message(STATUS "Hello ${AWS_C_IO_PATCH_COMMAND}") - # awssdk_ep to build under Rtools40 - list(APPEND - AWSSDK_PATCH_COMMAND - && - ${PATCH} - -p1 - -i - ${CMAKE_SOURCE_DIR}/../ci/rtools/awssdk_ep.patch) - message(STATUS "Hello ${AWSSDK_PATCH_COMMAND}") - endif() + # For aws-lc + set(DISABLE_GO ON) + set(DISABLE_PERL ON) - if(UNIX) - # on Linux and macOS curl seems to be required - find_curl() - get_filename_component(CURL_ROOT_HINT "${CURL_INCLUDE_DIRS}" DIRECTORY) - get_filename_component(ZLIB_ROOT_HINT "${ZLIB_INCLUDE_DIRS}" DIRECTORY) + # For s2n-tls + set(crypto_INCLUDE_DIR "$") + set(crypto_STATIC_LIBRARY "$") + set(S2N_INTERN_LIBCRYPTO ON) - # provide hint for AWS SDK to link with the already located libcurl and zlib - list(APPEND - 
AWSSDK_CMAKE_ARGS - -DCURL_INCLUDE_DIR=${CURL_ROOT_HINT}/include - -DCURL_LIBRARY=${CURL_ROOT_HINT}/lib - -DZLIB_INCLUDE_DIR=${ZLIB_ROOT_HINT}/include - -DZLIB_LIBRARY=${ZLIB_ROOT_HINT}/lib) - endif() - - file(MAKE_DIRECTORY ${AWSSDK_INCLUDE_DIR}) - - # AWS C++ SDK related libraries to link statically - set(_AWSSDK_LIBS - aws-cpp-sdk-identity-management - aws-cpp-sdk-sts - aws-cpp-sdk-cognito-identity - aws-cpp-sdk-s3 - aws-cpp-sdk-core - aws-crt-cpp - aws-c-s3 - aws-c-auth - aws-c-mqtt - aws-c-http - aws-c-compression - aws-c-sdkutils - aws-c-event-stream - aws-c-io - aws-c-cal - aws-checksums - aws-c-common) - - # aws-lc needs to be installed on a separate folder to hide from unintended use - set(AWS_LC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/aws_lc_ep-install") - set(AWS_LC_INCLUDE_DIR "${AWS_LC_PREFIX}/include") - - if(UNIX AND NOT APPLE) # aws-lc and s2n-tls only needed on linux - file(MAKE_DIRECTORY ${AWS_LC_INCLUDE_DIR}) - list(APPEND _AWSSDK_LIBS s2n-tls aws-lc) - endif() - - set(AWSSDK_LIBRARIES) - foreach(_AWSSDK_LIB ${_AWSSDK_LIBS}) - # aws-c-common -> AWS-C-COMMON - string(TOUPPER ${_AWSSDK_LIB} _AWSSDK_LIB_UPPER) - # AWS-C-COMMON -> AWS_C_COMMON - string(REPLACE "-" "_" _AWSSDK_LIB_NAME_PREFIX ${_AWSSDK_LIB_UPPER}) - set(_AWSSDK_STATIC_LIBRARY - "${AWSSDK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${_AWSSDK_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - if(${_AWSSDK_LIB} STREQUAL "s2n-tls") # Build output of s2n-tls is libs2n.a - set(_AWSSDK_STATIC_LIBRARY - "${AWSSDK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}s2n${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - elseif(${_AWSSDK_LIB} STREQUAL "aws-lc") # We only need libcrypto from aws-lc - set(_AWSSDK_STATIC_LIBRARY - "${AWS_LC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}crypto${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - endif() - if(${_AWSSDK_LIB} MATCHES "^aws-cpp-sdk-") - set(_AWSSDK_TARGET_NAME ${_AWSSDK_LIB}) - elseif(${_AWSSDK_LIB} STREQUAL "aws-lc") - set(_AWSSDK_TARGET_NAME AWS::crypto) - else() - set(_AWSSDK_TARGET_NAME 
AWS::${_AWSSDK_LIB}) - endif() - add_library(${_AWSSDK_TARGET_NAME} STATIC IMPORTED) - set_target_properties(${_AWSSDK_TARGET_NAME} PROPERTIES IMPORTED_LOCATION - ${_AWSSDK_STATIC_LIBRARY}) - target_include_directories(${_AWSSDK_TARGET_NAME} BEFORE - INTERFACE "${AWSSDK_INCLUDE_DIR}") - if(${_AWSSDK_LIB} STREQUAL "aws-lc") - set_target_properties(${_AWSSDK_TARGET_NAME} PROPERTIES IMPORTED_LOCATION - ${_AWSSDK_STATIC_LIBRARY}) - target_include_directories(${_AWSSDK_TARGET_NAME} BEFORE - INTERFACE "${AWS_LC_INCLUDE_DIR}") - endif() - set("${_AWSSDK_LIB_NAME_PREFIX}_STATIC_LIBRARY" ${_AWSSDK_STATIC_LIBRARY}) + # For aws-lc and s2n-tls + # + # Link time optimization is causing trouble like GH-34349 + string(REPLACE "-flto=auto" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + string(REPLACE "-ffat-lto-objects" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - if(NOT ${_AWSSDK_LIB} STREQUAL "aws-lc") - # aws-lc only linked against s2n but not arrow - list(APPEND AWSSDK_LIBRARIES ${_AWSSDK_TARGET_NAME}) - endif() - endforeach() + # For aws-c-io + if(MINGW AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9") + # This is for RTools 40. We can remove this after we dropped + # support for R < 4.2. schannel.h in RTools 40 is old. - externalproject_add(aws_c_common_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_COMMON_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_COMMON_BUILD_SHA256_CHECKSUM}" - PATCH_COMMAND ${AWS_C_COMMON_PATCH_COMMAND} - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_COMMON_STATIC_LIBRARY}) - add_dependencies(AWS::aws-c-common aws_c_common_ep) - - set(AWS_CHECKSUMS_CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS}) - if(NOT WIN32) - # On non-Windows, always build in release mode. 
- # Especially with gcc, debug builds can fail with "asm constraint" errors: - # https://github.com/TileDB-Inc/TileDB/issues/1351 - list(APPEND AWS_CHECKSUMS_CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release) + # For schannel.h + # + # See also: + # https://learn.microsoft.com/en-us/windows/win32/api/schannel/ns-schannel-schannel_cred + string(APPEND CMAKE_C_FLAGS " -DSP_PROT_TLS1_0_SERVER=0x00000040") + string(APPEND CMAKE_C_FLAGS " -DSP_PROT_TLS1_0_CLIENT=0x00000080") + string(APPEND CMAKE_C_FLAGS " -DSP_PROT_TLS1_1_SERVER=0x00000100") + string(APPEND CMAKE_C_FLAGS " -DSP_PROT_TLS1_1_CLIENT=0x00000200") + string(APPEND CMAKE_C_FLAGS " -DSP_PROT_TLS1_2_SERVER=0x00000400") + string(APPEND CMAKE_C_FLAGS " -DSP_PROT_TLS1_2_CLIENT=0x00000800") + string(APPEND CMAKE_C_FLAGS " -DSP_PROT_TLS1_3_SERVER=0x00001000") + string(APPEND CMAKE_C_FLAGS " -DSP_PROT_TLS1_3_CLIENT=0x00002000") + string(APPEND CMAKE_C_FLAGS " -DSCH_USE_STRONG_CRYPTO=0x00400000") + + # For sspi.h + # + # See also: + # https://learn.microsoft.com/en-us/windows/win32/api/sspi/ne-sspi-sec_application_protocol_negotiation_ext + string(APPEND CMAKE_C_FLAGS " -DSecApplicationProtocolNegotiationExt_ALPN=2") + # See also: + # https://learn.microsoft.com/en-us/windows/win32/api/sspi/ns-sspi-secbuffer + string(APPEND CMAKE_C_FLAGS " -DSECBUFFER_ALERT=17") endif() - externalproject_add(aws_checksums_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_CHECKSUMS_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_CHECKSUMS_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWS_CHECKSUMS_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_CHECKSUMS_STATIC_LIBRARY} - DEPENDS aws_c_common_ep) - add_dependencies(AWS::aws-checksums aws_checksums_ep) - - if("s2n-tls" IN_LIST _AWSSDK_LIBS) - # Remove unused directories to save build directory storage. 
- # 169MB -> 105MB - set(AWS_LC_PATCH_COMMAND ${CMAKE_COMMAND} -E) - if(CMAKE_VERSION VERSION_LESS 3.17) - list(APPEND AWS_LC_PATCH_COMMAND remove_directory) - else() - list(APPEND AWS_LC_PATCH_COMMAND rm -rf) - endif() - list(APPEND AWS_LC_PATCH_COMMAND fuzz) - - set(AWS_LC_C_FLAGS ${EP_C_FLAGS}) - string(APPEND AWS_LC_C_FLAGS " -Wno-error=overlength-strings -Wno-error=pedantic") - # Link time optimization is causing trouble like #34349 - string(REPLACE "-flto=auto" "" AWS_LC_C_FLAGS "${AWS_LC_C_FLAGS}") - string(REPLACE "-ffat-lto-objects" "" AWS_LC_C_FLAGS "${AWS_LC_C_FLAGS}") - set(AWS_LC_CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS}) - list(APPEND AWS_LC_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${AWS_LC_PREFIX} - -DCMAKE_C_FLAGS=${AWS_LC_C_FLAGS}) - - externalproject_add(aws_lc_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_LC_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_LC_BUILD_SHA256_CHECKSUM}" - PATCH_COMMAND ${AWS_LC_PATCH_COMMAND} - CMAKE_ARGS ${AWS_LC_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_LC_STATIC_LIBRARY}) - add_dependencies(AWS::crypto aws_lc_ep) - - set(S2N_TLS_C_FLAGS ${EP_C_FLAGS}) - # Link time optimization is causing trouble like #34349 - string(REPLACE "-flto=auto" "" S2N_TLS_C_FLAGS "${S2N_TLS_C_FLAGS}") - string(REPLACE "-ffat-lto-objects" "" S2N_TLS_C_FLAGS "${S2N_TLS_C_FLAGS}") - - set(S2N_TLS_CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS}) - list(APPEND - S2N_TLS_CMAKE_ARGS - # internalize libcrypto to avoid name conflict with OpenSSL - -DS2N_INTERN_LIBCRYPTO=ON - # path to find crypto provided by aws-lc - -DCMAKE_PREFIX_PATH=${AWS_LC_PREFIX} - -DCMAKE_C_FLAGS=${S2N_TLS_C_FLAGS} - # paths to find crypto provided by aws-lc - -Dcrypto_INCLUDE_DIR=${AWS_LC_PREFIX}/include - -Dcrypto_LIBRARY=${AWS_LC_STATIC_LIBRARY} - -Dcrypto_STATIC_LIBRARY=${AWS_LC_STATIC_LIBRARY}) - - externalproject_add(s2n_tls_ep - ${EP_COMMON_OPTIONS} - URL ${S2N_TLS_SOURCE_URL} - URL_HASH "SHA256=${ARROW_S2N_TLS_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${S2N_TLS_CMAKE_ARGS} - BUILD_BYPRODUCTS 
${S2N_TLS_STATIC_LIBRARY} - DEPENDS aws_lc_ep) - add_dependencies(AWS::s2n-tls s2n_tls_ep) + # For aws-sdk-cpp + # + # We need to use CACHE variables because aws-sdk-cpp < 1.12.0 uses + # CMP0077 OLD policy. We can use normal variables when we use + # aws-sdk-cpp >= 1.12.0. + set(AWS_SDK_WARNINGS_ARE_ERRORS + OFF + CACHE BOOL "" FORCE) + set(BUILD_DEPS + OFF + CACHE BOOL "" FORCE) + set(BUILD_ONLY + "" + CACHE STRING "" FORCE) + list(APPEND + BUILD_ONLY + config + core + identity-management + s3 + sts + transfer) + set(BUILD_SHARED_LIBS + OFF + CACHE BOOL "" FORCE) + set(ENABLE_TESTING + OFF + CACHE BOOL "" FORCE) + if(NOT WIN32) + if(ZLIB_VENDORED) + # Use vendored zlib. + set(ZLIB_INCLUDE_DIR + "$" + CACHE STRING "" FORCE) + set(ZLIB_LIBRARY + "$" + CACHE STRING "" FORCE) + endif() endif() + if(MINGW AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9") + # This is for RTools 40. We can remove this after we dropped + # support for R < 4.2. schannel.h in RTools 40 is old. - externalproject_add(aws_c_cal_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_CAL_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_CAL_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_CAL_STATIC_LIBRARY} - DEPENDS aws_c_common_ep) - add_dependencies(AWS::aws-c-cal aws_c_cal_ep) - - set(AWS_C_IO_DEPENDS aws_c_common_ep aws_c_cal_ep) - if(TARGET s2n_tls_ep) - list(APPEND AWS_C_IO_DEPENDS s2n_tls_ep) - endif() - externalproject_add(aws_c_io_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_IO_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_IO_BUILD_SHA256_CHECKSUM}" - PATCH_COMMAND ${AWS_C_IO_PATCH_COMMAND} - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_IO_STATIC_LIBRARY} - DEPENDS ${AWS_C_IO_DEPENDS}) - add_dependencies(AWS::aws-c-io aws_c_io_ep) - - externalproject_add(aws_c_event_stream_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_EVENT_STREAM_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_EVENT_STREAM_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - 
BUILD_BYPRODUCTS ${AWS_C_EVENT_STREAM_STATIC_LIBRARY} - DEPENDS aws_checksums_ep aws_c_io_ep) - add_dependencies(AWS::aws-c-event-stream aws_c_event_stream_ep) - - externalproject_add(aws_c_sdkutils_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_SDKUTILS_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_SDKUTILS_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_SDKUTILS_STATIC_LIBRARY} - DEPENDS aws_c_common_ep) - add_dependencies(AWS::aws-c-sdkutils aws_c_sdkutils_ep) - - externalproject_add(aws_c_compression_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_COMPRESSION_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_COMPRESSION_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_COMPRESSION_STATIC_LIBRARY} - DEPENDS aws_c_common_ep) - add_dependencies(AWS::aws-c-compression aws_c_compression_ep) - - externalproject_add(aws_c_http_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_HTTP_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_HTTP_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_HTTP_STATIC_LIBRARY} - DEPENDS aws_c_io_ep aws_c_compression_ep) - add_dependencies(AWS::aws-c-http aws_c_http_ep) - - externalproject_add(aws_c_mqtt_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_MQTT_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_MQTT_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_MQTT_STATIC_LIBRARY} - DEPENDS aws_c_http_ep) - add_dependencies(AWS::aws-c-mqtt aws_c_mqtt_ep) - - externalproject_add(aws_c_auth_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_AUTH_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_C_AUTH_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_AUTH_STATIC_LIBRARY} - DEPENDS aws_c_sdkutils_ep aws_c_cal_ep aws_c_http_ep) - add_dependencies(AWS::aws-c-auth aws_c_auth_ep) - - externalproject_add(aws_c_s3_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_C_S3_SOURCE_URL} - URL_HASH 
"SHA256=${ARROW_AWS_C_S3_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_COMMON_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_C_S3_STATIC_LIBRARY} - DEPENDS aws_checksums_ep aws_c_auth_ep) - add_dependencies(AWS::aws-c-s3 aws_c_s3_ep) - - externalproject_add(aws_crt_cpp_ep - ${EP_COMMON_OPTIONS} - URL ${AWS_CRT_CPP_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWS_CRT_CPP_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${AWSSDK_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_CRT_CPP_STATIC_LIBRARY} - DEPENDS aws_c_auth_ep - aws_c_cal_ep - aws_c_common_ep - aws_c_event_stream_ep - aws_c_http_ep - aws_c_io_ep - aws_c_mqtt_ep - aws_c_s3_ep - aws_checksums_ep) - add_dependencies(AWS::aws-crt-cpp aws_crt_cpp_ep) - - externalproject_add(awssdk_ep - ${EP_COMMON_OPTIONS} - URL ${AWSSDK_SOURCE_URL} - URL_HASH "SHA256=${ARROW_AWSSDK_BUILD_SHA256_CHECKSUM}" - PATCH_COMMAND ${AWSSDK_PATCH_COMMAND} - CMAKE_ARGS ${AWSSDK_CMAKE_ARGS} - BUILD_BYPRODUCTS ${AWS_CPP_SDK_COGNITO_IDENTITY_STATIC_LIBRARY} - ${AWS_CPP_SDK_CORE_STATIC_LIBRARY} - ${AWS_CPP_SDK_IDENTITY_MANAGEMENT_STATIC_LIBRARY} - ${AWS_CPP_SDK_S3_STATIC_LIBRARY} - ${AWS_CPP_SDK_STS_STATIC_LIBRARY} - DEPENDS aws_crt_cpp_ep) - foreach(_AWSSDK_LIB ${_AWSSDK_LIBS}) - if(${_AWSSDK_LIB} MATCHES "^aws-cpp-sdk-") - add_dependencies(${_AWSSDK_LIB} awssdk_ep) + # For winhttp.h + # + # See also: + # https://learn.microsoft.com/en-us/windows/win32/winhttp/error-messages + string(APPEND CMAKE_CXX_FLAGS " -DERROR_WINHTTP_UNHANDLED_SCRIPT_TYPE=12176") + string(APPEND CMAKE_CXX_FLAGS " -DERROR_WINHTTP_SCRIPT_EXECUTION_ERROR=12177") + # See also: + # https://learn.microsoft.com/en-us/windows/win32/api/winhttp/ns-winhttp-winhttp_async_result + string(APPEND CMAKE_CXX_FLAGS " -DAPI_GET_PROXY_FOR_URL=6") + # See also: + # https://learn.microsoft.com/en-us/windows/win32/api/winhttp/nc-winhttp-winhttp_status_callback + string(APPEND CMAKE_CXX_FLAGS " -DWINHTTP_CALLBACK_STATUS_CLOSE_COMPLETE=0x02000000") + string(APPEND CMAKE_CXX_FLAGS + " 
-DWINHTTP_CALLBACK_STATUS_SHUTDOWN_COMPLETE=0x04000000") + # See also: + # https://learn.microsoft.com/en-us/windows/win32/winhttp/option-flags + string(APPEND CMAKE_CXX_FLAGS " -DWINHTTP_FLAG_SECURE_PROTOCOL_TLS1_2=0x00000800") + string(APPEND CMAKE_CXX_FLAGS " -DWINHTTP_NO_CLIENT_CERT_CONTEXT=0") + endif() + + set(AWSSDK_LINK_LIBRARIES) + foreach(AWSSDK_PRODUCT ${AWSSDK_PRODUCTS}) + if("${AWSSDK_PRODUCT}" STREQUAL "s2n-tls") + # Use aws-lc's openssl/*.h not openssl/*.h in system. + set(ADDITIONAL_FLAGS "-DCOMPILE_DEFINITIONS=-I${aws-lc_SOURCE_DIR}/include") endif() - endforeach() - - set(AWSSDK_VENDORED TRUE) - list(APPEND ARROW_BUNDLED_STATIC_LIBS ${AWSSDK_LIBRARIES}) - set(AWSSDK_LINK_LIBRARIES ${AWSSDK_LIBRARIES}) - if(UNIX) - # on Linux and macOS curl seems to be required - set_property(TARGET aws-cpp-sdk-core - APPEND - PROPERTY INTERFACE_LINK_LIBRARIES CURL::libcurl) - set_property(TARGET AWS::aws-c-cal - APPEND - PROPERTY INTERFACE_LINK_LIBRARIES OpenSSL::Crypto OpenSSL::SSL) - if(APPLE) - set_property(TARGET AWS::aws-c-cal - APPEND - PROPERTY INTERFACE_LINK_LIBRARIES "-framework Security") + fetchcontent_makeavailable(${AWSSDK_PRODUCT}) + if(CMAKE_VERSION VERSION_LESS 3.28) + set_property(DIRECTORY ${${AWSSDK_PRODUCT}_SOURCE_DIR} PROPERTY EXCLUDE_FROM_ALL + TRUE) endif() - if(ZLIB_VENDORED) - set_property(TARGET aws-cpp-sdk-core - APPEND - PROPERTY INTERFACE_LINK_LIBRARIES ZLIB::ZLIB) - add_dependencies(awssdk_ep zlib_ep) + list(PREPEND CMAKE_MODULE_PATH "${${AWSSDK_PRODUCT}_SOURCE_DIR}/cmake") + if(NOT "${AWSSDK_PRODUCT}" STREQUAL "aws-sdk-cpp") + if("${AWSSDK_PRODUCT}" STREQUAL "aws-lc") + # We don't need to link aws-lc. It's used only by s2n-tls. + elseif("${AWSSDK_PRODUCT}" STREQUAL "s2n-tls") + list(PREPEND AWSSDK_LINK_LIBRARIES s2n) + else() + list(PREPEND AWSSDK_LINK_LIBRARIES ${AWSSDK_PRODUCT}) + # This is for find_package(aws-*) in aws-crt-cpp and aws-sdk-cpp. 
+ add_library(AWS::${AWSSDK_PRODUCT} ALIAS ${AWSSDK_PRODUCT}) + endif() endif() - set_property(TARGET AWS::aws-c-io - APPEND - PROPERTY INTERFACE_LINK_LIBRARIES ${CMAKE_DL_LIBS}) - elseif(WIN32) - set_property(TARGET aws-cpp-sdk-core - APPEND - PROPERTY INTERFACE_LINK_LIBRARIES - "winhttp.lib" - "bcrypt.lib" - "wininet.lib" - "userenv.lib" - "version.lib") - set_property(TARGET AWS::aws-c-cal - APPEND - PROPERTY INTERFACE_LINK_LIBRARIES - "bcrypt.lib" - "ncrypt.lib" - "Secur32.lib" - "Shlwapi.lib") - set_property(TARGET AWS::aws-c-io - APPEND - PROPERTY INTERFACE_LINK_LIBRARIES "crypt32.lib") - endif() + endforeach() + list(PREPEND + AWSSDK_LINK_LIBRARIES + aws-cpp-sdk-identity-management + aws-cpp-sdk-sts + aws-cpp-sdk-cognito-identity + aws-cpp-sdk-s3 + aws-cpp-sdk-core) + + set(AWSSDK_VENDORED + TRUE + PARENT_SCOPE) + set(ARROW_BUNDLED_STATIC_LIBS + ${ARROW_BUNDLED_STATIC_LIBS} ${AWSSDK_LINK_LIBRARIES} + PARENT_SCOPE) + set(AWSSDK_LINK_LIBRARIES + ${AWSSDK_LINK_LIBRARIES} + PARENT_SCOPE) - # AWSSDK is static-only build -endmacro() + list(POP_BACK CMAKE_MESSAGE_INDENT) +endfunction() if(ARROW_S3) - resolve_dependency(AWSSDK HAVE_ALT TRUE) + if(NOT WIN32) + # This is for adding system curl dependency. + find_curl() + endif() + # Keep this in sync with s3fs.cc + resolve_dependency(AWSSDK + HAVE_ALT + TRUE + REQUIRED_VERSION + 1.11.0) message(STATUS "Found AWS SDK headers: ${AWSSDK_INCLUDE_DIR}") message(STATUS "Found AWS SDK libraries: ${AWSSDK_LINK_LIBRARIES}") @@ -5494,32 +5297,12 @@ if(ARROW_S3) endif() endif() endif() - - if(APPLE) - # CoreFoundation's path is hardcoded in the CMake files provided by - # aws-sdk-cpp to use the macOS SDK provided by XCode which makes - # XCode a hard dependency. Command Line Tools is often used instead - # of the full XCode suite, so let the linker to find it. 
- set_target_properties(AWS::aws-c-common - PROPERTIES INTERFACE_LINK_LIBRARIES - "-pthread;pthread;-framework CoreFoundation") - endif() endif() # ---------------------------------------------------------------------- # Azure SDK for C++ function(build_azure_sdk) - if(CMAKE_VERSION VERSION_LESS 3.22) - # We can't disable installing Azure SDK for C++ by - # "set_property(DIRECTORY ${azure_sdk_SOURCE_DIR} PROPERTY - # EXCLUDE_FROM_ALL TRUE)" with CMake 3.16. - # - # At least CMake 3.22 on Ubuntu 22.04 works. So we use 3.22 - # here. We may be able to use more earlier version here. - message(FATAL_ERROR "Building Azure SDK for C++ requires at least CMake 3.22. " - "(At least we can't use CMake 3.16)") - endif() message(STATUS "Building Azure SDK for C++ from source") fetchcontent_declare(azure_sdk ${FC_DECLARE_COMMON_OPTIONS} @@ -5547,15 +5330,13 @@ function(build_azure_sdk) set(AZURE_SDK_VENDORED TRUE PARENT_SCOPE) - list(APPEND - ARROW_BUNDLED_STATIC_LIBS - Azure::azure-core - Azure::azure-identity - Azure::azure-storage-blobs - Azure::azure-storage-common - Azure::azure-storage-files-datalake) set(ARROW_BUNDLED_STATIC_LIBS ${ARROW_BUNDLED_STATIC_LIBS} + Azure::azure-core + Azure::azure-identity + Azure::azure-storage-blobs + Azure::azure-storage-common + Azure::azure-storage-files-datalake PARENT_SCOPE) endfunction() @@ -5565,4 +5346,11 @@ if(ARROW_WITH_AZURE_SDK) Azure::azure-storage-blobs Azure::azure-identity) endif() +# ---------------------------------------------------------------------- +# Apache Flight SQL ODBC + +if(ARROW_FLIGHT_SQL_ODBC) + find_package(ODBC REQUIRED) +endif() + message(STATUS "All bundled static libraries: ${ARROW_BUNDLED_STATIC_LIBS}") diff --git a/cpp/cmake_modules/aws_sdk_cpp_generate_variables.sh b/cpp/cmake_modules/aws_sdk_cpp_generate_variables.sh deleted file mode 100755 index 79b560a4a14..00000000000 --- a/cpp/cmake_modules/aws_sdk_cpp_generate_variables.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache 
Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -eu - -version=$1 - -base_dir="$(dirname "$0")" -output="${base_dir}/AWSSDKVariables.cmake" - -cat <
${output} -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Generated by: -# $ cpp/cmake_modules/aws_sdk_cpp_generate_variables.sh ${version} - -HEADER - -rm -f ${version}.tar.gz -wget https://github.com/aws/aws-sdk-cpp/archive/${version}.tar.gz -base_name=aws-sdk-cpp-${version} -rm -rf ${base_name} -tar xf ${version}.tar.gz - -echo "set(AWSSDK_UNUSED_DIRECTORIES" >> ${output} -find ${base_name} -mindepth 1 -maxdepth 1 -type d | \ - sort | \ - grep -v cmake | \ - grep -v toolchains | \ - grep -v aws-cpp-sdk-cognito-identity | \ - grep -v aws-cpp-sdk-core | \ - grep -v aws-cpp-sdk-config | \ - grep -v aws-cpp-sdk-s3 | \ - grep -v aws-cpp-sdk-transfer | \ - grep -v aws-cpp-sdk-identity-management | \ - grep -v aws-cpp-sdk-sts | \ - sed -E -e "s,^${base_name}/, ,g" >> ${output} -echo ")" >> ${output} - -rm -rf ${base_name} -rm -f ${version}.tar.gz diff --git a/cpp/cmake_modules/mimalloc-1138.patch b/cpp/cmake_modules/mimalloc-1138.patch new file mode 100644 index 00000000000..1ffa4bffbba --- /dev/null +++ b/cpp/cmake_modules/mimalloc-1138.patch @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +Fix for https://github.com/microsoft/mimalloc/issues/1138 + +diff --git a/src/arena.c b/src/arena.c +index b26f4442..d7e99b55 100644 +--- a/src/arena.c ++++ b/src/arena.c +@@ -797,6 +797,9 @@ mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t bloc + else { + page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); + } ++ if mi_unlikely(page == NULL) { ++ return NULL; ++ } + // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); diff --git a/cpp/cmake_modules/orc-2345.patch b/cpp/cmake_modules/orc-2345.patch new file mode 100644 index 00000000000..ee5e38d6e6a --- /dev/null +++ b/cpp/cmake_modules/orc-2345.patch @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +From a76249e13a6e364e0507a12cb71abaaf1647252e Mon Sep 17 00:00:00 2001 +From: Yuriy Chernyshov +Date: Thu, 31 Jul 2025 13:20:15 +0200 +Subject: [PATCH] Fix Windows build + +See +https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/byteswap-uint64-byteswap-ulong-byteswap-ushort?view=msvc-170 +--- + c++/src/Geospatial.cc | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc +index 6d7d268703..2b110cacb6 100644 +--- a/c++/src/Geospatial.cc ++++ b/c++/src/Geospatial.cc +@@ -66,8 +66,8 @@ namespace orc::geospatial { + + #if defined(_MSC_VER) + #include // IWYU pragma: keep +-#define ORC_BYTE_SWAP64 _byteSwap_uint64 +-#define ORC_BYTE_SWAP32 _byteSwap_ulong ++#define ORC_BYTE_SWAP64 _byteswap_uint64 ++#define ORC_BYTE_SWAP32 _byteswap_ulong + #else + #define ORC_BYTE_SWAP64 __builtin_bswap64 + #define ORC_BYTE_SWAP32 __builtin_bswap32 diff --git a/cpp/cmake_modules/orc-2357.patch b/cpp/cmake_modules/orc-2357.patch new file mode 100644 index 00000000000..41096e10429 --- /dev/null +++ b/cpp/cmake_modules/orc-2357.patch @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +From a66baec5731b65a81189f48c242433d01580f344 Mon Sep 17 00:00:00 2001 +From: Dongjoon Hyun +Date: Fri, 15 Aug 2025 12:31:09 -0700 +Subject: [PATCH] ORC-1973: [C++] Use `int64_t` instead of + `google::protobuf::int64` + +--- + c++/src/io/InputStream.cc | 4 ++-- + c++/src/io/InputStream.hh | 2 +- + c++/src/io/OutputStream.cc | 4 ++-- + c++/src/io/OutputStream.hh | 2 +- + 4 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/c++/src/io/InputStream.cc b/c++/src/io/InputStream.cc +index 06ef40bd4c..5e1dc00ccd 100644 +--- a/c++/src/io/InputStream.cc ++++ b/c++/src/io/InputStream.cc +@@ -112,8 +112,8 @@ namespace orc { + return false; + } + +- google::protobuf::int64 SeekableArrayInputStream::ByteCount() const { +- return static_cast(position_); ++ int64_t SeekableArrayInputStream::ByteCount() const { ++ return static_cast(position_); + } + + void SeekableArrayInputStream::seek(PositionProvider& seekPosition) { +diff --git a/c++/src/io/InputStream.hh b/c++/src/io/InputStream.hh +index 07aa623b5f..8b251c9301 100644 +--- a/c++/src/io/InputStream.hh ++++ b/c++/src/io/InputStream.hh +@@ -72,7 +72,7 @@ namespace orc { + virtual bool Next(const void** data, int* size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; +- virtual google::protobuf::int64 ByteCount() const override; ++ virtual int64_t ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + }; +diff --git a/c++/src/io/OutputStream.cc b/c++/src/io/OutputStream.cc +index 
fbf1ca61dd..a55050d122 100644 +--- a/c++/src/io/OutputStream.cc ++++ b/c++/src/io/OutputStream.cc +@@ -65,8 +65,8 @@ namespace orc { + // PASS + } + +- google::protobuf::int64 BufferedOutputStream::ByteCount() const { +- return static_cast(dataBuffer_->size()); ++ int64_t BufferedOutputStream::ByteCount() const { ++ return static_cast(dataBuffer_->size()); + } + + bool BufferedOutputStream::WriteAliasedRaw(const void*, int) { +diff --git a/c++/src/io/OutputStream.hh b/c++/src/io/OutputStream.hh +index 6319de96d6..b029818125 100644 +--- a/c++/src/io/OutputStream.hh ++++ b/c++/src/io/OutputStream.hh +@@ -61,7 +61,7 @@ namespace orc { + + virtual bool Next(void** data, int* size) override; + virtual void BackUp(int count) override; +- virtual google::protobuf::int64 ByteCount() const override; ++ virtual int64_t ByteCount() const override; + virtual bool WriteAliasedRaw(const void* data, int size) override; + virtual bool AllowsAliasing() const override; + diff --git a/cpp/cmake_modules/snappy.diff b/cpp/cmake_modules/snappy.diff deleted file mode 100644 index e763636e1da..00000000000 --- a/cpp/cmake_modules/snappy.diff +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -# https://github.com/google/snappy/pull/172 - -diff --git a/snappy.cc b/snappy.cc -index d414718..5b0d0d6 100644 ---- a/snappy.cc -+++ b/snappy.cc -@@ -83,6 +83,7 @@ - #include - #include - #include -+#include - - namespace snappy { - diff --git a/cpp/cmake_modules/thrift-3187.patch b/cpp/cmake_modules/thrift-3187.patch new file mode 100644 index 00000000000..44a91614881 --- /dev/null +++ b/cpp/cmake_modules/thrift-3187.patch @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +From ad893633097b05ecdba8aa0f27aaf173dc7839b2 Mon Sep 17 00:00:00 2001 +From: Sutou Kouhei +Date: Fri, 8 Aug 2025 16:19:10 +0900 +Subject: [PATCH] THRIFT-3268: Suppress gnu-zero-variadic-macro-arguments + warnings + +Client: cpp + +We can reproduce these warnings by: + + CC=clang CXX=clang++ \ + cmake \ + -S . 
\ + -B ../thrift.build \ + -DWITH_{AS3,JAVA,JAVASCRIPT,NODEJS,PYTHON,C_GLIB}=OFF \ + -DCMAKE_CXX_FLAGS="-Wgnu-zero-variadic-macro-arguments" + cmake --build ../thrift.build + +Sample warning: + + lib/cpp/src/thrift/TLogging.h:119:13: warning: token pasting of ',' and __VA_ARGS__ is a GNU extension [-Wgnu-zero-variadic-macro-arguments] + 119 | ##__VA_ARGS__); \ + | ^ +--- + lib/cpp/src/thrift/TLogging.h | 12 ++++++------ + lib/cpp/test/TransportTest.cpp | 12 ++++++------ + lib/cpp/test/ZlibTest.cpp | 6 ++---- + 3 files changed, 14 insertions(+), 16 deletions(-) + +diff --git a/lib/cpp/src/thrift/TLogging.h b/lib/cpp/src/thrift/TLogging.h +index 07ff030f7da..64e9bf80bbb 100644 +--- a/lib/cpp/src/thrift/TLogging.h ++++ b/lib/cpp/src/thrift/TLogging.h +@@ -55,7 +55,7 @@ + #if T_GLOBAL_DEBUGGING_LEVEL > 0 + #define T_DEBUG(format_string, ...) \ + if (T_GLOBAL_DEBUGGING_LEVEL > 0) { \ +- fprintf(stderr, "[%s,%d] " format_string " \n", __FILE__, __LINE__, ##__VA_ARGS__); \ ++ fprintf(stderr, "[%s,%d] " format_string " \n", __FILE__, __LINE__, __VA_ARGS__); \ + } + #else + #define T_DEBUG(format_string, ...) +@@ -80,7 +80,7 @@ + __FILE__, \ + __LINE__, \ + dbgtime, \ +- ##__VA_ARGS__); \ ++ __VA_ARGS__); \ + } \ + } + #else +@@ -96,7 +96,7 @@ + */ + #define T_DEBUG_L(level, format_string, ...) 
\ + if ((level) > 0) { \ +- fprintf(stderr, "[%s,%d] " format_string " \n", __FILE__, __LINE__, ##__VA_ARGS__); \ ++ fprintf(stderr, "[%s,%d] " format_string " \n", __FILE__, __LINE__, __VA_ARGS__); \ + } + + /** +@@ -116,7 +116,7 @@ + __FILE__, \ + __LINE__, \ + dbgtime, \ +- ##__VA_ARGS__); \ ++ __VA_ARGS__); \ + } + + /** +@@ -137,7 +137,7 @@ + __FILE__, \ + __LINE__, \ + dbgtime, \ +- ##__VA_ARGS__); \ ++ __VA_ARGS__); \ + exit(1); \ + } + +@@ -155,7 +155,7 @@ + time(&now); \ + THRIFT_CTIME_R(&now, dbgtime); \ + dbgtime[24] = '\0'; \ +- fprintf(stderr, "[%s] " format_string " \n", dbgtime, ##__VA_ARGS__); \ ++ fprintf(stderr, "[%s] " format_string " \n", dbgtime, __VA_ARGS__); \ + } \ + } + #else +diff --git a/lib/cpp/test/TransportTest.cpp b/lib/cpp/test/TransportTest.cpp +index d6d38595a6b..8a05465773a 100644 +--- a/lib/cpp/test/TransportTest.cpp ++++ b/lib/cpp/test/TransportTest.cpp +@@ -784,23 +784,23 @@ void test_borrow_none_available() { + **************************************************************************/ + + #define ADD_TEST_RW(CoupledTransports, totalSize, ...) \ +- addTestRW(BOOST_STRINGIZE(CoupledTransports), totalSize, ##__VA_ARGS__); ++ addTestRW(BOOST_STRINGIZE(CoupledTransports), totalSize, __VA_ARGS__); + + #define TEST_RW(CoupledTransports, totalSize, ...) 
\ + do { \ + /* Add the test as specified, to test the non-virtual function calls */ \ +- ADD_TEST_RW(CoupledTransports, totalSize, ##__VA_ARGS__); \ ++ ADD_TEST_RW(CoupledTransports, totalSize, __VA_ARGS__); \ + /* \ + * Also test using the transport as a TTransport*, to test \ + * the read_virt()/write_virt() calls \ + */ \ +- ADD_TEST_RW(CoupledTTransports, totalSize, ##__VA_ARGS__); \ ++ ADD_TEST_RW(CoupledTTransports, totalSize, __VA_ARGS__); \ + /* Test wrapping the transport with TBufferedTransport */ \ +- ADD_TEST_RW(CoupledBufferedTransportsT, totalSize, ##__VA_ARGS__); \ ++ ADD_TEST_RW(CoupledBufferedTransportsT, totalSize, __VA_ARGS__); \ + /* Test wrapping the transport with TFramedTransports */ \ +- ADD_TEST_RW(CoupledFramedTransportsT, totalSize, ##__VA_ARGS__); \ ++ ADD_TEST_RW(CoupledFramedTransportsT, totalSize, __VA_ARGS__); \ + /* Test wrapping the transport with TZlibTransport */ \ +- ADD_TEST_RW(CoupledZlibTransportsT, totalSize, ##__VA_ARGS__); \ ++ ADD_TEST_RW(CoupledZlibTransportsT, totalSize, __VA_ARGS__); \ + } while (0) + + #define ADD_TEST_BLOCKING(CoupledTransports) \ +diff --git a/lib/cpp/test/ZlibTest.cpp b/lib/cpp/test/ZlibTest.cpp +index 274a243913c..ea9c617f625 100644 +--- a/lib/cpp/test/ZlibTest.cpp ++++ b/lib/cpp/test/ZlibTest.cpp +@@ -347,8 +347,7 @@ void test_get_underlying_transport() { + do { \ + ::std::ostringstream name_ss; \ + name_ss << name << "-" << BOOST_STRINGIZE(_FUNC); \ +- ::std::function test_func = \ +- ::std::bind(_FUNC, ##__VA_ARGS__); \ ++ ::std::function test_func = ::std::bind(_FUNC, __VA_ARGS__); \ + ::boost::unit_test::test_case* tc \ + = ::boost::unit_test::make_test_case(test_func, name_ss.str(), __FILE__, __LINE__); \ + (suite)->add(tc); \ +@@ -359,8 +358,7 @@ void test_get_underlying_transport() { + ::std::ostringstream name_ss; \ + name_ss << name << "-" << BOOST_STRINGIZE(_FUNC); \ + ::boost::unit_test::test_case* tc \ +- = ::boost::unit_test::make_test_case(::std::bind(_FUNC, \ +- ##__VA_ARGS__), \ 
++ = ::boost::unit_test::make_test_case(::std::bind(_FUNC, __VA_ARGS__), \ + name_ss.str()); \ + (suite)->add(tc); \ + } while (0) diff --git a/cpp/cmake_modules/thrift-cstdint.patch b/cpp/cmake_modules/thrift-cstdint.patch deleted file mode 100644 index b670ba695e2..00000000000 --- a/cpp/cmake_modules/thrift-cstdint.patch +++ /dev/null @@ -1,68 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# https://github.com/apache/thrift/pull/3078 - -From 1920f04398ca32e320f6cf942534ba9d8b3231fd Mon Sep 17 00:00:00 2001 -From: Sutou Kouhei -Date: Mon, 23 Dec 2024 12:33:22 +0900 -Subject: [PATCH] THRIFT-5842: Add missing cstdint include for int64_t in - Mutex.h - -Client: cpp - -GCC 15 (not released yet) requires `#include ` for `int64_t` -but `lib/cpp/src/thrift/concurrency/Mutex.h` doesn't have it. 
So we -can't build Thrift with GCC 15: - - [80/359] Building CXX object lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o - FAILED: lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o - /bin/g++-15 -DBOOST_ALL_DYN_LINK -DBOOST_TEST_DYN_LINK -DTHRIFT_STATIC_DEFINE -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/kou/work/cpp/thrift.kou.build/lib/cpp -I/home/kou/work/cpp/thrift.kou/lib/cpp -I/home/kou/work/cpp/thrift.kou.build -I/home/kou/work/cpp/thrift.kou/lib/cpp/src -g -std=c++11 -MD -MT lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o -MF lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o.d -o lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLServerSocket.cpp.o -c /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/transport/TSSLServerSocket.cpp - In file included from /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/transport/TServerSocket.h:25, - from /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/transport/TSSLServerSocket.h:23, - from /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/transport/TSSLServerSocket.cpp:21: - /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/concurrency/Mutex.h:47:26: error: 'int64_t' has not been declared - 47 | virtual bool timedlock(int64_t milliseconds) const; - | ^~~~~~~ - /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/concurrency/Mutex.h:25:1: note: 'int64_t' is defined in header ''; this is probably fixable by adding '#include ' - 24 | #include - +++ |+#include - 25 | - /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/concurrency/Mutex.h:60:29: error: 'int64_t' has not been declared - 60 | Guard(const Mutex& value, int64_t timeout = 0) : mutex_(&value) { - | ^~~~~~~ - /home/kou/work/cpp/thrift.kou/lib/cpp/src/thrift/concurrency/Mutex.h:60:29: note: 'int64_t' is defined in header ''; this is probably fixable by adding '#include ' - -See also: https://github.com/apache/arrow/issues/45096 ---- - 
lib/cpp/src/thrift/concurrency/Mutex.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/lib/cpp/src/thrift/concurrency/Mutex.h b/lib/cpp/src/thrift/concurrency/Mutex.h -index 1e5c3fba3..12f1729d6 100644 ---- a/lib/cpp/src/thrift/concurrency/Mutex.h -+++ b/lib/cpp/src/thrift/concurrency/Mutex.h -@@ -20,6 +20,7 @@ - #ifndef _THRIFT_CONCURRENCY_MUTEX_H_ - #define _THRIFT_CONCURRENCY_MUTEX_H_ 1 - -+#include - #include - #include - --- -2.45.2 diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index ef4beaaca2c..82c075c51df 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -19,6 +19,7 @@ add_arrow_example(row_wise_conversion_example) if(ARROW_WITH_RAPIDJSON) add_arrow_example(rapidjson_row_converter EXTRA_LINK_LIBS RapidJSON) + add_arrow_example(from_json_string_example EXTRA_LINK_LIBS RapidJSON) endif() if(ARROW_ACERO) @@ -42,7 +43,13 @@ if(ARROW_SUBSTRAIT) endif() if(ARROW_COMPUTE AND ARROW_CSV) - add_arrow_example(compute_and_write_csv_example) + if(ARROW_BUILD_SHARED) + set(COMPUTE_KERNELS_LINK_LIBS arrow_compute_shared) + else() + set(COMPUTE_KERNELS_LINK_LIBS arrow_compute_static) + endif() + add_arrow_example(compute_and_write_csv_example EXTRA_LINK_LIBS + ${COMPUTE_KERNELS_LINK_LIBS}) endif() if(ARROW_FLIGHT) @@ -181,19 +188,6 @@ if(ARROW_PARQUET AND ARROW_DATASET) endif() add_arrow_example(udf_example) - - if(ARROW_SKYHOOK) - if(ARROW_BUILD_SHARED) - list(APPEND DATASET_EXAMPLES_LINK_LIBS arrow_skyhook_shared) - else() - list(APPEND DATASET_EXAMPLES_LINK_LIBS arrow_skyhook_static) - endif() - - add_arrow_example(dataset_skyhook_scan_example EXTRA_LINK_LIBS - ${DATASET_EXAMPLES_LINK_LIBS}) - add_dependencies(dataset-skyhook-scan-example parquet) - endif() - endif() if(ARROW_GANDIVA) diff --git a/cpp/examples/arrow/compute_and_write_csv_example.cc b/cpp/examples/arrow/compute_and_write_csv_example.cc index 7e0f6cdf1ce..234d6abf570 100644 --- 
a/cpp/examples/arrow/compute_and_write_csv_example.cc +++ b/cpp/examples/arrow/compute_and_write_csv_example.cc @@ -41,6 +41,7 @@ // in the current directory arrow::Status RunMain(int argc, char** argv) { + ARROW_RETURN_NOT_OK(arrow::compute::Initialize()); // Make Arrays arrow::NumericBuilder int64_builder; arrow::BooleanBuilder boolean_builder; diff --git a/cpp/examples/arrow/dataset_documentation_example.cc b/cpp/examples/arrow/dataset_documentation_example.cc index c78f6d59849..3320918c623 100644 --- a/cpp/examples/arrow/dataset_documentation_example.cc +++ b/cpp/examples/arrow/dataset_documentation_example.cc @@ -19,6 +19,7 @@ // intended to be paired with the documentation. #include +#include #include #include #include @@ -326,6 +327,8 @@ arrow::Result> FilterPartitionedDataset( arrow::Status RunDatasetDocumentation(const std::string& format_name, const std::string& uri, const std::string& mode) { + ARROW_RETURN_NOT_OK(arrow::compute::Initialize()); + std::string base_path; std::shared_ptr format; std::string root_path; diff --git a/cpp/examples/arrow/dataset_skyhook_scan_example.cc b/cpp/examples/arrow/dataset_skyhook_scan_example.cc deleted file mode 100644 index a32a6f5c4fe..00000000000 --- a/cpp/examples/arrow/dataset_skyhook_scan_example.cc +++ /dev/null @@ -1,184 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include "arrow/compute/expression.h" - -#include -#include - -using arrow::field; -using arrow::int16; -using arrow::Schema; -using arrow::Table; - -namespace fs = arrow::fs; - -namespace ds = arrow::dataset; - -namespace cp = arrow::compute; - -struct Configuration { - // Indicates if the Scanner::ToTable should consume in parallel. - bool use_threads = true; - - // Indicates to the Scan operator which columns are requested. This - // optimization avoid deserializing unneeded columns. - std::vector projected_columns = {"total_amount"}; - - // Indicates the filter by which rows will be filtered. This optimization can - // make use of partition information and/or file metadata if possible. - cp::Expression filter = cp::greater(cp::field_ref("payment_type"), cp::literal(1)); - - ds::InspectOptions inspect_options{}; - ds::FinishOptions finish_options{}; -} kConf; - -arrow::Result> GetDatasetFromDirectory( - std::shared_ptr fs, std::shared_ptr format, - std::string dir) { - // Find all files under `path` - fs::FileSelector s; - s.base_dir = dir; - s.recursive = true; - - // Set partitioning strategy - ds::FileSystemFactoryOptions options; - options.partitioning = std::make_shared( - arrow::schema({arrow::field("payment_type", arrow::int32()), - arrow::field("VendorID", arrow::int32())})); - - // The factory will try to build a dataset. 
- ARROW_ASSIGN_OR_RAISE(auto factory, - ds::FileSystemDatasetFactory::Make(fs, s, format, options)); - - // Try to infer a common schema for all files. - ARROW_ASSIGN_OR_RAISE(auto schema, factory->Inspect(kConf.inspect_options)); - // Caller can optionally decide another schema as long as it is compatible - // with the previous one, e.g. `factory->Finish(compatible_schema)`. - ARROW_ASSIGN_OR_RAISE(auto dataset, factory->Finish(kConf.finish_options)); - - return dataset; -} - -arrow::Result> GetDatasetFromFile( - std::shared_ptr fs, std::shared_ptr format, - std::string file) { - ds::FileSystemFactoryOptions options; - // The factory will try to build a dataset. - ARROW_ASSIGN_OR_RAISE(auto factory, - ds::FileSystemDatasetFactory::Make(fs, {file}, format, options)); - - // Try to infer a common schema for all files. - ARROW_ASSIGN_OR_RAISE(auto schema, factory->Inspect(kConf.inspect_options)); - // Caller can optionally decide another schema as long as it is compatible - // with the previous one, e.g. `factory->Finish(compatible_schema)`. 
- ARROW_ASSIGN_OR_RAISE(auto dataset, factory->Finish(kConf.finish_options)); - - return dataset; -} - -arrow::Result> GetDatasetFromPath( - std::shared_ptr fs, std::shared_ptr format, - std::string path) { - ARROW_ASSIGN_OR_RAISE(auto info, fs->GetFileInfo(path)); - if (info.IsDirectory()) { - return GetDatasetFromDirectory(fs, format, path); - } - return GetDatasetFromFile(fs, format, path); -} - -arrow::Result> GetScannerFromDataset( - std::shared_ptr dataset, std::vector columns, - cp::Expression filter, bool use_threads) { - ARROW_ASSIGN_OR_RAISE(auto scanner_builder, dataset->NewScan()); - - if (!columns.empty()) { - ARROW_RETURN_NOT_OK(scanner_builder->Project(columns)); - } - - ARROW_RETURN_NOT_OK(scanner_builder->Filter(filter)); - - ARROW_RETURN_NOT_OK(scanner_builder->UseThreads(use_threads)); - - return scanner_builder->Finish(); -} - -arrow::Result> InstantiateSkyhookFormat() { - // Path to the Ceph configuration file. It contains cluster wide configuration - // and most importantly the connection information to the Ceph cluster. - std::string ceph_config_path = "/etc/ceph/ceph.conf"; - - // Ceph data pool containing the objects to be scanned. - // The default data pool is "cephfs_data". - std::string ceph_data_pool = "cephfs_data"; - - // The user accessing the Ceph cluster. The default username is "client.admin". - std::string ceph_user_name = "client.admin"; - - // Cluster name is an unique identifier for a Ceph cluster. It is especially - // required when you run multiple Ceph clusters on a multi-site architecture - // where the cluster name identifies the Ceph cluster for the - // current session. The default cluster name is "ceph". - std::string ceph_cluster_name = "ceph"; - - // CLS name is used to identify the shared library that needs to be loaded - // in the Ceph OSDs when invoking an object class method. For Skyhook, the - // library name is "libcls_skyhook.so", and the object class name is "skyhook". 
- std::string ceph_cls_name = "skyhook"; - std::shared_ptr rados_ctx = - std::make_shared(ceph_config_path, ceph_data_pool, - ceph_user_name, ceph_cluster_name, - ceph_cls_name); - ARROW_ASSIGN_OR_RAISE(auto format, - skyhook::SkyhookFileFormat::Make(rados_ctx, "parquet")); - return format; -} - -arrow::Status Main(std::string dataset_root) { - ARROW_ASSIGN_OR_RAISE(auto format, InstantiateSkyhookFormat()); - std::string path; - - ARROW_ASSIGN_OR_RAISE(auto fs, fs::FileSystemFromUri(dataset_root, &path)); - ARROW_ASSIGN_OR_RAISE(auto dataset, GetDatasetFromPath(fs, format, path)); - ARROW_ASSIGN_OR_RAISE( - auto scanner, GetScannerFromDataset(dataset, kConf.projected_columns, kConf.filter, - kConf.use_threads)); - ARROW_ASSIGN_OR_RAISE(auto table, scanner->ToTable()); - std::cout << "Table size: " << table->num_rows() << "\n"; - return arrow::Status::OK(); -} - -int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. - return EXIT_SUCCESS; - } - auto status = Main(argv[1]); - if (!status.ok()) { - std::cerr << status.ToString() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; -} diff --git a/cpp/examples/arrow/execution_plan_documentation_examples.cc b/cpp/examples/arrow/execution_plan_documentation_examples.cc index b92f5801c14..a8d50b22e6b 100644 --- a/cpp/examples/arrow/execution_plan_documentation_examples.cc +++ b/cpp/examples/arrow/execution_plan_documentation_examples.cc @@ -825,14 +825,13 @@ enum ExampleMode { }; int main(int argc, char** argv) { - if (argc < 3) { - // Fake success for CI purposes. - return EXIT_SUCCESS; + int mode = argc > 1 ? std::atoi(argv[2]) : SOURCE_SINK; + std::string base_save_path = argc > 2 ? 
argv[2] : ""; + arrow::Status status = arrow::compute::Initialize(); + if (!status.ok()) { + std::cout << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; } - - std::string base_save_path = argv[1]; - int mode = std::atoi(argv[2]); - arrow::Status status; // ensure arrow::dataset node factories are in the registry arrow::dataset::internal::Initialize(); switch (mode) { diff --git a/cpp/examples/arrow/from_json_string_example.cc b/cpp/examples/arrow/from_json_string_example.cc new file mode 100644 index 00000000000..eb919303fee --- /dev/null +++ b/cpp/examples/arrow/from_json_string_example.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This example shows how to use some of the *FromJSONString helpers. 
+ +#include +#include + +#include +#include +#include +#include + +using arrow::json::ArrayFromJSONString; +using arrow::json::ChunkedArrayFromJSONString; +using arrow::json::DictArrayFromJSONString; + +/** + * \brief Run Example + * + * ./debug/from-json-string-example + */ +arrow::Status RunExample() { + // Simple types + ARROW_ASSIGN_OR_RAISE(auto int32_array, + ArrayFromJSONString(arrow::int32(), "[1, 2, 3]")); + ARROW_ASSIGN_OR_RAISE(auto float64_array, + ArrayFromJSONString(arrow::float64(), "[4.0, 5.0, 6.0]")); + ARROW_ASSIGN_OR_RAISE(auto bool_array, + ArrayFromJSONString(arrow::boolean(), "[true, false, true]")); + ARROW_ASSIGN_OR_RAISE( + auto string_array, + ArrayFromJSONString(arrow::utf8(), R"(["Hello", "World", null])")); + + // Timestamps can be created from string representations + ARROW_ASSIGN_OR_RAISE( + auto ts_array, + ArrayFromJSONString(timestamp(arrow::TimeUnit::SECOND), + R"(["1970-01-01", "2000-02-29","3989-07-14","1900-02-28"])")); + + // List, Map, Struct + ARROW_ASSIGN_OR_RAISE( + auto list_array, + ArrayFromJSONString(list(arrow::int64()), + "[[null], [], null, [4, 5, 6, 7, 8], [2, 3]]")); + ARROW_ASSIGN_OR_RAISE( + auto map_array, + ArrayFromJSONString(map(arrow::utf8(), arrow::int32()), + R"([[["joe", 0], ["mark", null]], null, [["cap", 8]], []])")); + ARROW_ASSIGN_OR_RAISE( + auto struct_array, + ArrayFromJSONString( + arrow::struct_({field("one", arrow::int32()), field("two", arrow::int32())}), + "[[11, 22], null, [null, 33]]")); + + // ChunkedArrayFromJSONString + ARROW_ASSIGN_OR_RAISE( + auto chunked_array, + ChunkedArrayFromJSONString(arrow::int32(), {"[5, 10]", "[null]", "[16]"})); + + // DictArrayFromJSONString + ARROW_ASSIGN_OR_RAISE( + auto dict_array, + DictArrayFromJSONString(dictionary(arrow::int32(), arrow::utf8()), + "[0, 1, 0, 2, 0, 3]", R"(["k1", "k2", "k3", "k4"])")); + + return arrow::Status::OK(); +} + +int main(int argc, char** argv) { + auto status = RunExample(); + if (!status.ok()) { + std::cerr << 
status.ToString() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/cpp/examples/arrow/join_example.cc b/cpp/examples/arrow/join_example.cc index c1c6e5e82ff..738420d48e1 100644 --- a/cpp/examples/arrow/join_example.cc +++ b/cpp/examples/arrow/join_example.cc @@ -82,6 +82,7 @@ arrow::Result> CreateDataSetFromCSVData } arrow::Status DoHashJoin() { + ARROW_RETURN_NOT_OK(arrow::compute::Initialize()); arrow::dataset::internal::Initialize(); ARROW_ASSIGN_OR_RAISE(auto l_dataset, CreateDataSetFromCSVData(true)); diff --git a/cpp/examples/arrow/parquet_column_encryption.cc b/cpp/examples/arrow/parquet_column_encryption.cc index 2ea4f44f172..7b2d5d80bf4 100644 --- a/cpp/examples/arrow/parquet_column_encryption.cc +++ b/cpp/examples/arrow/parquet_column_encryption.cc @@ -19,6 +19,7 @@ #include "arrow/dataset/file_parquet.h" #include "arrow/dataset/parquet_encryption_config.h" #include "arrow/filesystem/localfs.h" +#include "arrow/util/secure_string.h" #include "parquet/encryption/crypto_factory.h" #include "parquet/encryption/test_in_memory_kms.h" @@ -106,9 +107,9 @@ arrow::Result> GetTable() { std::shared_ptr GetCryptoFactory() { // Configure KMS. - std::unordered_map key_map; - key_map.emplace("footerKeyId", "0123456789012345"); - key_map.emplace("columnKeyId", "1234567890123456"); + std::unordered_map key_map; + key_map.emplace("footerKeyId", arrow::util::SecureString("0123456789012345")); + key_map.emplace("columnKeyId", arrow::util::SecureString("1234567890123456")); auto crypto_factory = std::make_shared(); auto kms_client_factory = diff --git a/cpp/examples/minimal_build/CMakeLists.txt b/cpp/examples/minimal_build/CMakeLists.txt index 95dad34221a..626b987b093 100644 --- a/cpp/examples/minimal_build/CMakeLists.txt +++ b/cpp/examples/minimal_build/CMakeLists.txt @@ -15,14 +15,19 @@ # specific language governing permissions and limitations # under the License. 
-cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.25) project(ArrowMinimalExample) -option(ARROW_LINK_SHARED "Link to the Arrow shared library" ON) - find_package(Arrow REQUIRED) +include(CMakeDependentOption) +cmake_dependent_option(ARROW_LINK_SHARED + "Link to the Arrow shared library if possible" + ON + ARROW_BUILD_SHARED + OFF) + if(NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() diff --git a/cpp/examples/parquet/low_level_api/encryption_reader_writer.cc b/cpp/examples/parquet/low_level_api/encryption_reader_writer.cc index aa0f07ff5c6..8e39ca100fc 100644 --- a/cpp/examples/parquet/low_level_api/encryption_reader_writer.cc +++ b/cpp/examples/parquet/low_level_api/encryption_reader_writer.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include @@ -39,9 +40,9 @@ constexpr int NUM_ROWS_PER_ROW_GROUP = 500; const char* PARQUET_FILENAME = "parquet_cpp_example.parquet.encrypted"; -const char* kFooterEncryptionKey = "0123456789012345"; // 128bit/16 -const char* kColumnEncryptionKey1 = "1234567890123450"; -const char* kColumnEncryptionKey2 = "1234567890123451"; +const arrow::util::SecureString kFooterEncryptionKey("0123456789012345"); +const arrow::util::SecureString kColumnEncryptionKey1("1234567890123450"); +const arrow::util::SecureString kColumnEncryptionKey2("1234567890123451"); int main(int argc, char** argv) { /********************************************************************************** @@ -81,7 +82,7 @@ int main(int argc, char** argv) { parquet::WriterProperties::Builder builder; // Add the current encryption configuration to WriterProperties. 
builder.encryption(file_encryption_builder.footer_key_metadata("kf") - ->encrypted_columns(encryption_cols) + ->encrypted_columns(std::move(encryption_cols)) ->build()); // Add other writer properties @@ -216,7 +217,7 @@ int main(int argc, char** argv) { // Add the current decryption configuration to ReaderProperties. reader_properties.file_decryption_properties( - file_decryption_builder.key_retriever(kr1)->build()); + file_decryption_builder.key_retriever(std::move(kr1))->build()); // Create a ParquetReader instance std::unique_ptr parquet_reader = diff --git a/cpp/examples/parquet/low_level_api/encryption_reader_writer_all_crypto_options.cc b/cpp/examples/parquet/low_level_api/encryption_reader_writer_all_crypto_options.cc index b564cddcb50..435443098fc 100644 --- a/cpp/examples/parquet/low_level_api/encryption_reader_writer_all_crypto_options.cc +++ b/cpp/examples/parquet/low_level_api/encryption_reader_writer_all_crypto_options.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -92,9 +93,9 @@ constexpr int NUM_ROWS_PER_ROW_GROUP = 500; -const char* kFooterEncryptionKey = "0123456789012345"; // 128bit/16 -const char* kColumnEncryptionKey1 = "1234567890123450"; -const char* kColumnEncryptionKey2 = "1234567890123451"; +const arrow::util::SecureString kFooterEncryptionKey("0123456789012345"); +const arrow::util::SecureString kColumnEncryptionKey1("1234567890123450"); +const arrow::util::SecureString kColumnEncryptionKey2("1234567890123451"); const char* fileName = "tester"; using FileClass = ::arrow::io::FileOutputStream; @@ -185,7 +186,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { vector_of_encryption_configurations.push_back( file_encryption_builder_2.footer_key_metadata("kf") - ->encrypted_columns(encryption_cols2) + ->encrypted_columns(std::move(encryption_cols2)) ->build()); // Encryption configuration 3: Encrypt two columns, with different keys. 
@@ -205,7 +206,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { vector_of_encryption_configurations.push_back( file_encryption_builder_3.footer_key_metadata("kf") - ->encrypted_columns(encryption_cols3) + ->encrypted_columns(std::move(encryption_cols3)) ->set_plaintext_footer() ->build()); @@ -225,7 +226,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { vector_of_encryption_configurations.push_back( file_encryption_builder_4.footer_key_metadata("kf") - ->encrypted_columns(encryption_cols4) + ->encrypted_columns(std::move(encryption_cols4)) ->aad_prefix(fileName) ->build()); @@ -244,7 +245,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { kFooterEncryptionKey); vector_of_encryption_configurations.push_back( - file_encryption_builder_5.encrypted_columns(encryption_cols5) + file_encryption_builder_5.encrypted_columns(std::move(encryption_cols5)) ->footer_key_metadata("kf") ->aad_prefix(fileName) ->disable_aad_prefix_storage() @@ -266,7 +267,7 @@ void InteropTestWriteEncryptedParquetFiles(std::string root_path) { vector_of_encryption_configurations.push_back( file_encryption_builder_6.footer_key_metadata("kf") - ->encrypted_columns(encryption_cols6) + ->encrypted_columns(std::move(encryption_cols6)) ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1) ->build()); @@ -373,7 +374,7 @@ void InteropTestReadEncryptedParquetFiles(std::string root_path) { parquet::FileDecryptionProperties::Builder file_decryption_builder_1; vector_of_decryption_configurations.push_back( - file_decryption_builder_1.key_retriever(kr1)->build()); + file_decryption_builder_1.key_retriever(std::move(kr1))->build()); // Decryption configuration 2: Decrypt using key retriever callback that holds the keys // of two encrypted columns and the footer key. Supply aad_prefix. 
@@ -387,7 +388,9 @@ void InteropTestReadEncryptedParquetFiles(std::string root_path) { parquet::FileDecryptionProperties::Builder file_decryption_builder_2; vector_of_decryption_configurations.push_back( - file_decryption_builder_2.key_retriever(kr2)->aad_prefix(fileName)->build()); + file_decryption_builder_2.key_retriever(std::move(kr2)) + ->aad_prefix(fileName) + ->build()); // Decryption configuration 3: Decrypt using explicit column and footer keys. std::string path_double = "double_field"; @@ -405,7 +408,7 @@ void InteropTestReadEncryptedParquetFiles(std::string root_path) { parquet::FileDecryptionProperties::Builder file_decryption_builder_3; vector_of_decryption_configurations.push_back( file_decryption_builder_3.footer_key(kFooterEncryptionKey) - ->column_keys(decryption_cols) + ->column_keys(std::move(decryption_cols)) ->build()); /********************************************************************************** @@ -417,7 +420,7 @@ void InteropTestReadEncryptedParquetFiles(std::string root_path) { for (unsigned example_id = 0; example_id < vector_of_decryption_configurations.size(); ++example_id) { PrintDecryptionConfiguration(example_id + 1); - for (auto const& file : files_in_directory) { + for (const auto& file : files_in_directory) { std::string exception_msg = ""; if (!FileNameEndsWith(file, "parquet.encrypted")) // Skip non encrypted files continue; diff --git a/cpp/examples/parquet/meson.build b/cpp/examples/parquet/meson.build new file mode 100644 index 00000000000..96e2711f323 --- /dev/null +++ b/cpp/examples/parquet/meson.build @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +example_execs = { + 'parquet-low-level-example': { + 'sources': files('low_level_api/reader_writer.cc'), + 'include_dir': include_directories('low_level_api'), + }, + 'parquet-low-level-example2': { + 'sources': files('low_level_api/reader_writer2.cc'), + 'include_dir': include_directories('low_level_api'), + }, + 'parquet-arrow-example': { + 'sources': files('parquet_arrow/reader_writer.cc'), + }, + 'parquet-stream-api-example': { + 'sources': files('parquet_stream_api/stream_reader_writer.cc'), + }, +} + +if needs_parquet_encryption + example_execs += { + 'parquet-encryption-example': { + 'sources': files('low_level_api/encryption_reader_writer.cc'), + 'include_dir': include_directories('low_level_api'), + }, + 'parquet-encryption-example-all-crypto-options': { + 'sources': files( + 'low_level_api/encryption_reader_writer_all_crypto_options.cc', + ), + 'include_dir': include_directories('low_level_api'), + }, + } +endif + +foreach key, val : example_execs + executable( + key, + sources: val['sources'], + include_directories: val.get('include_dir', []), + dependencies: [arrow_dep, parquet_dep], + ) +endforeach diff --git a/cpp/examples/tutorial_examples/CMakeLists.txt b/cpp/examples/tutorial_examples/CMakeLists.txt index a6f8350c41d..1466bce48af 100644 --- a/cpp/examples/tutorial_examples/CMakeLists.txt +++ b/cpp/examples/tutorial_examples/CMakeLists.txt @@ -37,7 +37,7 @@ target_link_libraries(file_access_example PRIVATE Arrow::arrow_shared Parquet::parquet_shared) add_executable(compute_example compute_example.cc) -target_link_libraries(compute_example 
PRIVATE Arrow::arrow_shared) +target_link_libraries(compute_example PRIVATE ArrowCompute::arrow_compute_shared) add_executable(dataset_example dataset_example.cc) target_link_libraries(dataset_example PRIVATE ArrowDataset::arrow_dataset_shared) diff --git a/cpp/examples/tutorial_examples/compute_example.cc b/cpp/examples/tutorial_examples/compute_example.cc index 3a65214c0ef..767719c52b0 100644 --- a/cpp/examples/tutorial_examples/compute_example.cc +++ b/cpp/examples/tutorial_examples/compute_example.cc @@ -49,6 +49,9 @@ arrow::Status RunMain() { schema = arrow::schema({field_a, field_b}); + // Initialize the compute module to register the required compute kernels. + ARROW_RETURN_NOT_OK(arrow::compute::Initialize()); + std::shared_ptr table; table = arrow::Table::Make(schema, {some_nums, more_nums}, 5); // (Doc section: Create Tables) diff --git a/cpp/examples/tutorial_examples/dataset_example.cc b/cpp/examples/tutorial_examples/dataset_example.cc index a980fa54939..c32cf6ec4c6 100644 --- a/cpp/examples/tutorial_examples/dataset_example.cc +++ b/cpp/examples/tutorial_examples/dataset_example.cc @@ -19,6 +19,7 @@ // (Doc section: Includes) #include +#include #include // We use Parquet headers for setting up examples; they are not required for using // datasets. @@ -75,6 +76,8 @@ arrow::Result CreateExampleParquetDataset( } arrow::Status PrepareEnv() { + // Initialize the compute module to register the required kernels for Dataset + ARROW_RETURN_NOT_OK(arrow::compute::Initialize()); // Get our environment prepared for reading, by setting up some quick writing.
ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable()) std::shared_ptr setup_fs; diff --git a/cpp/meson.build b/cpp/meson.build index 2b633cf7ad2..194da0ccef2 100644 --- a/cpp/meson.build +++ b/cpp/meson.build @@ -19,15 +19,10 @@ project( 'arrow', 'cpp', 'c', - version: '20.0.0-SNAPSHOT', + version: '22.0.0', license: 'Apache-2.0', meson_version: '>=1.3.0', - default_options: [ - 'buildtype=release', - 'c_std=c99', - 'warning_level=2', - 'cpp_std=c++17', - ], + default_options: ['c_std=gnu11,c11', 'warning_level=2', 'cpp_std=c++17'], ) project_args = [ @@ -47,26 +42,73 @@ cpp_args = cpp_compiler.get_supported_arguments(project_args) add_project_arguments(cpp_args, language: 'cpp') git_id = get_option('git_id') -if git_id == '' +if git_id == '' and not meson.is_subproject() git_id = run_command('git', 'log', '-n1', '--format=%H', check: false).stdout().strip() endif git_description = get_option('git_description') -if git_description == '' +if git_description == '' and not meson.is_subproject() git_description = run_command('git', 'describe', '--tags', check: false).stdout().strip() endif -needs_benchmarks = get_option('benchmarks') -needs_csv = get_option('csv') -needs_azure = get_option('azure') -needs_gcs = get_option('gcs') -needs_hdfs = get_option('hdfs') -needs_s3 = get_option('s3') -needs_filesystem = get_option('filesystem') or needs_azure or needs_gcs or needs_hdfs or needs_s3 -needs_integration = get_option('integration') -needs_tests = get_option('tests') -needs_ipc = get_option('ipc') or needs_tests or needs_benchmarks -needs_testing = get_option('testing') or needs_tests or needs_benchmarks or needs_integration -needs_json = get_option('json') or needs_testing +needs_benchmarks = get_option('benchmarks').enabled() +needs_compute = get_option('compute').enabled() +needs_csv = get_option('csv').enabled() +needs_azure = get_option('azure').enabled() +needs_gcs = get_option('gcs').enabled() +needs_hdfs = get_option('hdfs').enabled() +needs_parquet = 
get_option('parquet').enabled() +needs_parquet_encryption = get_option('parquet_require_encryption').enabled() +needs_s3 = get_option('s3').enabled() +needs_filesystem = (get_option('filesystem').enabled() + or needs_azure + or needs_gcs + or needs_hdfs + or needs_parquet_encryption + or needs_s3 +) +needs_integration = get_option('integration').enabled() +needs_tests = get_option('tests').enabled() +needs_acero = get_option('acero').enabled() +needs_flight = get_option('flight').enabled() +needs_ipc = (get_option('ipc').enabled() + or needs_tests + or needs_acero + or needs_benchmarks + or needs_flight + or needs_parquet +) + +needs_fuzzing = get_option('fuzzing').enabled() +if needs_fuzzing + if meson.version() < '1.8.0' + error( + f'Meson >= 1.8.0 is required for fuzzing support, found @meson.version()@', + ) + endif +endif + +needs_testing = (get_option('testing').enabled() + or needs_tests + or needs_benchmarks + or needs_fuzzing + or needs_integration +) +needs_json = get_option('json').enabled() or needs_testing +needs_brotli = get_option('brotli').enabled() or needs_fuzzing +needs_bz2 = get_option('bz2').enabled() +needs_lz4 = get_option('lz4').enabled() +needs_snappy = get_option('snappy').enabled() +needs_zlib = get_option('zlib').enabled() +needs_zstd = get_option('zstd').enabled() +needs_utilities = get_option('utilities').enabled() subdir('src/arrow') + +if needs_parquet + subdir('src/parquet') + subdir('tools/parquet') + if get_option('parquet_build_examples').enabled() + subdir('examples/parquet') + endif +endif diff --git a/cpp/meson.options b/cpp/meson.options index 3641a7452d1..668f440ee72 100644 --- a/cpp/meson.options +++ b/cpp/meson.options @@ -15,102 +15,125 @@ # specific language governing permissions and limitations # under the License. 
+option( + 'acero', + type: 'feature', + description: 'Build the Arrow Acero Engine Module', +) option( 'azure', - type: 'boolean', + type: 'feature', description: 'Build Arrow with Azure support (requires the Azure SDK for C++)', - value: false, ) option( 'benchmarks', - type: 'boolean', + type: 'feature', description: 'Build the Arrow micro benchmarks', - value: false, ) - +option('brotli', type: 'feature', description: 'Build with Brotli compression') +option('bz2', type: 'feature', description: 'Build with BZ2 compression') option( - 'csv', - type: 'boolean', - description: 'Build the Arrow CSV Parser Module', - value: false, + 'compute', + type: 'feature', + description: 'Build all Arrow Compute kernels', ) - +option('csv', type: 'feature', description: 'Build the Arrow CSV Parser Module') option( 'filesystem', - type: 'boolean', + type: 'feature', description: 'Build the Arrow Filesystem Layer', - value: false, +) +option( + 'fuzzing', + type: 'feature', + description: 'Build Arrow Fuzzing executables', ) option( 'gcs', - type: 'boolean', + type: 'feature', description: 'Build Arrow with GCS support (requires the Google Cloud Platform C++ Client Libraries)', - value: false, ) -option( - 'hdfs', - type: 'boolean', - description: 'Build the Arrow HDFS bridge', - value: false, -) +option('hdfs', type: 'feature', description: 'Build the Arrow HDFS bridge') option( 'integration', - type: 'boolean', + type: 'feature', description: 'Build the Arrow integration test executables', - value: false, ) option( 'ipc', - type: 'boolean', + type: 'feature', description: 'Build the Arrow IPC extensions', - value: true, + value: 'enabled', ) -option( - 'json', - type: 'boolean', - description: 'Build Arrow with JSON support', - value: false, -) +option('json', type: 'feature', description: 'Build Arrow with JSON support') option( - 'git_id', - type: 'string', + 'flight', + type: 'feature', + description: 'Build the Arrow Flight RPC System (requires GRPC, Protocol Buffers)', ) 
-option( - 'git_description', - type: 'string', -) +option('git_id', type: 'string') +option('git_description', type: 'string') + +option('lz4', type: 'feature', description: 'Build with lz4 compression') option( 'package_kind', type: 'string', description: 'Arbitrary string that identifies the kind of package (for informational purposes)', ) +option('parquet', type: 'feature', description: 'Build the Parquet libraries') +option( + 'parquet_build_executables', + type: 'feature', + description: 'Build the Parquet executable CLI tools.', +) +option( + 'parquet_build_examples', + type: 'feature', + description: 'Build the Parquet examples.', +) +option( + 'parquet_require_encryption', + type: 'feature', + description: 'Build support for encryption. Fail if OpenSSL is not found', +) +option('snappy', type: 'feature', description: 'Build with snappy compression') option( 's3', - type: 'boolean', + type: 'feature', description: 'Build Arrow with S3 support (requires the AWS SDK for C++)', - value: false, +) + +option( + 'tensorflow', + type: 'feature', + description: 'Build Arrow with TensorFlow support enabled', ) option( 'testing', - type: 'boolean', + type: 'feature', description: 'Build the Arrow testing libraries', - value: false, ) - option( 'tests', - type: 'boolean', + type: 'feature', description: 'Build the Arrow googletest unit tests', - value: false, ) + +option( + 'utilities', + type: 'feature', + description: 'Build Arrow commandline utilities', +) +option('zlib', type: 'feature', description: 'Build with zlib compression') +option('zstd', type: 'feature', description: 'Build with zstd compression') diff --git a/cpp/src/arrow/ArrowComputeConfig.cmake.in b/cpp/src/arrow/ArrowComputeConfig.cmake.in new file mode 100644 index 00000000000..f38c776c8c8 --- /dev/null +++ b/cpp/src/arrow/ArrowComputeConfig.cmake.in @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This config sets the following variables in your project:: +# +# ArrowCompute_FOUND - true if Arrow Compute found on the system +# +# This config sets the following targets in your project:: +# +# ArrowCompute::arrow_compute_shared - for linked as shared library if shared library is built +# ArrowCompute::arrow_compute_static - for linked as static library if static library is built + +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +find_dependency(Arrow CONFIG) + +include("${CMAKE_CURRENT_LIST_DIR}/ArrowComputeTargets.cmake") + +arrow_keep_backward_compatibility(ArrowCompute arrow_compute) + +check_required_components(ArrowCompute) + +arrow_show_details(ArrowCompute ARROW_COMPUTE) diff --git a/cpp/src/arrow/ArrowConfig.cmake.in b/cpp/src/arrow/ArrowConfig.cmake.in index 27910b3f3c3..cbadad4d742 100644 --- a/cpp/src/arrow/ArrowConfig.cmake.in +++ b/cpp/src/arrow/ArrowConfig.cmake.in @@ -39,6 +39,8 @@ set(ARROW_INCLUDE_PATH_SUFFIXES "@ARROW_INCLUDE_PATH_SUFFIXES@") set(ARROW_LIBRARY_PATH_SUFFIXES "@ARROW_LIBRARY_PATH_SUFFIXES@") set(ARROW_SYSTEM_DEPENDENCIES "@ARROW_SYSTEM_DEPENDENCIES@") +set(ARROW_VCPKG "@ARROW_VCPKG@") + include("${CMAKE_CURRENT_LIST_DIR}/ArrowOptions.cmake") macro(arrow_find_dependencies dependencies) @@ -122,11 +124,13 @@ if(TARGET Arrow::arrow_static AND 
NOT TARGET Arrow::arrow_bundled_dependencies) # https://cmake.org/cmake/help/latest/policy/CMP0057.html cmake_policy(PUSH) cmake_policy(SET CMP0057 NEW) - if("AWS::aws-c-common" IN_LIST ARROW_BUNDLED_STATIC_LIBS) + if("aws-c-common" IN_LIST ARROW_BUNDLED_STATIC_LIBS) if(APPLE) find_library(CORE_FOUNDATION CoreFoundation) target_link_libraries(Arrow::arrow_bundled_dependencies INTERFACE ${CORE_FOUNDATION}) + find_library(NETWORK Network) + target_link_libraries(Arrow::arrow_bundled_dependencies INTERFACE ${NETWORK}) find_library(SECURITY Security) target_link_libraries(Arrow::arrow_bundled_dependencies INTERFACE ${SECURITY}) elseif(WIN32) diff --git a/cpp/src/arrow/ArrowTestingConfig.cmake.in b/cpp/src/arrow/ArrowTestingConfig.cmake.in index 148d6516a09..9e631e4fc98 100644 --- a/cpp/src/arrow/ArrowTestingConfig.cmake.in +++ b/cpp/src/arrow/ArrowTestingConfig.cmake.in @@ -29,7 +29,7 @@ set(ARROW_TESTING_SYSTEM_DEPENDENCIES "@ARROW_TESTING_SYSTEM_DEPENDENCIES@") include(CMakeFindDependencyMacro) -find_dependency(Arrow) +find_dependency(Arrow CONFIG) arrow_find_dependencies("${ARROW_TESTING_SYSTEM_DEPENDENCIES}") diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 7bfdc332f14..e299e8f6167 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -137,7 +137,7 @@ if(ARROW_ENABLE_THREADING) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS Threads::Threads) endif() -set(ARROW_TEST_LINK_TOOLCHAIN ${ARROW_GTEST_GMOCK} ${ARROW_GTEST_GTEST_MAIN}) +set(ARROW_TEST_LINK_TOOLCHAIN ${ARROW_GTEST_GMOCK_MAIN}) set(ARROW_TEST_STATIC_LINK_LIBS arrow::flatbuffers arrow_testing_static arrow_static ${ARROW_TEST_LINK_TOOLCHAIN}) set(ARROW_TEST_SHARED_LINK_LIBS arrow::flatbuffers arrow_testing_shared arrow_shared @@ -241,7 +241,7 @@ endfunction() function(ADD_ARROW_TEST REL_TEST_NAME) set(options) set(one_value_args PREFIX) - set(multi_value_args LABELS PRECOMPILED_HEADERS) + set(multi_value_args LABELS) cmake_parse_arguments(ARG "${options}" 
"${one_value_args}" @@ -260,22 +260,11 @@ function(ADD_ARROW_TEST REL_TEST_NAME) set(LABELS "arrow-tests") endif() - # Because of https://gitlab.kitware.com/cmake/cmake/issues/20289, - # we must generate the precompiled header on an executable target. - # Do that on the first unit test target (here "arrow-array-test") - # and reuse the PCH for the other tests. - if(ARG_PRECOMPILED_HEADERS) - set(PCH_ARGS PRECOMPILED_HEADERS ${ARG_PRECOMPILED_HEADERS}) - else() - set(PCH_ARGS PRECOMPILED_HEADER_LIB "arrow-array-test") - endif() - add_test_case(${REL_TEST_NAME} PREFIX ${PREFIX} LABELS ${LABELS} - ${PCH_ARGS} ${ARG_UNPARSED_ARGUMENTS}) endfunction() @@ -333,7 +322,6 @@ endfunction() macro(append_runtime_avx2_src SRCS SRC) if(ARROW_HAVE_RUNTIME_AVX2) list(APPEND ${SRCS} ${SRC}) - set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS ${ARROW_AVX2_FLAG}) endif() endmacro() @@ -341,7 +329,6 @@ endmacro() macro(append_runtime_avx2_bmi2_src SRCS SRC) if(ARROW_HAVE_RUNTIME_AVX2 AND ARROW_HAVE_RUNTIME_BMI2) list(APPEND ${SRCS} ${SRC}) - set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS "${ARROW_AVX2_FLAG} ${ARROW_BMI2_FLAG}") endif() @@ -350,7 +337,6 @@ endmacro() macro(append_runtime_avx512_src SRCS SRC) if(ARROW_HAVE_RUNTIME_AVX512) list(APPEND ${SRCS} ${SRC}) - set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS ${ARROW_AVX512_FLAG}) endif() endmacro() @@ -440,8 +426,7 @@ set(ARROW_MEMORY_POOL_SRCS memory_pool.cc) if(ARROW_JEMALLOC) list(APPEND ARROW_MEMORY_POOL_SRCS memory_pool_jemalloc.cc) set_source_files_properties(memory_pool_jemalloc.cc - PROPERTIES SKIP_PRECOMPILE_HEADERS ON - SKIP_UNITY_BUILD_INCLUSION ON) + PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() arrow_add_object_library(ARROW_MEMORY_POOL 
${ARROW_MEMORY_POOL_SRCS}) if(ARROW_JEMALLOC) @@ -485,9 +470,8 @@ set(ARROW_VENDORED_SRCS if(APPLE) list(APPEND ARROW_VENDORED_SRCS vendored/datetime/ios.mm) endif() -set_source_files_properties(vendored/datetime.cpp - PROPERTIES SKIP_PRECOMPILE_HEADERS ON - SKIP_UNITY_BUILD_INCLUSION ON) +set_source_files_properties(vendored/datetime.cpp PROPERTIES SKIP_UNITY_BUILD_INCLUSION + ON) arrow_add_object_library(ARROW_VENDORED ${ARROW_VENDORED_SRCS}) # Disable DLL exports in vendored uriparser library foreach(ARROW_VENDORED_TARGET ${ARROW_VENDORED_TARGETS}) @@ -507,6 +491,7 @@ set(ARROW_UTIL_SRCS util/bitmap_ops.cc util/bpacking.cc util/byte_size.cc + util/byte_stream_split_internal.cc util/cancel.cc util/compression.cc util/counting_semaphore.cc @@ -531,8 +516,9 @@ set(ARROW_UTIL_SRCS util/memory.cc util/mutex.cc util/ree_util.cc + util/secure_string.cc util/string.cc - util/string_builder.cc + util/string_util.cc util/task_group.cc util/tdigest.cc util/thread_pool.cc @@ -545,6 +531,8 @@ set(ARROW_UTIL_SRCS util/utf8.cc util/value_parsing.cc) +append_runtime_avx2_src(ARROW_UTIL_SRCS util/byte_stream_split_internal_avx2.cc) + append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_avx2.cc) append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_avx512.cc) if(ARROW_HAVE_NEON) @@ -590,6 +578,11 @@ if(ARROW_USE_GLOG) target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE glog::glog) endforeach() endif() +if(ARROW_USE_OPENSSL) + foreach(ARROW_UTIL_TARGET ${ARROW_UTIL_TARGETS}) + target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE ${ARROW_OPENSSL_LIBS}) + endforeach() +endif() if(ARROW_USE_XSIMD) foreach(ARROW_UTIL_TARGET ${ARROW_UTIL_TARGETS}) target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE ${ARROW_XSIMD}) @@ -620,6 +613,11 @@ if(ARROW_WITH_OPENTELEMETRY) target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE ${ARROW_OPENTELEMETRY_LIBS}) endforeach() endif() +if(ARROW_WITH_RAPIDJSON) + foreach(ARROW_UTIL_TARGET ${ARROW_UTIL_TARGETS}) + target_link_libraries(${ARROW_UTIL_TARGET} 
PRIVATE RapidJSON) + endforeach() +endif() if(ARROW_WITH_ZLIB) foreach(ARROW_UTIL_TARGET ${ARROW_UTIL_TARGETS}) target_link_libraries(${ARROW_UTIL_TARGET} PRIVATE ZLIB::ZLIB) @@ -643,13 +641,13 @@ else() endif() set(ARROW_TESTING_SHARED_LINK_LIBS arrow_shared ${ARROW_GTEST_GTEST}) -set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON Boost::process) -set(ARROW_TESTING_STATIC_LINK_LIBS - arrow::flatbuffers - RapidJSON - Boost::process - arrow_static - ${ARROW_GTEST_GTEST}) +set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON) +set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers RapidJSON arrow_static + ${ARROW_GTEST_GTEST}) +if(ARROW_ENABLE_THREADING) + list(APPEND ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::Boost::process) + list(APPEND ARROW_TESTING_STATIC_LINK_LIBS arrow::Boost::process) +endif() set(ARROW_TESTING_SHARED_INSTALL_INTERFACE_LIBS Arrow::arrow_shared) set(ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS Arrow::arrow_static) # that depend on gtest @@ -729,7 +727,6 @@ set(ARROW_COMPUTE_SRCS compute/registry.cc compute/kernels/chunked_internal.cc compute/kernels/codegen_internal.cc - compute/kernels/ree_util_internal.cc compute/kernels/scalar_cast_boolean.cc compute/kernels/scalar_cast_dictionary.cc compute/kernels/scalar_cast_extension.cc @@ -738,17 +735,27 @@ set(ARROW_COMPUTE_SRCS compute/kernels/scalar_cast_numeric.cc compute/kernels/scalar_cast_string.cc compute/kernels/scalar_cast_temporal.cc - compute/kernels/util_internal.cc + compute/kernels/temporal_internal.cc compute/kernels/vector_hash.cc compute/kernels/vector_selection.cc compute/kernels/vector_selection_filter_internal.cc compute/kernels/vector_selection_internal.cc - compute/kernels/vector_selection_take_internal.cc) + compute/kernels/vector_selection_take_internal.cc + compute/kernels/vector_swizzle.cc) if(ARROW_COMPUTE) + # If libarrow_compute.a is only built, "pkg-config --cflags --libs + # arrow-compute" outputs build flags for static linking 
not shared + # linking. ARROW_COMPUTE_PC_* except ARROW_COMPUTE_PC_*_PRIVATE are for + # the static linking case. + if(NOT ARROW_BUILD_SHARED AND ARROW_BUILD_STATIC) + string(APPEND ARROW_COMPUTE_PC_CFLAGS "${ARROW_COMPUTE_PC_CFLAGS_PRIVATE}") + set(ARROW_COMPUTE_PC_CFLAGS_PRIVATE "") + endif() # Include the remaining kernels list(APPEND - ARROW_COMPUTE_SRCS + ARROW_COMPUTE_LIB_SRCS + compute/initialize.cc compute/kernels/aggregate_basic.cc compute/kernels/aggregate_mode.cc compute/kernels/aggregate_pivot.cc @@ -759,6 +766,7 @@ if(ARROW_COMPUTE) compute/kernels/hash_aggregate_numeric.cc compute/kernels/hash_aggregate_pivot.cc compute/kernels/pivot_internal.cc + compute/kernels/ree_util_internal.cc compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc compute/kernels/scalar_compare.cc @@ -772,6 +780,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_temporal_binary.cc compute/kernels/scalar_temporal_unary.cc compute/kernels/scalar_validity.cc + compute/kernels/util_internal.cc compute/kernels/vector_array_sort.cc compute/kernels/vector_cumulative_ops.cc compute/kernels/vector_nested.cc @@ -782,7 +791,6 @@ if(ARROW_COMPUTE) compute/kernels/vector_select_k.cc compute/kernels/vector_sort.cc compute/kernels/vector_statistics.cc - compute/kernels/vector_swizzle.cc compute/key_hash_internal.cc compute/key_map_internal.cc compute/light_array_internal.cc @@ -794,39 +802,89 @@ if(ARROW_COMPUTE) compute/util.cc compute/util_internal.cc) - append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/kernels/aggregate_basic_avx2.cc) - append_runtime_avx512_src(ARROW_COMPUTE_SRCS compute/kernels/aggregate_basic_avx512.cc) - append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_internal_avx2.cc) - append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_internal_avx2.cc) - append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/row/compare_internal_avx2.cc) - append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/row/encode_internal_avx2.cc) - 
append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/util_avx2.cc) -endif() + append_runtime_avx2_src(ARROW_COMPUTE_LIB_SRCS compute/kernels/aggregate_basic_avx2.cc) + append_runtime_avx512_src(ARROW_COMPUTE_LIB_SRCS + compute/kernels/aggregate_basic_avx512.cc) + append_runtime_avx2_src(ARROW_COMPUTE_LIB_SRCS compute/key_hash_internal_avx2.cc) + append_runtime_avx2_bmi2_src(ARROW_COMPUTE_LIB_SRCS compute/key_map_internal_avx2.cc) + append_runtime_avx2_src(ARROW_COMPUTE_LIB_SRCS compute/row/compare_internal_avx2.cc) + append_runtime_avx2_src(ARROW_COMPUTE_LIB_SRCS compute/row/encode_internal_avx2.cc) + append_runtime_avx2_bmi2_src(ARROW_COMPUTE_LIB_SRCS compute/util_avx2.cc) + + set(ARROW_COMPUTE_SHARED_PRIVATE_LINK_LIBS) + set(ARROW_COMPUTE_SHARED_LINK_LIBS) + set(ARROW_COMPUTE_STATIC_LINK_LIBS) + set(ARROW_COMPUTE_STATIC_INSTALL_INTERFACE_LIBS) + set(ARROW_COMPUTE_SHARED_INSTALL_INTERFACE_LIBS) + + list(APPEND ARROW_COMPUTE_STATIC_INSTALL_INTERFACE_LIBS Arrow::arrow_static) + list(APPEND ARROW_COMPUTE_SHARED_INSTALL_INTERFACE_LIBS Arrow::arrow_shared) + list(APPEND ARROW_COMPUTE_STATIC_LINK_LIBS arrow_static) + list(APPEND ARROW_COMPUTE_SHARED_LINK_LIBS arrow_shared) + + if(ARROW_USE_BOOST) + list(APPEND ARROW_COMPUTE_STATIC_LINK_LIBS Boost::headers) + list(APPEND ARROW_COMPUTE_SHARED_PRIVATE_LINK_LIBS Boost::headers) + endif() + if(ARROW_USE_XSIMD) + list(APPEND ARROW_COMPUTE_STATIC_LINK_LIBS ${ARROW_XSIMD}) + list(APPEND ARROW_COMPUTE_SHARED_PRIVATE_LINK_LIBS ${ARROW_XSIMD}) + endif() + if(ARROW_WITH_OPENTELEMETRY) + list(APPEND ARROW_COMPUTE_STATIC_LINK_LIBS ${ARROW_OPENTELEMETRY_LIBS}) + list(APPEND ARROW_COMPUTE_SHARED_PRIVATE_LINK_LIBS ${ARROW_OPENTELEMETRY_LIBS}) + endif() + if(ARROW_WITH_RE2) + list(APPEND ARROW_COMPUTE_STATIC_LINK_LIBS re2::re2) + list(APPEND ARROW_COMPUTE_SHARED_PRIVATE_LINK_LIBS re2::re2) + endif() + if(ARROW_WITH_UTF8PROC) + list(APPEND ARROW_COMPUTE_STATIC_LINK_LIBS utf8proc::utf8proc) + list(APPEND 
ARROW_COMPUTE_SHARED_PRIVATE_LINK_LIBS utf8proc::utf8proc) + endif() -arrow_add_object_library(ARROW_COMPUTE ${ARROW_COMPUTE_SRCS}) -if(ARROW_USE_BOOST) - foreach(ARROW_COMPUTE_TARGET ${ARROW_COMPUTE_TARGETS}) - target_link_libraries(${ARROW_COMPUTE_TARGET} PRIVATE Boost::headers) + add_arrow_lib(arrow_compute + CMAKE_PACKAGE_NAME + ArrowCompute + PKG_CONFIG_NAME + arrow-compute + SHARED_LINK_LIBS + ${ARROW_COMPUTE_SHARED_LINK_LIBS} + SHARED_PRIVATE_LINK_LIBS + ${ARROW_COMPUTE_SHARED_PRIVATE_LINK_LIBS} + SHARED_INSTALL_INTERFACE_LIBS + ${ARROW_COMPUTE_SHARED_INSTALL_INTERFACE_LIBS} + STATIC_LINK_LIBS + ${ARROW_COMPUTE_STATIC_LINK_LIBS} + STATIC_INSTALL_INTERFACE_LIBS + ${ARROW_COMPUTE_STATIC_INSTALL_INTERFACE_LIBS} + OUTPUTS + ARROW_COMPUTE_LIBRARIES + SOURCES + ${ARROW_COMPUTE_LIB_SRCS} + SHARED_LINK_FLAGS + ${ARROW_VERSION_SCRIPT_FLAGS} # Defined in cpp/arrow/CMakeLists.txt + ) + foreach(LIB_TARGET ${ARROW_COMPUTE_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_COMPUTE_EXPORTING) endforeach() + + if(ARROW_BUILD_STATIC AND WIN32) + target_compile_definitions(arrow_compute_static PUBLIC ARROW_COMPUTE_STATIC) + endif() endif() + +arrow_add_object_library(ARROW_COMPUTE_CORE ${ARROW_COMPUTE_SRCS}) + if(ARROW_USE_XSIMD) - foreach(ARROW_COMPUTE_TARGET ${ARROW_COMPUTE_TARGETS}) - target_link_libraries(${ARROW_COMPUTE_TARGET} PRIVATE ${ARROW_XSIMD}) + foreach(ARROW_COMPUTE_CORE_TARGET ${ARROW_COMPUTE_CORE_TARGETS}) + target_link_libraries(${ARROW_COMPUTE_CORE_TARGET} PRIVATE ${ARROW_XSIMD}) endforeach() endif() if(ARROW_WITH_OPENTELEMETRY) - foreach(ARROW_COMPUTE_TARGET ${ARROW_COMPUTE_TARGETS}) - target_link_libraries(${ARROW_COMPUTE_TARGET} PRIVATE ${ARROW_OPENTELEMETRY_LIBS}) - endforeach() -endif() -if(ARROW_WITH_RE2) - foreach(ARROW_COMPUTE_TARGET ${ARROW_COMPUTE_TARGETS}) - target_link_libraries(${ARROW_COMPUTE_TARGET} PRIVATE re2::re2) - endforeach() -endif() -if(ARROW_WITH_UTF8PROC) - foreach(ARROW_COMPUTE_TARGET ${ARROW_COMPUTE_TARGETS}) - 
target_link_libraries(${ARROW_COMPUTE_TARGET} PRIVATE utf8proc::utf8proc) + foreach(ARROW_COMPUTE_CORE_TARGET ${ARROW_COMPUTE_CORE_TARGETS}) + target_link_libraries(${ARROW_COMPUTE_CORE_TARGET} + PRIVATE ${ARROW_OPENTELEMETRY_LIBS}) endforeach() endif() @@ -841,23 +899,20 @@ if(ARROW_FILESYSTEM) if(ARROW_AZURE) list(APPEND ARROW_FILESYSTEM_SRCS filesystem/azurefs.cc) set_source_files_properties(filesystem/azurefs.cc - PROPERTIES SKIP_PRECOMPILE_HEADERS ON - SKIP_UNITY_BUILD_INCLUSION ON) + PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() if(ARROW_GCS) list(APPEND ARROW_FILESYSTEM_SRCS filesystem/gcsfs.cc filesystem/gcsfs_internal.cc) set_source_files_properties(filesystem/gcsfs.cc filesystem/gcsfs_internal.cc - PROPERTIES SKIP_PRECOMPILE_HEADERS ON - SKIP_UNITY_BUILD_INCLUSION ON) + PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() if(ARROW_HDFS) list(APPEND ARROW_FILESYSTEM_SRCS filesystem/hdfs.cc) endif() if(ARROW_S3) list(APPEND ARROW_FILESYSTEM_SRCS filesystem/s3fs.cc) - set_source_files_properties(filesystem/s3fs.cc - PROPERTIES SKIP_PRECOMPILE_HEADERS ON - SKIP_UNITY_BUILD_INCLUSION ON) + set_source_files_properties(filesystem/s3fs.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION + ON) endif() arrow_add_object_library(ARROW_FILESYSTEM ${ARROW_FILESYSTEM_SRCS}) @@ -891,8 +946,7 @@ if(ARROW_FILESYSTEM) add_library(arrow_s3fs MODULE filesystem/s3fs_module.cc filesystem/s3fs.cc) target_link_libraries(arrow_s3fs PRIVATE ${AWSSDK_LINK_LIBRARIES} arrow_shared) set_source_files_properties(filesystem/s3fs.cc filesystem/s3fs_module.cc - PROPERTIES SKIP_PRECOMPILE_HEADERS ON - SKIP_UNITY_BUILD_INCLUSION ON) + PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() endif() @@ -914,18 +968,10 @@ if(ARROW_IPC) ipc/options.cc ipc/reader.cc ipc/writer.cc) - if(ARROW_JSON) - list(APPEND ARROW_IPC_SRCS ipc/json_simple.cc) - endif() arrow_add_object_library(ARROW_IPC ${ARROW_IPC_SRCS}) foreach(ARROW_IPC_TARGET ${ARROW_IPC_TARGETS}) target_link_libraries(${ARROW_IPC_TARGET} PRIVATE 
arrow::flatbuffers) endforeach() - if(ARROW_JSON) - foreach(ARROW_IPC_TARGET ${ARROW_IPC_TARGETS}) - target_link_libraries(${ARROW_IPC_TARGET} PRIVATE RapidJSON) - endforeach() - endif() else() set(ARROW_IPC_TARGET_SHARED) set(ARROW_IPC_TARGET_STATIC) @@ -939,6 +985,7 @@ if(ARROW_JSON) json/chunked_builder.cc json/chunker.cc json/converter.cc + json/from_string.cc json/object_parser.cc json/object_writer.cc json/parser.cc @@ -1042,13 +1089,11 @@ add_arrow_lib(arrow ${ARROW_SRCS} OUTPUTS ARROW_LIBRARIES - PRECOMPILED_HEADERS - "$<$:arrow/pch.h>" SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} SHARED_PRIVATE_LINK_LIBS ${ARROW_ARRAY_TARGET_SHARED} - ${ARROW_COMPUTE_TARGET_SHARED} + ${ARROW_COMPUTE_CORE_TARGET_SHARED} ${ARROW_CSV_TARGET_SHARED} ${ARROW_FILESYSTEM_TARGET_SHARED} ${ARROW_INTEGRATION_TARGET_SHARED} @@ -1064,7 +1109,7 @@ add_arrow_lib(arrow ${ARROW_SYSTEM_LINK_LIBS} STATIC_LINK_LIBS ${ARROW_ARRAY_TARGET_STATIC} - ${ARROW_COMPUTE_TARGET_STATIC} + ${ARROW_COMPUTE_CORE_TARGET_STATIC} ${ARROW_CSV_TARGET_STATIC} ${ARROW_FILESYSTEM_TARGET_STATIC} ${ARROW_INTEGRATION_TARGET_STATIC} @@ -1148,8 +1193,6 @@ if(ARROW_TESTING) ${ARROW_TESTING_SRCS} OUTPUTS ARROW_TESTING_LIBRARIES - PRECOMPILED_HEADERS - "$<$:arrow/pch.h>" SHARED_LINK_LIBS ${ARROW_TESTING_SHARED_LINK_LIBS} SHARED_PRIVATE_LINK_LIBS @@ -1197,9 +1240,7 @@ add_arrow_test(array_test array/array_struct_test.cc array/array_union_test.cc array/array_view_test.cc - array/statistics_test.cc - PRECOMPILED_HEADERS - "$<$:arrow/testing/pch.h>") + array/statistics_test.cc) add_arrow_test(buffer_test) @@ -1217,8 +1258,7 @@ add_arrow_test(misc_test status_test.cc) add_arrow_test(public_api_test) -set_source_files_properties(public_api_test.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON - SKIP_UNITY_BUILD_INCLUSION ON) +set_source_files_properties(public_api_test.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) add_arrow_test(scalar_test) add_arrow_test(type_test SOURCES field_ref_test.cc type_test.cc) diff --git 
a/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in b/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in index 66aa2b4078c..47488e8ac86 100644 --- a/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in +++ b/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in @@ -26,8 +26,12 @@ @PACKAGE_INIT@ +set(ARROW_ACERO_REQUIRED_DEPENDENCIES "@ARROW_ACERO_REQUIRED_DEPENDENCIES@") + include(CMakeFindDependencyMacro) -find_dependency(Arrow) +foreach(dependency ${ARROW_ACERO_REQUIRED_DEPENDENCIES}) + find_dependency(${dependency} CONFIG) +endforeach() include("${CMAKE_CURRENT_LIST_DIR}/ArrowAceroTargets.cmake") diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 5708d71737c..dc18afa9797 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -28,6 +28,9 @@ if(NOT ARROW_BUILD_SHARED AND ARROW_BUILD_STATIC) set(ARROW_ACERO_PC_CFLAGS_PRIVATE "") endif() +set(ARROW_ACERO_PKG_CONFIG_REQUIRES "arrow-compute") +set(ARROW_ACERO_REQUIRED_DEPENDENCIES Arrow ArrowCompute) + set(ARROW_ACERO_SRCS accumulation_queue.cc scalar_aggregate_node.cc @@ -73,10 +76,12 @@ if(ARROW_WITH_OPENTELEMETRY) list(APPEND ARROW_ACERO_STATIC_LINK_LIBS ${ARROW_OPENTELEMETRY_LIBS}) endif() -list(APPEND ARROW_ACERO_STATIC_INSTALL_INTERFACE_LIBS Arrow::arrow_static) -list(APPEND ARROW_ACERO_SHARED_INSTALL_INTERFACE_LIBS Arrow::arrow_shared) -list(APPEND ARROW_ACERO_STATIC_LINK_LIBS arrow_static) -list(APPEND ARROW_ACERO_SHARED_LINK_LIBS arrow_shared) +list(APPEND ARROW_ACERO_STATIC_INSTALL_INTERFACE_LIBS Arrow::arrow_static + ArrowCompute::arrow_compute_static) +list(APPEND ARROW_ACERO_SHARED_INSTALL_INTERFACE_LIBS Arrow::arrow_shared + ArrowCompute::arrow_compute_shared) +list(APPEND ARROW_ACERO_STATIC_LINK_LIBS arrow_static arrow_compute_static) +list(APPEND ARROW_ACERO_SHARED_LINK_LIBS arrow_shared arrow_compute_shared) add_arrow_lib(arrow_acero CMAKE_PACKAGE_NAME @@ -87,8 +92,6 @@ add_arrow_lib(arrow_acero ARROW_ACERO_LIBRARIES SOURCES ${ARROW_ACERO_SRCS} - 
PRECOMPILED_HEADERS - "$<$:arrow/acero/pch.h>" SHARED_LINK_LIBS ${ARROW_ACERO_SHARED_LINK_LIBS} SHARED_PRIVATE_LINK_LIBS @@ -126,6 +129,10 @@ if(ARROW_TESTING) if(ARROW_WITH_OPENTELEMETRY) target_link_libraries(arrow_acero_testing PRIVATE ${ARROW_OPENTELEMETRY_LIBS}) endif() + # arrow_compute_testing will register the kernels for Gtest. In order to register the kernels + # for Google benchmark we use a custom main function used on add_arrow_compute_benchmark. + set(ARROW_ACERO_BENCHMARKS_TEST_LINK_LIBS + ${ARROW_ACERO_TEST_LINK_LIBS} arrow_acero_testing arrow_compute_core_testing) list(APPEND ARROW_ACERO_TEST_LINK_LIBS arrow_acero_testing arrow_compute_testing) endif() # Only for hash_aggregate_test.cc. @@ -183,68 +190,54 @@ add_arrow_acero_test(aggregate_node_test SOURCES aggregate_node_test.cc) add_arrow_acero_test(util_test SOURCES util_test.cc task_util_test.cc) add_arrow_acero_test(hash_aggregate_test SOURCES hash_aggregate_test.cc) -if(ARROW_BUILD_BENCHMARKS) - function(add_arrow_acero_benchmark REL_BENCHMARK_NAME) - set(options) - set(one_value_args PREFIX) - set(multi_value_args LABELS) - cmake_parse_arguments(ARG - "${options}" - "${one_value_args}" - "${multi_value_args}" - ${ARGN}) - - if(ARG_PREFIX) - set(PREFIX ${ARG_PREFIX}) - else() - set(PREFIX "arrow-acero") - endif() - - if(ARG_LABELS) - set(LABELS ${ARG_LABELS}) - else() - set(LABELS "arrow_acero") - endif() - - add_arrow_benchmark(${REL_BENCHMARK_NAME} - EXTRA_LINK_LIBS - ${ARROW_ACERO_TEST_LINK_LIBS} - PREFIX - ${PREFIX} - LABELS - ${LABELS} - ${ARG_UNPARSED_ARGUMENTS}) - endfunction() - - add_arrow_acero_benchmark(expression_benchmark SOURCES expression_benchmark.cc) - - add_arrow_acero_benchmark(filter_benchmark SOURCES benchmark_util.cc - filter_benchmark.cc) - - add_arrow_acero_benchmark(project_benchmark SOURCES benchmark_util.cc - project_benchmark.cc) - - add_arrow_acero_benchmark(asof_join_benchmark SOURCES asof_join_benchmark.cc) - - add_arrow_acero_benchmark(tpch_benchmark SOURCES 
tpch_benchmark.cc) - - add_arrow_acero_benchmark(aggregate_benchmark SOURCES aggregate_benchmark.cc) - - add_arrow_acero_benchmark(hash_join_benchmark SOURCES hash_join_benchmark.cc) - - if(ARROW_BUILD_STATIC) - target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_static) - target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_static) - target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_static) - target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_static) - target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_static) - target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_static) +add_arrow_acero_test(test_util_internal_test SOURCES test_util_internal_test.cc) + +function(add_arrow_acero_benchmark REL_BENCHMARK_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args LABELS EXTRA_LINK_LIBS) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + + if(ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) else() - target_link_libraries(arrow-acero-expression-benchmark PUBLIC arrow_acero_shared) - target_link_libraries(arrow-acero-filter-benchmark PUBLIC arrow_acero_shared) - target_link_libraries(arrow-acero-project-benchmark PUBLIC arrow_acero_shared) - target_link_libraries(arrow-acero-asof-join-benchmark PUBLIC arrow_acero_shared) - target_link_libraries(arrow-acero-tpch-benchmark PUBLIC arrow_acero_shared) - target_link_libraries(arrow-acero-hash-join-benchmark PUBLIC arrow_acero_shared) + set(PREFIX "arrow-acero") endif() -endif() + + if(ARG_LABELS) + set(LABELS ${ARG_LABELS}) + else() + set(LABELS "arrow_acero") + endif() + + if(ARROW_TEST_LINKAGE STREQUAL "static") + set(EXTRA_LINK_LIBS arrow_acero_static ${ARROW_ACERO_BENCHMARKS_TEST_LINK_LIBS}) + else() + set(EXTRA_LINK_LIBS arrow_acero_shared ${ARROW_ACERO_BENCHMARKS_TEST_LINK_LIBS}) + endif() + if(ARG_EXTRA_LINK_LIBS) + list(APPEND EXTRA_LINK_LIBS 
${ARG_EXTRA_LINK_LIBS}) + endif() + + add_arrow_compute_benchmark(${REL_BENCHMARK_NAME} + EXTRA_LINK_LIBS + ${EXTRA_LINK_LIBS} + PREFIX + ${PREFIX} + LABELS + ${LABELS} + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + +add_arrow_acero_benchmark(aggregate_benchmark) +add_arrow_acero_benchmark(asof_join_benchmark) +add_arrow_acero_benchmark(expression_benchmark) +add_arrow_acero_benchmark(filter_benchmark SOURCES benchmark_util.cc filter_benchmark.cc) +add_arrow_acero_benchmark(hash_join_benchmark) +add_arrow_acero_benchmark(project_benchmark SOURCES benchmark_util.cc + project_benchmark.cc) +add_arrow_acero_benchmark(tpch_benchmark) diff --git a/cpp/src/arrow/acero/aggregate_internal.cc b/cpp/src/arrow/acero/aggregate_internal.cc index 87424ae1bb9..ac47921bf46 100644 --- a/cpp/src/arrow/acero/aggregate_internal.cc +++ b/cpp/src/arrow/acero/aggregate_internal.cc @@ -23,6 +23,7 @@ #include "arrow/acero/aggregate_internal.h" #include "arrow/acero/aggregate_node.h" #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/compute/exec.h" #include "arrow/compute/function.h" diff --git a/cpp/src/arrow/acero/arrow-acero.pc.in b/cpp/src/arrow/acero/arrow-acero.pc.in index ddddd52c4dd..94249cd78bd 100644 --- a/cpp/src/arrow/acero/arrow-acero.pc.in +++ b/cpp/src/arrow/acero/arrow-acero.pc.in @@ -22,7 +22,7 @@ libdir=@ARROW_PKG_CONFIG_LIBDIR@ Name: Apache Arrow Acero Engine Description: Apache Arrow's Acero Engine. 
Version: @ARROW_VERSION@ -Requires: arrow +Requires: @ARROW_ACERO_PKG_CONFIG_REQUIRES@ Libs: -L${libdir} -larrow_acero Cflags:@ARROW_ACERO_PC_CFLAGS@ Cflags.private:@ARROW_ACERO_PC_CFLAGS_PRIVATE@ diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index c21af3da84f..3970050e502 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -32,6 +32,7 @@ #include #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/acero/unmaterialized_table_internal.h" #ifndef NDEBUG @@ -513,9 +514,9 @@ class InputState : public util::SerialSequencingQueue::Processor { std::unique_ptr backpressure_control = std::make_unique( /*node=*/asof_input, /*output=*/asof_node, backpressure_counter); - ARROW_ASSIGN_OR_RAISE( - auto handler, BackpressureHandler::Make(asof_input, low_threshold, high_threshold, - std::move(backpressure_control))); + ARROW_ASSIGN_OR_RAISE(auto handler, + BackpressureHandler::Make(low_threshold, high_threshold, + std::move(backpressure_control))); return std::make_unique(index, tolerance, must_hash, may_rehash, key_hasher, asof_node, std::move(handler), schema, time_col_index, key_col_index); @@ -639,12 +640,10 @@ class InputState : public util::SerialSequencingQueue::Processor { // hit the end of the batch, need to get the next batch if possible. 
++batches_processed_; latest_ref_row_ = 0; - have_active_batch &= !queue_.TryPop(); - if (have_active_batch) { - DCHECK_GT(queue_.Front()->num_rows(), 0); // empty batches disallowed - memo_.UpdateTime(GetTime(queue_.Front().get(), time_type_id_, time_col_index_, - 0)); // time changed - } + bool did_pop = queue_.TryPop().has_value(); + DCHECK(did_pop); + ARROW_UNUSED(did_pop); + have_active_batch = !queue_.Empty(); } } return have_active_batch; @@ -764,10 +763,10 @@ class InputState : public util::SerialSequencingQueue::Processor { total_batches_ = n; } - Status ForceShutdown() { + void ForceShutdown() { // Force the upstream input node to unpause. Necessary to avoid deadlock when we // terminate the process thread - return queue_.ForceShutdown(); + queue_.ForceShutdown(); } private: @@ -1047,8 +1046,10 @@ class AsofJoinNode : public ExecNode { if (st.ok()) { st = output_->InputFinished(this, batches_produced_); } - for (const auto& s : state_) { - st &= s->ForceShutdown(); + for (size_t i = 0; i < state_.size(); ++i) { + const auto& s = state_[i]; + s->ForceShutdown(); + st &= inputs_[i]->StopProducing(); } })); } @@ -1500,8 +1501,11 @@ class AsofJoinNode : public ExecNode { if (st.ok()) { st = output_->InputFinished(this, batches_produced_); } - for (const auto& s : state_) { - st &= s->ForceShutdown(); + + for (size_t i = 0; i < state_.size(); ++i) { + const auto& s = state_[i]; + s->ForceShutdown(); + st &= inputs_[i]->StopProducing(); } } diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index 271ad6018f2..59a9b4ebba1 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -44,6 +44,7 @@ #include "arrow/compute/cast.h" #include "arrow/compute/row/row_encoder_internal.h" #include "arrow/compute/test_util_internal.h" +#include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/testing/random.h" 
@@ -1770,5 +1771,58 @@ TEST(AsofJoinTest, DestroyNonStartedAsofJoinNode) { DeclarationToStatus(std::move(sink))); } +// Reproduction of GH-46224: Hang when all left timestamps are greater than right +// timestamps. +TEST(AsofJoinTest, OneSideTsAllGreaterThanTheOther) { +#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) + const int rounds = 1; +#else + const int rounds = 42; +#endif + int64_t tolerance = 1; + int64_t num_rows_big_ts = 1; + int64_t num_rows_small_ts = ExecPlan::kMaxBatchSize + 1; + // Make sure the big_ts is outside the horizon of the tolerance regardless of the side. + int64_t big_ts = num_rows_small_ts + tolerance + 1; + + // Column of big timestamps. + ASSERT_OK_AND_ASSIGN(auto col_big_ts, + gen::Constant(MakeScalar(big_ts))->Generate(num_rows_big_ts)); + // Column of small timestamps from 0 to num_rows_small_ts - 1. + ASSERT_OK_AND_ASSIGN(auto col_small_ts, + gen::Step()->Generate(num_rows_small_ts)); + + struct Case { + std::shared_ptr left_col; + std::shared_ptr right_col; + }; + + for (const auto& c : { + Case{col_big_ts, col_small_ts}, + Case{col_small_ts, col_big_ts}, + }) { + auto left_schema = arrow::schema({arrow::field("on", int64())}); + auto right_schema = arrow::schema({arrow::field("on", int64())}); + + ExecBatch left_batch({c.left_col}, c.left_col->length()); + ExecBatch right_batch({c.right_col}, c.right_col->length()); + ASSERT_OK_AND_ASSIGN(auto col_null, MakeArrayOfNull(int64(), c.left_col->length())); + ExecBatch exp_batch({c.left_col, col_null}, c.left_col->length()); + + // Run moderate number of times to ensure that no hangs occur. 
+ for (int i = 0; i < rounds; ++i) { + AsofJoinNodeOptions opts({{{"on"}, {}}, {{"on"}, {}}}, tolerance); + auto left = Declaration("exec_batch_source", + ExecBatchSourceNodeOptions(left_schema, {left_batch})); + auto right = Declaration("exec_batch_source", + ExecBatchSourceNodeOptions(right_schema, {right_batch})); + auto asof_join = arrow::acero::Declaration{"asofjoin", {left, right}, opts}; + ASSERT_OK_AND_ASSIGN(auto result, + arrow::acero::DeclarationToExecBatches(std::move(asof_join))); + AssertExecBatchesEqualIgnoringOrder(result.schema, {exp_batch}, result.batches); + } + } +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/backpressure_handler.h b/cpp/src/arrow/acero/backpressure_handler.h index db6c3799354..c6a47e60197 100644 --- a/cpp/src/arrow/acero/backpressure_handler.h +++ b/cpp/src/arrow/acero/backpressure_handler.h @@ -25,16 +25,15 @@ namespace arrow::acero { class BackpressureHandler { private: - BackpressureHandler(ExecNode* input, size_t low_threshold, size_t high_threshold, + BackpressureHandler(size_t low_threshold, size_t high_threshold, std::unique_ptr backpressure_control) - : input_(input), - low_threshold_(low_threshold), + : low_threshold_(low_threshold), high_threshold_(high_threshold), backpressure_control_(std::move(backpressure_control)) {} public: static Result Make( - ExecNode* input, size_t low_threshold, size_t high_threshold, + size_t low_threshold, size_t high_threshold, std::unique_ptr backpressure_control) { if (low_threshold >= high_threshold) { return Status::Invalid("low threshold (", low_threshold, @@ -43,7 +42,7 @@ class BackpressureHandler { if (backpressure_control == NULLPTR) { return Status::Invalid("null backpressure control parameter"); } - BackpressureHandler backpressure_handler(input, low_threshold, high_threshold, + BackpressureHandler backpressure_handler(low_threshold, high_threshold, std::move(backpressure_control)); return backpressure_handler; } @@ -56,16 +55,7 @@ class 
BackpressureHandler { } } - Status ForceShutdown() { - // It may be unintuitive to call Resume() here, but this is to avoid a deadlock. - // Since acero's executor won't terminate if any one node is paused, we need to - // force resume the node before stopping production. - backpressure_control_->Resume(); - return input_->StopProducing(); - } - private: - ExecNode* input_; size_t low_threshold_; size_t high_threshold_; std::unique_ptr backpressure_control_; diff --git a/cpp/src/arrow/acero/concurrent_queue_internal.h b/cpp/src/arrow/acero/concurrent_queue_internal.h index a751db70262..b91daae8b04 100644 --- a/cpp/src/arrow/acero/concurrent_queue_internal.h +++ b/cpp/src/arrow/acero/concurrent_queue_internal.h @@ -113,7 +113,7 @@ class ConcurrentQueue { }; template -class BackpressureConcurrentQueue : public ConcurrentQueue { +class BackpressureConcurrentQueue : private ConcurrentQueue { private: struct DoHandle { explicit DoHandle(BackpressureConcurrentQueue& queue) @@ -134,6 +134,9 @@ class BackpressureConcurrentQueue : public ConcurrentQueue { explicit BackpressureConcurrentQueue(BackpressureHandler handler) : handler_(std::move(handler)) {} + using ConcurrentQueue::Empty; + using ConcurrentQueue::Front; + // Pops the last item from the queue but waits if the queue is empty until new items are // pushed. 
T WaitAndPop() { @@ -152,6 +155,7 @@ class BackpressureConcurrentQueue : public ConcurrentQueue { // Pushes an item to the queue void Push(const T& item) { + if (shutdown_) return; std::unique_lock lock(ConcurrentQueue::GetMutex()); DoHandle do_handle(*this); ConcurrentQueue::PushUnlocked(item); @@ -164,10 +168,14 @@ class BackpressureConcurrentQueue : public ConcurrentQueue { ConcurrentQueue::ClearUnlocked(); } - Status ForceShutdown() { return handler_.ForceShutdown(); } + void ForceShutdown() { + shutdown_ = true; + Clear(); + } private: BackpressureHandler handler_; + std::atomic shutdown_{false}; }; } // namespace arrow::acero diff --git a/cpp/src/arrow/acero/doc/key_map.md b/cpp/src/arrow/acero/doc/key_map.md deleted file mode 100644 index a676343bbeb..00000000000 --- a/cpp/src/arrow/acero/doc/key_map.md +++ /dev/null @@ -1,223 +0,0 @@ - - -# Swiss Table - -A specialized hash table implementation used to dynamically map combinations of key field values to a dense set of integer ids. Ids can later be used in place of keys to identify groups of rows with equal keys. - -## Introduction - -Hash group-by in Arrow uses a variant of a hash table based on a data structure called Swiss table. Swiss table uses linear probing. There is an array of slots and the information related to inserted keys is stored in these slots. A hash function determines the slot where the search for a matching key will start during hash table lookup. Then the slots are visited sequentially, wrapping around the end of an array, until either a match or an empty slot is found, the latter case meaning that there is no match. Swiss table organizes the slots in blocks of 8 and has a design that enables data level parallelism at the block level. More precisely, it allows for visiting all slots within a block at once during lookups, by simply using 64-bit arithmetic. 
SIMD instructions can further enhance this data level parallelism allowing to process multiple blocks related to multiple input keys together using SIMD vectors of 64-bit elements. Occupied slots within a block are always clustered together. The name Swiss table comes from likening resulting sequences of empty slots to holes in a one dimensional cheese. - -## Interface - -Hash table used in query processing for implementing join and group-by operators does not need to provide all of the operations that a general purpose hash table would. Simplified requirements can help achieve a simpler and more efficient design. For instance we do not need to be able to remove previously inserted keys. It’s an append-only data structure: new keys can be added but old keys are never erased. Also, only a single copy of each key can be inserted - it is like `std::map` in that sense and not `std::multimap`. - -Our Swiss table is fully vectorized. That means that all methods work on vectors of input keys processing them in batches. Specialized SIMD implementations of processing functions are almost always provided for performance critical operations. All callback interfaces used from the core hash table code are also designed to work on batches of inputs instead of individual keys. The batch size can be almost arbitrary and is selected by the client of the hash table. Batch size should be the smallest number of input items, big enough so that the benefits of vectorization and SIMD can be fully experienced. Keeping it small means less memory used for temporary arrays storing intermediate results of computation (vector equivalent of some temporary variables kept on the stack). That in turn means smaller space in CPU caches, which also means less impact on other memory access intensive operations. We pick 1024 as the default size of the batch. We will call it a **mini-batch** to distinguish it from potentially other forms of batches used at higher levels in the code, e.g. 
when scheduling work for worker threads or relational operators inside an analytic query. - -The main functionality provided by Swiss table is mapping of arbitrarily complex keys to unique integer ids. Let us call it **lookup-or-insert**. Given a sequence of key values, return a corresponding sequence of integer ids, such that all keys that are equal receive the same id and for K distinct keys the integer ids will be assigned from the set of numbers 0 to (K-1). If we find a matching key in a hash table for a given input, we return the **key id** assigned when the key was first inserted into a hash table. If we fail to find an already inserted match, we assign the first unused integer as a key id and add a new entry to a hash table. Due to vectorized processing, which may result in out-of-order processing of individual inputs, it is not guaranteed that if there are two new key values in the same input batch and one of them appears earlier in the input sequence, then it will receive a smaller key id. Additional mapping functionality can be built on top of basic mapping to integer key id, for instance if we want to assign and perhaps keep updating some values to all unique keys, we can keep these values in a resizable vector indexed by obtained key id. - -The implementation of Swiss table does not need to have any information related to the domain of the keys. It does not use their logical data type or information about their physical representation and does not even use pointers to keys. All access to keys is delegated to a separate class or classes that provide callback functions for three operations: -- computing hashes of keys; -- checking equality for given pairs of keys; -- appending a given sequence of keys to a stack maintained outside of Swiss table object, so that they can be referenced later on by key ids (key ids will be equal to their positions in the stack). - - -When passing arguments to callback functions the keys are referenced using integer ids. 
For the left side - that is the keys present in the input mini-batch - ordinal positions within that mini-batch are used. For the right side - that is the keys inserted into the hash table - these are identified by key ids assigned to them and stored inside Swiss table when they were first encountered and processed. - -Diagram with logical view of information passing in callbacks: - -![alt text](img/key_map_1.jpg) - -Hash table values for inserted keys are also stored inside Swiss table. Because of that, hash table logic does not need to ever re-evaluate the hash, and there is actually no need for a hash function callback. It is enough that the caller provides hash values for all entries in the batch when calling lookup-or-insert. - -## Basic architecture and organization of data -The hash table is an array of **slots**. Slots are grouped in groups of 8 called **blocks**. The number of blocks is a power of 2. The empty hash table starts with a single block, with all slots empty. Then, as the keys are getting inserted and the amount of empty slots is shrinking, at some point resizing of the hash table is triggered. The data stored in slots is moved to a new hash table that has the double of the number of blocks. - -The diagram below shows the basic organization of data in our implementation of Swiss table: - -![alt text](img/key_map_2.jpg) - -N is the log of the number of blocks, 2n+3 is the number of slots and also the maximum number of inserted keys and hence (N + 3) is the number of bits required to store a key id. We will refer to N as the **size of the hash table**. - -Index of a block within an array will be called **block id**, and similarly index of a slot will be **slot id**. Sometimes we will focus on a single block and refer to slots that belong to it by using a **local slot id**, which is an index from 0 to 7. - -Every slot can either be **empty** or store data related to a single inserted key. 
There are three pieces of information stored inside a slot: -- status byte, -- key id, -- key hash. - -Status byte, as the name suggests, stores 8 bits. The highest bit indicates if the slot is empty (the highest bit is set) or corresponds to one of inserted keys (the highest bit is zero). The remaining 7 bits contain 7 bits of key hash that we call a **stamp**. The stamp is used to eliminate some false positives when searching for a matching key for a given input. Slot also stores **key id**, which is a non-negative integer smaller than the number of inserted keys, that is used as a reference to the actual inserted key. The last piece of information related to an inserted key is its **hash** value. We store hashes for all keys, so that they never need to be re-computed. That greatly simplifies some operations, like resizing of a hash table, that may not even need to look at the keys at all. For an empty slot, the status byte is 0x80, key id is zero and the hash is not used and can be set to any number. - -A single block contains 8 slots and can be viewed as a micro-stack of up to 8 inserted keys. When the first key is inserted into an empty block, it will occupy a slot with local id 0. The second inserted key will go into slot number 1 and so on. We use N highest bits of hash to get an index of a **start block**, when searching for a match or an empty slot to insert a previously not seen key when that is the case. If the start block contains any empty slots, then the search for either a match or place to insert a key will end at that block. We will call such a block an **open block**. A block that is not open is a full block. In the case of full block, the input key related search may continue in the next block modulo the number of blocks. If the key is not inserted into its start block, we will refer to it as an **overflow** entry, other entries being **non-overflow**. 
Overflow entries are slower to process, since they require visiting more than one block, so we want to keep their percentage low. This is done by choosing the right **load factor** (percentage of occupied slots in the hash table) at which the hash table gets resized and the number of blocks gets doubled. By tuning this value we can control the probability of encountering an overflow entry. - -The most interesting part of each block is the set of status bytes of its slots, which is simply a single 64-bit word. The implementation of efficient searches across these bytes during lookups require using either leading zero count or trailing zero count intrinsic. Since there are cases when only the first one is available, in order to take advantage of it, we order the bytes in the 64-bit status word so that the first slot within a block uses the highest byte and the last one uses the lowest byte (slots are in reversed bytes order). The diagram below shows how the information about slots is stored within a 64-bit status word: - -![alt text](img/key_map_3.jpg) - -Each status byte has a 7-bit fragment of hash value - a **stamp** - and an empty slot bit. Empty slots have status byte equal to 0x80 - the highest bit is set to 1 to indicate an empty slot and the lowest bits, which are used by a stamp, are set to zero. - -The diagram below shows which bits of hash value are used by hash table: - -![alt text](img/key_map_4.jpg) - -If a hash table has 2N blocks, then we use N highest bits of a hash to select a start block when searching for a match. The next 7 bits are used as a stamp. Using the highest bits to pick a start block means that a range of hash values can be easily mapped to a range of block ids of start blocks for hashes in that range. This is useful when resizing a hash table or merging two hash tables together. - -### Interleaving status bytes and key ids - -Status bytes and key ids for all slots are stored in a single array of bytes. 
They are first grouped by 8 into blocks, then each block of status bytes is interleaved with a corresponding block of key ids. Finally key ids are represented using the smallest possible number of bits and bit-packed (bits representing each next key id start right after the last bit of the previous key id). Note that regardless of the chosen number of bits, a block of bit-packed key ids (that is 8 of them) will start and end on the byte boundary. - -The diagram below shows the organization of bytes and bits of a single block in interleaved array: -![alt text](img/key_map_5.jpg) - -From the size of the hash table we can derive the number K of bits needed in the worst case to encode any key id. K is equal to the number of bits needed to represent slot id (number of keys is not greater than the number of slots and any key id is strictly less than the number of keys), which for a hash table of size N (N blocks) equals (N+3). To simplify bit packing and unpacking and avoid handling of special cases, we will round up K to full bytes for K > 24 bits. - -Status bytes are stored in a single 64-bit word in reverse byte order (the last byte corresponds to the slot with local id 0). On the other hand key ids are stored in the normal order (the order of slot ids). - -Since both status byte and key id for a given slot are stored in the same array close to each other, we can expect that most of the lookups will read only one CPU cache-line from memory inside Swiss table code (then at least another one outside Swiss table to access the bytes of the key for the purpose of comparison). Even if we hit an overflow entry, it is still likely to reside on the same cache-line as the start block data. Hash values, which are stored separately from status byte and key id, are only used when resizing and do not impact the lookups outside these events. - -> Improvement to consider: -> In addition to the Swiss table data, we need to store an array of inserted keys, one for each key id. 
If keys are of fixed length, then the address of the bytes of the key can be calculated by multiplying key id by the common length of the key. If keys are of varying length, then there will be an additional array with an offset of each key within the array of concatenated bytes of keys. That means that any key comparison during lookup will involve 3 arrays: one to get key id, one to get key offset and final one with bytes of the key. This could be reduced to 2 array lookups if we stored key offset instead of key id interleaved with slot status bytes. Offset indexed by key id and stored in its own array becomes offset indexed by slot id and stored interleaved with slot status bytes. At the same time key id indexed by slot id and interleaved with slot status bytes before becomes key id referenced using offset and stored with key bytes. There may be a slight increase in the total size of memory needed by the hash table, equal to the difference in the number of bits used to store offset and those used to store key id, multiplied by the number of slots, but that should be a small fraction of the total size. - -### 32-bit hash vs 64-bit hash - -Currently we use 32-bit hash values in Swiss table code and 32-bit integers as key ids. For the robust implementation, sooner or later we will need to support 64-bit hash and 64-bit key ids. When we use 32-bit hash, it means that we run out of hash bits when hash table size N is greater than 25 (25 bits of hash needed to select a block and 7 bits needed to generate a stamp byte reach 32 total bits). When the number of inserted keys exceeds the maximal number of keys stored in a hash table of size 25 (which is at least 224), the chance of false positives during lookups will start quickly growing. 32-bit hash should not be used with more than about 16 million inserted keys. 
- -### Low memory footprint and low chance of hash collisions - -Swiss table is a good choice of a hash table for modern hardware, because it combines lookups that can take advantage of special CPU instructions with space efficiency and low chance of hash collisions. - -Space efficiency is important for performance, because the cost of random array accesses, often dominating the lookup cost for larger hash tables, increases with the size of the arrays. This happens due to limited space of CPU caches. Let us look at what is the amortized additional storage cost for a key in a hash table apart from the essential cost of storing data of all those keys. Furthermore, we can skip the storage of hash values, since these are only used during infrequent hash table resize operations (should not have a big impact on CPU cache usage in normal cases). - -Half full hash table of size N will use 2 status bytes per inserted key (because for every filled slot there is one empty slot) and 2\*(N+3) bits for key id (again, one for the occupied slot and one for the empty). For N = 16 for instance this is slightly under 7 bytes per inserted key. - -Swiss table also has a low probability of false positives leading to wasted key comparisons. Here is some rationale behind why this should be the case. Hash table of size N can contain up to 2N+3 keys. Search for a match involves (N + 7) hash bits: N to select a start block and 7 to use as a stamp. There are always at least 16 times more combinations of used hash bits than there are keys in the hash table (32 times more if the hash table is half full). These numbers mean that the probability of false positives resulting from a search for a matching slot should be low. That corresponds to an expected number of comparisons per lookup being close to 1 for keys already present and 0 for new keys. 
- -## Lookup - -Lookup-or-insert operation, given a hash of a key, finds a list of candidate slots with corresponding keys that are likely to be equal to the input key. The list may be empty, which means that the key does not exist yet in the hash table. If it is not empty, then the callback function for key comparison is called for each next candidate to verify that there is indeed a match. False positives get rejected and we end up either finding an actual match or an empty slot, which means that the key is new to the hash table. New keys get assigned next available integers as key ids, and are appended to the set of keys stored in the hash table. As a result of inserting new keys to the hash table, the density of occupied slots may reach an upper limit, at which point the hash table will be resized and will afterwards have twice as many slots. That is in summary lookup-or-insert functionality, but the actual implementation is a bit more involved, because of vectorization of the processing and various optimizations for common cases. - -### Search within a single block - -There are three possible cases that can occur when searching for a match for a given key (that is, for a given stamp of a key) within a single block, illustrated below. - - 1. There is a matching stamp in the block of status bytes: - -![alt text](img/key_map_6.jpg) - - 2. There is no matching stamp in the block, but there is an empty slot in the block: - -![alt text](img/key_map_7.jpg) - - 3. There is no matching stamp in the block and the block is full (there are no empty slots left): - -![alt text](img/key_map_8.jpg) - -64-bit arithmetic can be used to search for a matching slot within the entire single block at once, without iterating over all slots in it. Following is an example of a sequence of steps to find the first status byte for a given stamp, returning the first empty slot on miss if the block is not full or 8 (one past maximum local slot id) otherwise. 
- -Following is a sketch of the possible steps to execute when searching for the matching stamp in a single block. - -*Example will use input stamp 0x5E and a 64-bit status bytes word with one empty slot: -0x 4B17 5E3A 5E2B 1180*. - -1. [1 instruction] Replicate stamp to all bytes by multiplying it by 0x 0101 0101 0101 0101. - - *We obtain: 0x 5E5E 5E5E 5E5E 5E5E.* - -2. [1 instruction] XOR replicated stamp with status bytes word. Bytes corresponding to a matching stamp will be 0, bytes corresponding to empty slots will have a value between 128 and 255, bytes corresponding to non-matching non-empty slots will have a value between 1 and 127. - - *We obtain: 0x 1549 0064 0075 4FDE.* - -3. [2 instructions] In the next step we want to have information about a match in the highest bit of each byte. We can ignore here empty slot bytes, because they will be taken care of at a later step. Set the highest bit in each byte (OR with 0x 8080 8080 8080 8080) and then subtract 1 from each byte (subtract 0x 0101 0101 0101 0101 from 64-bit word). Now if a byte corresponds to a non-empty slot then the highest bit 0 indicates a match and 1 indicates a miss. - - *We obtain: 0x 95C9 80E4 80F5 CFDE, - then 0x 94C8 7FE3 7FF4 CEDD.* - -4. [3 instructions] In the next step we want to obtain in each byte one of two values: 0x80 if it is either an empty slot or a match, 0x00 otherwise. We do it in three steps: NOT the result of the previous step to change the meaning of the highest bit; OR with the original status word to set highest bit in a byte to 1 for empty slots; mask out everything other than the highest bits in all bytes (AND with 0x 8080 8080 8080 8080). - - *We obtain: 6B37 801C 800B 3122, - then 6B37 DE3E DE2B 31A2, - finally 0x0000 8000 8000 0080.* - -5. [2 instructions] Finally, use leading zero bits count and divide it by 8 to find an index of the last byte that corresponds either to a match or an empty slot. 
If the leading zero count intrinsic returns 64 for a 64-bit input zero, then after dividing by 8 we will also get the desired answer in case of a full block without any matches. - - *We obtain: 16, - then 2 (index of the first slot within the block that matches the stamp).* - -If SIMD instructions with 64-bit lanes are available, multiple single block searches for different keys can be executed together. For instance AVX2 instruction set allows to process quadruplets of 64-bit values in a single instruction, four searches at once. - -### Complete search potentially across multiple blocks - -Full implementation of a search for a matching key may involve visiting multiple blocks beginning with the start block selected based on the hash of the key. We move to the next block modulo the number of blocks, whenever we do not find a match in the current block and the current block is full. The search may also involve visiting one or more slots in each block. Visiting in this case means calling a comparison callback to verify the match whenever a slot with a matching stamp is encountered. Eventually the search stops when either: -- the matching key is found in one of the slots matching the stamp, or - -- an empty slot is reached. This is illustrated in the diagram below: -![alt text](img/key_map_9.jpg) - - -### Optimistic processing with two passes - -Hash table lookups may have high cost in the pessimistic case, when we encounter cases of hash collisions and full blocks that lead to visiting further blocks. In the majority of cases we can expect an optimistic situation - the start block is not full, so we will only visit this one block, and all stamps in the block are different, so we will need at most one comparison to find a match. We can expect about 90% of the key lookups for an existing key to go through the optimistic path of processing. For that reason it pays off to optimize especially for this 90% of inputs. 
- -Lookups in Swiss table are split into two passes over an input batch of keys. The **first pass: fast-path lookup** , is a highly optimized, vectorized, SIMD-friendly, branch-free code that fully handles optimistic cases. The **second pass: slow-path lookup** , is normally executed only for the selection of inputs that have not been finished in the first pass, although it can also be called directly on all of the inputs, skipping fast-path lookup. It handles all special cases and inserts but in order to be robust it is not as efficient as fast-path. Slow-path lookup does not need to repeat the work done in fast-path lookup - it can use the state reached at the end of fast-path lookup as a starting point. - -Fast-path lookup implements search only for the first stamp match and only within the start block. It only makes sense when we already have at least one key inserted into the hash table, since it does not handle inserts. It takes a vector of key hashes as an input and based on it outputs three pieces of information for each key: - -- Key id corresponding to the slot in which a matching stamp was found. Any valid key id if a matching stamp was not found. -- A flag indicating if a match was found or not. -- Slot id of a slot from which slow-path should pick up the search if the first match was either not found or it turns out to be false positive after evaluating key comparison. - -> Improvement to consider: -> precomputing 1st pass lookup results. -> -> If the hash table is small, the number of inserted keys is small, we could further simplify and speed-up the first pass by storing in a lookup table pre-computed results for all combinations of hash bits. Let us consider the case of Swiss table of size 5 that has 256 slots and up to 128 inserted keys. Only 12 bits of hash are used by lookup in that case: 5 to select a block, 7 to create a stamp. For all 212 combinations of those bits we could keep the result of first pass lookup in an array. 
Key id and a match indicating flag can use one byte: 7 bits for key id and 1 bit for the flag. Note that slot id is only needed if we go into 2nd pass lookup, so it can be stored separately and likely only accessed by a small subset of keys. Fast-path lookup becomes almost a single fetch of result from a 4KB array. Lookup arrays used to implement this need to be kept in sync with the main copy of data about slots, which requires extra care during inserts. Since the number of entries in lookup arrays is much higher than the number of slots, this technique only makes sense for small hash tables. - -### Dense comparisons - -If there is at least one key inserted into a hash table, then every slot contains a key id value that corresponds to some actual key that can be used in comparison. That is because empty slots are initialized with 0 as their key id. After the fast-path lookup we get a match-found flag for each input. If it is set, then we need to run a comparison of the input key with the key in the hash table identified by key id returned by fast-path code. The comparison will verify that there is a true match between the keys. We only need to do this for a subset of inputs that have a match candidate, but since we have key id values corresponding to some real key for all inputs, we may as well execute comparisons on all inputs unconditionally. If the majority (e.g. more than 80%) of the keys have a match candidate, the cost of evaluating comparison for the remaining fraction of keys but without filtering may actually be cheaper than the cost of running evaluation only for required keys while referencing filter information. This can be seen as a variant of general preconditioning techniques used to avoid diverging conditional branches in the code. It may be used, based on some heuristic, to verify matches reported by fast-path lookups and is referred to as **dense comparisons**. 
- -## Resizing - -A new hash table is initialized as empty and has only a single block with a space for only a few key entries. Doubling of the hash table size becomes necessary as more keys get inserted. It is invoked during the 2nd pass of the lookups, which also handles inserts. It happens immediately after the number of inserted keys reaches a specific upper limit decided based on a current size of the hash table. There may still be unprocessed entries from the input mini-batch after resizing, so the 2nd pass of the lookup is restarted right after, with the bigger hash table and the remaining subset of unprocessed entries. - -The current policy, which should work reasonably well, is to resize a small hash table (up to 8KB) when it is 50% full. Larger hash tables are resized when 75% full. We want to keep size in memory as small as possible, while maintaining a low probability of blocks becoming full. - -When discussing resizing we will be talking about **resize source** and **resize target** tables. The diagram below shows how the same hash bits are interpreted differently by the source and the target. - -![alt text](img/key_map_10.jpg) - -For a given hash, if a start block id was L in the source table, it will be either (2\*L+0) or (2\*L+1) in the target table. Based on that we can expect data access locality when migrating the data between the tables. - -Resizing is cheap also thanks to the fact that hash values for keys in the hash table are kept together with other slot data and do not need to be recomputed. That means that the resizing procedure does not ever need to access the actual bytes of the key. - -### 1st pass - -Based on the hash value for a given slot we can tell whether this slot contains an overflow or non-overflow entry. In the first pass we go over all source slots in sequence, filter out overflow entries and move to the target table all other entries. 
Non-overflow entries from a block L will be distributed between blocks (2\*L+0) and (2\*L+1) of the target table. None of these target blocks can overflow, since they will be accommodating at most 8 input entries during this pass. - -For every non-overflow entry, the highest bit of a stamp in the source slot decides whether it will go to the left or to the right target block. It is further possible to avoid any conditional branches in this partitioning code, so that the result is friendly to the CPU execution pipeline. - -![alt text](img/key_map_11.jpg) - - -### 2nd pass - -In the second pass of resizing, we scan all source slots again, this time focusing only on the overflow entries that were all skipped in the 1st pass. We simply reinsert them in the target table using generic insertion code with one exception. Since we know that all the source keys are different, there is no need to search for a matching stamp or run key comparisons (or look at the key values). We just need to find the first open block beginning with the start block in the target table and use its first empty slot as the insert destination. - -We expect overflow entries to be rare and therefore the relative cost of that pass should stay low. 
- diff --git a/cpp/src/arrow/acero/exec_plan.cc b/cpp/src/arrow/acero/exec_plan.cc index e27ae7b65a2..ff5e5d8bdd9 100644 --- a/cpp/src/arrow/acero/exec_plan.cc +++ b/cpp/src/arrow/acero/exec_plan.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/acero/query_context.h" #include "arrow/acero/task_util.h" @@ -1101,23 +1102,6 @@ Result> DeclarationToReader( return DeclarationToReader(std::move(declaration), std::move(options)); } -namespace internal { - -void RegisterSourceNode(ExecFactoryRegistry*); -void RegisterFetchNode(ExecFactoryRegistry*); -void RegisterFilterNode(ExecFactoryRegistry*); -void RegisterOrderByNode(ExecFactoryRegistry*); -void RegisterPivotLongerNode(ExecFactoryRegistry*); -void RegisterProjectNode(ExecFactoryRegistry*); -void RegisterUnionNode(ExecFactoryRegistry*); -void RegisterAggregateNode(ExecFactoryRegistry*); -void RegisterSinkNode(ExecFactoryRegistry*); -void RegisterHashJoinNode(ExecFactoryRegistry*); -void RegisterAsofJoinNode(ExecFactoryRegistry*); -void RegisterSortedMergeNode(ExecFactoryRegistry*); - -} // namespace internal - ExecFactoryRegistry* default_exec_factory_registry() { class DefaultRegistry : public ExecFactoryRegistry { public: diff --git a/cpp/src/arrow/acero/exec_plan_internal.h b/cpp/src/arrow/acero/exec_plan_internal.h new file mode 100644 index 00000000000..e9fe87b69e9 --- /dev/null +++ b/cpp/src/arrow/acero/exec_plan_internal.h @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/acero/exec_plan.h" + +namespace arrow::acero::internal { + +void RegisterSourceNode(ExecFactoryRegistry*); +void RegisterFetchNode(ExecFactoryRegistry*); +void RegisterFilterNode(ExecFactoryRegistry*); +void RegisterOrderByNode(ExecFactoryRegistry*); +void RegisterPivotLongerNode(ExecFactoryRegistry*); +void RegisterProjectNode(ExecFactoryRegistry*); +void RegisterUnionNode(ExecFactoryRegistry*); +void RegisterAggregateNode(ExecFactoryRegistry*); +void RegisterSinkNode(ExecFactoryRegistry*); +void RegisterHashJoinNode(ExecFactoryRegistry*); +void RegisterAsofJoinNode(ExecFactoryRegistry*); +void RegisterSortedMergeNode(ExecFactoryRegistry*); + +} // namespace arrow::acero::internal diff --git a/cpp/src/arrow/acero/fetch_node.cc b/cpp/src/arrow/acero/fetch_node.cc index 2b168b1e533..bf352698a9f 100644 --- a/cpp/src/arrow/acero/fetch_node.cc +++ b/cpp/src/arrow/acero/fetch_node.cc @@ -19,6 +19,7 @@ #include "arrow/acero/accumulation_queue.h" #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/map_node.h" #include "arrow/acero/options.h" #include "arrow/acero/query_context.h" diff --git a/cpp/src/arrow/acero/filter_node.cc b/cpp/src/arrow/acero/filter_node.cc index b0d500abac4..67de82497ef 100644 --- a/cpp/src/arrow/acero/filter_node.cc +++ b/cpp/src/arrow/acero/filter_node.cc @@ -16,6 +16,7 @@ // under the License. 
#include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/map_node.h" #include "arrow/acero/options.h" #include "arrow/acero/query_context.h" diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index dce0e44eb13..de414219eb4 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -937,8 +937,8 @@ TEST_P(GroupBy, SumMeanProductDecimal) { AssertDatumsEqual(ArrayFromJSON(struct_({ field("key_0", int64()), - field("hash_sum", decimal128(3, 2)), - field("hash_sum", decimal256(3, 2)), + field("hash_sum", decimal128(38, 2)), + field("hash_sum", decimal256(76, 2)), field("hash_mean", decimal128(3, 2)), field("hash_mean", decimal256(3, 2)), field("hash_product", decimal128(3, 2)), diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index 89a94d4a162..28e3eb0e04f 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -21,6 +21,7 @@ #include #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/hash_join.h" #include "arrow/acero/hash_join_dict.h" #include "arrow/acero/hash_join_node.h" @@ -45,6 +46,24 @@ using compute::KeyColumnArray; namespace acero { +namespace { + +Status ValidateHashJoinNodeOptions(const HashJoinNodeOptions& join_options) { + if (join_options.key_cmp.empty() || join_options.left_keys.empty() || + join_options.right_keys.empty()) { + return Status::Invalid("key_cmp and keys cannot be empty"); + } + + if ((join_options.key_cmp.size() != join_options.left_keys.size()) || + (join_options.key_cmp.size() != join_options.right_keys.size())) { + return Status::Invalid("key_cmp and keys must have the same size"); + } + + return Status::OK(); +} + +} // namespace + // Check if a type is supported in a join (as either a key or non-key column) bool HashJoinSchema::IsTypeSupported(const DataType& 
type) { const Type::type id = type.id(); @@ -468,20 +487,6 @@ Status HashJoinSchema::CollectFilterColumns(std::vector& left_filter, return Status::OK(); } -Status ValidateHashJoinNodeOptions(const HashJoinNodeOptions& join_options) { - if (join_options.key_cmp.empty() || join_options.left_keys.empty() || - join_options.right_keys.empty()) { - return Status::Invalid("key_cmp and keys cannot be empty"); - } - - if ((join_options.key_cmp.size() != join_options.left_keys.size()) || - (join_options.key_cmp.size() != join_options.right_keys.size())) { - return Status::Invalid("key_cmp and keys must have the same size"); - } - - return Status::OK(); -} - class HashJoinNode; // This is a struct encapsulating things related to Bloom filters and pushing them around diff --git a/cpp/src/arrow/acero/meson.build b/cpp/src/arrow/acero/meson.build new file mode 100644 index 00000000000..c7a8bdb4ca6 --- /dev/null +++ b/cpp/src/arrow/acero/meson.build @@ -0,0 +1,157 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +install_headers( + [ + 'accumulation_queue.h', + 'aggregate_node.h', + 'api.h', + 'asof_join_node.h', + 'backpressure_handler.h', + 'benchmark_util.h', + 'bloom_filter.h', + 'exec_plan.h', + 'hash_join_dict.h', + 'hash_join.h', + 'hash_join_node.h', + 'map_node.h', + 'options.h', + 'order_by_impl.h', + 'partition_util.h', + 'query_context.h', + 'schema_util.h', + 'task_util.h', + 'test_nodes.h', + 'time_series_util.h', + 'tpch_node.h', + 'type_fwd.h', + 'util.h', + 'visibility.h', + ], + subdir: 'arrow/acero', +) + +arrow_acero_srcs = [ + 'accumulation_queue.cc', + 'scalar_aggregate_node.cc', + 'groupby_aggregate_node.cc', + 'aggregate_internal.cc', + 'asof_join_node.cc', + 'bloom_filter.cc', + 'exec_plan.cc', + 'fetch_node.cc', + 'filter_node.cc', + 'hash_join.cc', + 'hash_join_dict.cc', + 'hash_join_node.cc', + 'map_node.cc', + 'options.cc', + 'order_by_node.cc', + 'order_by_impl.cc', + 'partition_util.cc', + 'pivot_longer_node.cc', + 'project_node.cc', + 'query_context.cc', + 'sink_node.cc', + 'sorted_merge_node.cc', + 'source_node.cc', + 'swiss_join.cc', + 'task_util.cc', + 'time_series_util.cc', + 'tpch_node.cc', + 'union_node.cc', + 'util.cc', +] + +arrow_acero_lib = library( + 'arrow-acero', + sources: arrow_acero_srcs, + dependencies: [arrow_compute_dep, arrow_dep, threads_dep], + gnu_symbol_visibility: 'inlineshidden', +) + +arrow_acero_dep = declare_dependency( + dependencies: [arrow_compute_dep], + link_with: [arrow_acero_lib], +) +meson.override_dependency('arrow-acero', arrow_acero_dep) + +arrow_acero_testing_sources = ['test_nodes.cc', 'test_util_internal.cc'] + +arrow_acero_tests = { + 'plan-test': {'sources': ['plan_test.cc', 'test_nodes_test.cc']}, + 'source-node-test': {'sources': ['source_node_test.cc']}, + 'fetch-node-test': {'sources': ['fetch_node_test.cc']}, + 'order-by-node-test': {'sources': ['order_by_node_test.cc']}, + 'hash-join-node-test': { + 'sources': ['hash_join_node_test.cc', 'bloom_filter_test.cc'], + }, + 
'pivot-longer-node-test': {'sources': ['pivot_longer_node_test.cc']}, + 'asof-join-node-test': {'sources': ['asof_join_node_test.cc']}, + 'sorted-merge-node-test': {'sources': ['sorted_merge_node_test.cc']}, + 'tpch-node-test': {'sources': ['tpch_node_test.cc']}, + 'union-node-test': {'sources': ['union_node_test.cc']}, + 'aggregate-node-test': {'sources': ['aggregate_node_test.cc']}, + 'util-test': {'sources': ['util_test.cc', 'task_util_test.cc']}, + 'hash-aggregate-test': {'sources': ['hash_aggregate_test.cc']}, + 'test-util-internal-test': {'sources': ['test_util_internal_test.cc']}, +} + +foreach key, val : arrow_acero_tests + exc = executable( + 'arrow-acero-@0@'.format(key), + sources: val['sources'] + arrow_acero_testing_sources, + dependencies: [arrow_acero_dep, arrow_compute_test_dep], + ) + test(key, exc) +endforeach + +arrow_acero_benchmarks = { + 'expression-benchmark': {'sources': ['expression_benchmark.cc']}, + 'filter-benchmark': { + 'sources': ['benchmark_util.cc', 'filter_benchmark.cc'], + }, + 'project-benchmark': { + 'sources': ['benchmark_util.cc', 'project_benchmark.cc'], + }, + 'asof-join-benchmark': {'sources': ['asof_join_benchmark.cc']}, + 'tpch-benchmark': {'sources': ['tpch_benchmark.cc']}, + 'aggregate-benchmark': {'sources': ['aggregate_benchmark.cc']}, + 'hash-join-benchmark': {'sources': ['hash_join_benchmark.cc']}, +} + +foreach key, val : arrow_acero_benchmarks + exc = executable( + key, + sources: val['sources'] + arrow_acero_testing_sources, + dependencies: [ + arrow_acero_dep, + arrow_compute_test_dep, + arrow_benchmark_dep, + gmock_dep, + ], + ) + benchmark(key, exc) +endforeach + +pkg.generate( + arrow_acero_lib, + filebase: 'arrow-acero', + name: 'Apache Arrow Acero Engine', + description: 'Apache Arrow\'s Acero Engine', + requires: ['arrow-compute'], +) diff --git a/cpp/src/arrow/acero/options.h b/cpp/src/arrow/acero/options.h index 26293725582..827e9ea775d 100644 --- a/cpp/src/arrow/acero/options.h +++ 
b/cpp/src/arrow/acero/options.h @@ -700,8 +700,9 @@ class ARROW_ACERO_EXPORT AsofJoinNodeOptions : public ExecNodeOptions { /// \brief "on" key for the join. /// /// The input table must be sorted by the "on" key. Must be a single field of a common - /// type. Inexact match is used on the "on" key. i.e., a row is considered a match iff - /// left_on - tolerance <= right_on <= left_on. + /// type. An inexact match is used on the "on" key, i.e. a row is considered a + /// match if and only if `right.on - left.on` is in the range + /// `[min(0, tolerance), max(0, tolerance)]`. /// Currently, the "on" key must be of an integer, date, or timestamp type. FieldRef on_key; /// \brief "by" key for the join. @@ -723,10 +724,14 @@ class ARROW_ACERO_EXPORT AsofJoinNodeOptions : public ExecNodeOptions { /// \see `Keys` for details. std::vector input_keys; /// \brief Tolerance for inexact "on" key matching. A right row is considered a match - /// with the left row if `right.on - left.on <= tolerance`. The `tolerance` may be: - /// - negative, in which case a past-as-of-join occurs; - /// - or positive, in which case a future-as-of-join occurs; - /// - or zero, in which case an exact-as-of-join occurs. + /// with a left row if `right.on - left.on` is in the range + /// `[min(0, tolerance), max(0, tolerance)]`. `tolerance` may be: + /// - negative, in which case a past-as-of-join occurs (match iff + /// `tolerance <= right.on - left.on <= 0`); + /// - or positive, in which case a future-as-of-join occurs (match iff + /// `0 <= right.on - left.on <= tolerance`); + /// - or zero, in which case an exact-as-of-join occurs (match iff + /// `right.on == left.on`). /// /// The tolerance is interpreted in the same units as the "on" key. 
int64_t tolerance; diff --git a/cpp/src/arrow/acero/order_by_node.cc b/cpp/src/arrow/acero/order_by_node.cc index 65aa83247f8..213730e6f9a 100644 --- a/cpp/src/arrow/acero/order_by_node.cc +++ b/cpp/src/arrow/acero/order_by_node.cc @@ -23,6 +23,7 @@ #include #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/acero/query_context.h" #include "arrow/acero/util.h" diff --git a/cpp/src/arrow/acero/partition_util.h b/cpp/src/arrow/acero/partition_util.h index d02e9cb03f0..52cc47bb8a9 100644 --- a/cpp/src/arrow/acero/partition_util.h +++ b/cpp/src/arrow/acero/partition_util.h @@ -22,6 +22,7 @@ #include #include #include + #include "arrow/acero/util.h" #include "arrow/buffer.h" #include "arrow/util/pcg_random.h" diff --git a/cpp/src/arrow/acero/pch.h b/cpp/src/arrow/acero/pch.h deleted file mode 100644 index ddb4c120f2a..00000000000 --- a/cpp/src/arrow/acero/pch.h +++ /dev/null @@ -1,23 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Often-used headers, for precompiling. -// If updating this header, please make sure you check compilation speed -// before checking in. 
Adding headers which are not used extremely often -// may incur a slowdown, since it makes the precompiled header heavier to load. - -#include "arrow/pch.h" diff --git a/cpp/src/arrow/acero/pivot_longer_node.cc b/cpp/src/arrow/acero/pivot_longer_node.cc index f261a9c402c..c8f2a5c7b06 100644 --- a/cpp/src/arrow/acero/pivot_longer_node.cc +++ b/cpp/src/arrow/acero/pivot_longer_node.cc @@ -23,6 +23,7 @@ #include #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/acero/util.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/acero/project_node.cc b/cpp/src/arrow/acero/project_node.cc index 98b11cff004..188a2708835 100644 --- a/cpp/src/arrow/acero/project_node.cc +++ b/cpp/src/arrow/acero/project_node.cc @@ -18,6 +18,7 @@ #include #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/map_node.h" #include "arrow/acero/options.h" #include "arrow/acero/query_context.h" diff --git a/cpp/src/arrow/acero/sink_node.cc b/cpp/src/arrow/acero/sink_node.cc index ab06dd8ffd8..0efb365a51a 100644 --- a/cpp/src/arrow/acero/sink_node.cc +++ b/cpp/src/arrow/acero/sink_node.cc @@ -23,6 +23,7 @@ #include "arrow/acero/accumulation_queue.h" #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/acero/order_by_impl.h" #include "arrow/acero/query_context.h" diff --git a/cpp/src/arrow/acero/sorted_merge_node.cc b/cpp/src/arrow/acero/sorted_merge_node.cc index 374e672a849..43f5b7b930a 100644 --- a/cpp/src/arrow/acero/sorted_merge_node.cc +++ b/cpp/src/arrow/acero/sorted_merge_node.cc @@ -23,8 +23,10 @@ #include #include #include + #include "arrow/acero/concurrent_queue_internal.h" #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/acero/query_context.h" #include "arrow/acero/time_series_util.h" @@ -117,7 +119,7 @@ 
class InputState { std::unique_ptr backpressure_control = std::make_unique(input, output, backpressure_counter); ARROW_ASSIGN_OR_RAISE(auto handler, - BackpressureHandler::Make(input, low_threshold, high_threshold, + BackpressureHandler::Make(low_threshold, high_threshold, std::move(backpressure_control))); return PtrType(new InputState(index, std::move(handler), schema, time_col_index)); } diff --git a/cpp/src/arrow/acero/source_node.cc b/cpp/src/arrow/acero/source_node.cc index 0f58406760c..888f6e23c13 100644 --- a/cpp/src/arrow/acero/source_node.cc +++ b/cpp/src/arrow/acero/source_node.cc @@ -20,6 +20,7 @@ #include #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/acero/query_context.h" #include "arrow/acero/util.h" diff --git a/cpp/src/arrow/acero/test_util_internal.cc b/cpp/src/arrow/acero/test_util_internal.cc index 2748d4107ed..3e279175810 100644 --- a/cpp/src/arrow/acero/test_util_internal.cc +++ b/cpp/src/arrow/acero/test_util_internal.cc @@ -613,5 +613,31 @@ Result> MakeRandomTimeSeriesTable( return Table::Make(schema, columns, num_rows); } +Result> RunEndEncodeTableColumns( + const Table& table, const std::vector& column_indices) { + const int num_columns = table.num_columns(); + std::vector> encoded_columns; + encoded_columns.reserve(num_columns); + std::vector> encoded_fields; + encoded_fields.reserve(num_columns); + for (int i = 0; i < num_columns; i++) { + const auto& field = table.schema()->field(i); + if (std::find(column_indices.begin(), column_indices.end(), i) != + column_indices.end()) { + ARROW_ASSIGN_OR_RAISE(auto run_end_encoded, + arrow::compute::RunEndEncode(table.column(i))); + ARROW_DCHECK_EQ(run_end_encoded.kind(), Datum::CHUNKED_ARRAY); + encoded_columns.push_back(run_end_encoded.chunked_array()); + auto encoded_type = arrow::run_end_encoded(arrow::int32(), field->type()); + encoded_fields.push_back(field->WithType(encoded_type)); + } else { + 
encoded_columns.push_back(table.column(i)); + encoded_fields.push_back(field); + } + } + auto updated_schema = arrow::schema(std::move(encoded_fields)); + return Table::Make(std::move(updated_schema), std::move(encoded_columns)); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/test_util_internal.h b/cpp/src/arrow/acero/test_util_internal.h index 2367524a560..e8ff9103cd6 100644 --- a/cpp/src/arrow/acero/test_util_internal.h +++ b/cpp/src/arrow/acero/test_util_internal.h @@ -30,6 +30,7 @@ #include "arrow/acero/exec_plan.h" #include "arrow/compute/exec.h" #include "arrow/compute/kernel.h" +#include "arrow/table.h" #include "arrow/testing/visibility.h" #include "arrow/util/async_generator.h" #include "arrow/util/pcg_random.h" @@ -113,23 +114,14 @@ Result>> ToArrayVectors( const BatchesWithSchema& batches_with_schema); Result>> ToExecBatches( - const BatchesWithSchema& batches); + const BatchesWithSchema& batches_with_schema); Result>> ToRecordBatches( - const BatchesWithSchema& batches); - -Result> ToRecordBatchReader( const BatchesWithSchema& batches_with_schema); -Result>> ToArrayVectors( +Result> ToRecordBatchReader( const BatchesWithSchema& batches_with_schema); -Result>> ToExecBatches( - const BatchesWithSchema& batches); - -Result>> ToRecordBatches( - const BatchesWithSchema& batches); - Result> SortTableOnAllFields(const std::shared_ptr& tab); void AssertTablesEqualIgnoringOrder(const std::shared_ptr
& exp, @@ -195,4 +187,7 @@ struct TableGenerationProperties { Result> MakeRandomTimeSeriesTable( const TableGenerationProperties& properties); +Result> RunEndEncodeTableColumns( + const Table& table, const std::vector& column_indices); + } // namespace arrow::acero diff --git a/cpp/src/arrow/acero/test_util_internal_test.cc b/cpp/src/arrow/acero/test_util_internal_test.cc new file mode 100644 index 00000000000..7a42b9d172d --- /dev/null +++ b/cpp/src/arrow/acero/test_util_internal_test.cc @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/acero/test_util_internal.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow::acero { + +TEST(RunEndEncodeTableColumnsTest, SchemaTypeIsModified) { + std::shared_ptr
table = + arrow::TableFromJSON(arrow::schema({arrow::field("col", arrow::utf8())}), {R"([ + {"col": "a"}, + {"col": "b"}, + {"col": "c"}, + {"col": "d"} + ])"}); + ASSERT_OK_AND_ASSIGN(std::shared_ptr
ree_table, + RunEndEncodeTableColumns(*table, {0})); + ASSERT_OK(ree_table->ValidateFull()); + ASSERT_TRUE(ree_table->schema()->field(0)->type()->Equals( + arrow::run_end_encoded(arrow::int32(), arrow::utf8()))); +} +} // namespace arrow::acero diff --git a/cpp/src/arrow/acero/union_node.cc b/cpp/src/arrow/acero/union_node.cc index 9b1211e9d1d..47699b48745 100644 --- a/cpp/src/arrow/acero/union_node.cc +++ b/cpp/src/arrow/acero/union_node.cc @@ -18,6 +18,7 @@ #include #include "arrow/acero/exec_plan.h" +#include "arrow/acero/exec_plan_internal.h" #include "arrow/acero/options.h" #include "arrow/acero/util.h" #include "arrow/compute/api.h" diff --git a/cpp/src/arrow/acero/util_test.cc b/cpp/src/arrow/acero/util_test.cc index d86577d3584..557574a40c9 100644 --- a/cpp/src/arrow/acero/util_test.cc +++ b/cpp/src/arrow/acero/util_test.cc @@ -263,8 +263,7 @@ class TestBackpressureControl : public BackpressureControl { TEST(BackpressureConcurrentQueue, BasicTest) { BackpressureTestExecNode dummy_node; auto ctrl = std::make_unique(&dummy_node); - ASSERT_OK_AND_ASSIGN(auto handler, - BackpressureHandler::Make(&dummy_node, 2, 4, std::move(ctrl))); + ASSERT_OK_AND_ASSIGN(auto handler, BackpressureHandler::Make(2, 4, std::move(ctrl))); BackpressureConcurrentQueue queue(std::move(handler)); ConcurrentQueueBasicTest(queue); @@ -275,8 +274,7 @@ TEST(BackpressureConcurrentQueue, BasicTest) { TEST(BackpressureConcurrentQueue, BackpressureTest) { BackpressureTestExecNode dummy_node; auto ctrl = std::make_unique(&dummy_node); - ASSERT_OK_AND_ASSIGN(auto handler, - BackpressureHandler::Make(&dummy_node, 2, 4, std::move(ctrl))); + ASSERT_OK_AND_ASSIGN(auto handler, BackpressureHandler::Make(2, 4, std::move(ctrl))); BackpressureConcurrentQueue queue(std::move(handler)); queue.Push(6); @@ -299,9 +297,28 @@ TEST(BackpressureConcurrentQueue, BackpressureTest) { queue.Push(11); ASSERT_TRUE(dummy_node.paused); ASSERT_FALSE(dummy_node.stopped); - ASSERT_OK(queue.ForceShutdown()); + 
queue.ForceShutdown(); + ASSERT_FALSE(dummy_node.paused); +} + +TEST(BackpressureConcurrentQueue, BackpressureTestStayUnpaused) { + BackpressureTestExecNode dummy_node; + auto ctrl = std::make_unique(&dummy_node); + ASSERT_OK_AND_ASSIGN( + auto handler, BackpressureHandler::Make(/*low_threshold=*/2, /*high_threshold=*/4, + std::move(ctrl))); + BackpressureConcurrentQueue queue(std::move(handler)); + + queue.Push(6); + queue.Push(7); + queue.Push(8); + ASSERT_FALSE(dummy_node.paused); + ASSERT_FALSE(dummy_node.stopped); + queue.ForceShutdown(); + for (int i = 0; i < 10; ++i) { + queue.Push(i); + } ASSERT_FALSE(dummy_node.paused); - ASSERT_TRUE(dummy_node.stopped); } } // namespace acero diff --git a/cpp/src/arrow/adapters/orc/CMakeLists.txt b/cpp/src/arrow/adapters/orc/CMakeLists.txt index 14fb8e681d1..bae63210b29 100644 --- a/cpp/src/arrow/adapters/orc/CMakeLists.txt +++ b/cpp/src/arrow/adapters/orc/CMakeLists.txt @@ -38,5 +38,4 @@ add_arrow_test(adapter_test orc::orc ${ARROW_ORC_STATIC_LINK_LIBS}) -set_source_files_properties(adapter_test.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON - SKIP_UNITY_BUILD_INCLUSION ON) +set_source_files_properties(adapter_test.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index b3c314fccc0..714e61b22b1 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -642,6 +642,9 @@ TEST(TestAdapterReadWrite, ThrowWhenTZDBUnavaiable) { if (adapters::orc::GetOrcMajorVersion() >= 2) { GTEST_SKIP() << "Only ORC pre-2.0.0 versions have the time zone database check"; } +#ifdef _WIN32 + GTEST_SKIP() << "GH-47489: Expected error is not thrown on Windows"; +#endif EnvVarGuard tzdir_guard("TZDIR", "/wrong/path"); const char* expect_str = "IANA time zone database is unavailable but required by ORC"; diff --git a/cpp/src/arrow/adapters/tensorflow/meson.build b/cpp/src/arrow/adapters/tensorflow/meson.build new 
file mode 100644 index 00000000000..299b00c998f --- /dev/null +++ b/cpp/src/arrow/adapters/tensorflow/meson.build @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers(['convert.h'], subdir: 'arrow/adapters/tensorflow') + +arrow_tensorflow_dep = declare_dependency( + include_directories: include_directories('.'), + dependencies: arrow_dep, +) +meson.override_dependency('arrow-tensorflow', arrow_tensorflow_dep) + +pkg.generate( + filebase: 'arrow-tensorflow', + name: 'Apache Arrow Tensorflow', + description: 'Tensorflow modules for Apache Arrow', + requires: ['arrow'], +) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index fa77f4ff4ed..60df45357e5 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -123,6 +123,8 @@ class ARROW_EXPORT Array { const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } /// Equality comparison with another array + /// + /// Note that arrow::ArrayStatistics is not included in the comparison. 
bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; bool Equals(const std::shared_ptr& arr, const EqualOptions& = EqualOptions::Defaults()) const; @@ -134,6 +136,8 @@ class ARROW_EXPORT Array { /// Approximate equality comparison with another array /// /// epsilon is only used if this is FloatArray or DoubleArray + /// + /// Note that arrow::ArrayStatistics is not included in the comparison. bool ApproxEquals(const std::shared_ptr& arr, const EqualOptions& = EqualOptions::Defaults()) const; bool ApproxEquals(const Array& arr, @@ -141,6 +145,8 @@ class ARROW_EXPORT Array { /// Compare if the range of slots specified are equal for the given array and /// this array. end_idx exclusive. This methods does not bounds check. + /// + /// Note that arrow::ArrayStatistics is not included in the comparison. bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const Array& other, const EqualOptions& = EqualOptions::Defaults()) const; @@ -259,10 +265,10 @@ class ARROW_EXPORT Array { private: ARROW_DISALLOW_COPY_AND_ASSIGN(Array); - - ARROW_FRIEND_EXPORT friend void PrintTo(const Array& x, std::ostream* os); }; +ARROW_EXPORT void PrintTo(const Array& x, std::ostream* os); + static inline std::ostream& operator<<(std::ostream& os, const Array& x) { os << x.ToString(); return os; diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 226f5fc4649..8406bd1d8ed 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -982,11 +982,7 @@ TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } -#ifndef ARROW_LARGE_MEMORY_TESTS -TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck(); } -#else TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } -#endif class TestListConversions : public ::testing::Test { 
private: diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 44ccf9687b7..b40f14a5547 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -38,6 +38,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_dict.h" +#include "arrow/array/builder_primitive.h" #include "arrow/array/builder_run_end.h" #include "arrow/array/builder_time.h" #include "arrow/array/data.h" @@ -60,6 +61,7 @@ #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" #include "arrow/util/range.h" @@ -72,6 +74,7 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; +using util::Float16; class TestArray : public ::testing::Test { public: @@ -815,6 +818,23 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) { AssertAppendScalar(pool_, std::make_shared(scalar)); } +TEST_F(TestArray, TestMakeArrayFromScalarSmallintExtensionType) { + auto ext_type = std::make_shared(); + auto storage_scalar = std::make_shared(42); + auto ext_scalar = std::make_shared(storage_scalar, ext_type); + + ASSERT_OK_AND_ASSIGN(auto arr, MakeArrayFromScalar(*ext_scalar, 3)); + ASSERT_EQ(arr->type()->id(), Type::EXTENSION); + ASSERT_EQ(arr->length(), 3); + + auto ext_arr = std::static_pointer_cast(arr); + ASSERT_EQ(ext_arr->storage()->type_id(), Type::INT16); + auto int_arr = std::static_pointer_cast(ext_arr->storage()); + for (int i = 0; i < 3; ++i) { + ASSERT_EQ(int_arr->Value(i), 42); + } +} + void CheckSpanRoundTrip(const Array& array) { ArraySpan span; span.SetMembers(*array.data()); @@ -2051,6 +2071,24 @@ void CheckApproxEquals() { ASSERT_FALSE(b->ApproxEquals(a, EqualOptions().nans_equal(true))); } +template +void CheckFloatApproxEqualsWithAtol() { + using c_type = typename TYPE::c_type; + auto type = 
TypeTraits::type_singleton(); + std::shared_ptr a, b; + ArrayFromVector(type, {true}, {static_cast(0.5)}, &a); + ArrayFromVector(type, {true}, {static_cast(0.6)}, &b); + auto options = EqualOptions::Defaults().atol(0.2); + + ASSERT_FALSE(a->Equals(b)); + ASSERT_TRUE(a->Equals(b, options.use_atol(true))); + ASSERT_TRUE(a->ApproxEquals(b, options)); + + ASSERT_FALSE(a->RangeEquals(0, 1, 0, b, options)); + ASSERT_TRUE(a->RangeEquals(0, 1, 0, b, options.use_atol(true))); + ASSERT_TRUE(ArrayRangeApproxEquals(*a, *b, 0, 1, 0, options)); +} + template void CheckSliceApproxEquals() { using T = typename TYPE::c_type; @@ -2082,16 +2120,21 @@ void CheckSliceApproxEquals() { ASSERT_TRUE(slice1->ApproxEquals(slice2)); } +template +using NumericArgType = std::conditional_t::value, Float16, + typename ArrowType::c_type>; + template void CheckFloatingNanEquality() { + using V = NumericArgType; std::shared_ptr a, b; std::shared_ptr type = TypeTraits::type_singleton(); - const auto nan_value = static_cast(NAN); + const auto nan_value = std::numeric_limits::quiet_NaN(); // NaN in a null entry - ArrayFromVector(type, {true, false}, {0.5, nan_value}, &a); - ArrayFromVector(type, {true, false}, {0.5, nan_value}, &b); + ArrayFromVector(type, {true, false}, {V(0.5), nan_value}, &a); + ArrayFromVector(type, {true, false}, {V(0.5), nan_value}, &b); ASSERT_TRUE(a->Equals(b)); ASSERT_TRUE(b->Equals(a)); ASSERT_TRUE(a->ApproxEquals(b)); @@ -2102,8 +2145,8 @@ void CheckFloatingNanEquality() { ASSERT_TRUE(b->RangeEquals(a, 1, 2, 1)); // NaN in a valid entry - ArrayFromVector(type, {false, true}, {0.5, nan_value}, &a); - ArrayFromVector(type, {false, true}, {0.5, nan_value}, &b); + ArrayFromVector(type, {false, true}, {V(0.5), nan_value}, &a); + ArrayFromVector(type, {false, true}, {V(0.5), nan_value}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_TRUE(a->Equals(b, EqualOptions().nans_equal(true))); @@ -2122,8 +2165,8 @@ void CheckFloatingNanEquality() { 
ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0)); // NaN != non-NaN - ArrayFromVector(type, {false, true}, {0.5, nan_value}, &a); - ArrayFromVector(type, {false, true}, {0.5, 0.0}, &b); + ArrayFromVector(type, {false, true}, {V(0.5), nan_value}, &a); + ArrayFromVector(type, {false, true}, {V(0.5), V(0.0)}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_FALSE(a->Equals(b, EqualOptions().nans_equal(true))); @@ -2144,15 +2187,16 @@ void CheckFloatingNanEquality() { template void CheckFloatingInfinityEquality() { + using V = NumericArgType; std::shared_ptr a, b; std::shared_ptr type = TypeTraits::type_singleton(); - const auto infinity = std::numeric_limits::infinity(); + const auto infinity = std::numeric_limits::infinity(); for (auto nans_equal : {false, true}) { // Infinity in a null entry - ArrayFromVector(type, {true, false}, {0.5, infinity}, &a); - ArrayFromVector(type, {true, false}, {0.5, -infinity}, &b); + ArrayFromVector(type, {true, false}, {V(0.5), infinity}, &a); + ArrayFromVector(type, {true, false}, {V(0.5), -infinity}, &b); ASSERT_TRUE(a->Equals(b)); ASSERT_TRUE(b->Equals(a)); ASSERT_TRUE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); @@ -2163,8 +2207,8 @@ void CheckFloatingInfinityEquality() { ASSERT_TRUE(b->RangeEquals(a, 1, 2, 1)); // Infinity in a valid entry - ArrayFromVector(type, {false, true}, {0.5, infinity}, &a); - ArrayFromVector(type, {false, true}, {0.5, infinity}, &b); + ArrayFromVector(type, {false, true}, {V(0.5), infinity}, &a); + ArrayFromVector(type, {false, true}, {V(0.5), infinity}, &b); ASSERT_TRUE(a->Equals(b)); ASSERT_TRUE(b->Equals(a)); ASSERT_TRUE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); @@ -2181,8 +2225,8 @@ void CheckFloatingInfinityEquality() { ASSERT_TRUE(b->RangeEquals(a, 0, 1, 0)); // Infinity != non-infinity - ArrayFromVector(type, {false, true}, {0.5, -infinity}, &a); - ArrayFromVector(type, {false, true}, {0.5, 0.0}, &b); + ArrayFromVector(type, {false, 
true}, {V(0.5), -infinity}, &a); + ArrayFromVector(type, {false, true}, {V(0.5), V(0.0)}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_FALSE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); @@ -2190,8 +2234,8 @@ void CheckFloatingInfinityEquality() { ASSERT_FALSE(a->ApproxEquals(b, EqualOptions().atol(1e-5).nans_equal(nans_equal))); ASSERT_FALSE(b->ApproxEquals(a, EqualOptions().atol(1e-5).nans_equal(nans_equal))); // Infinity != Negative infinity - ArrayFromVector(type, {true, true}, {0.5, -infinity}, &a); - ArrayFromVector(type, {true, true}, {0.5, infinity}, &b); + ArrayFromVector(type, {true, true}, {V(0.5), -infinity}, &a); + ArrayFromVector(type, {true, true}, {V(0.5), infinity}, &b); ASSERT_FALSE(a->Equals(b)); ASSERT_FALSE(b->Equals(a)); ASSERT_FALSE(a->ApproxEquals(b)); @@ -2211,11 +2255,12 @@ void CheckFloatingInfinityEquality() { template void CheckFloatingZeroEquality() { + using V = NumericArgType; std::shared_ptr a, b; std::shared_ptr type = TypeTraits::type_singleton(); - ArrayFromVector(type, {true, false}, {0.0, 1.0}, &a); - ArrayFromVector(type, {true, false}, {0.0, 1.0}, &b); + ArrayFromVector(type, {true, false}, {V(0.0), V(1.0)}, &a); + ArrayFromVector(type, {true, false}, {V(0.0), V(1.0)}, &b); ASSERT_TRUE(a->Equals(b)); ASSERT_TRUE(b->Equals(a)); for (auto nans_equal : {false, true}) { @@ -2231,8 +2276,8 @@ void CheckFloatingZeroEquality() { } } - ArrayFromVector(type, {true, false}, {0.0, 1.0}, &a); - ArrayFromVector(type, {true, false}, {-0.0, 1.0}, &b); + ArrayFromVector(type, {true, false}, {V(0.0), V(1.0)}, &a); + ArrayFromVector(type, {true, false}, {V(-0.0), V(1.0)}, &b); for (auto nans_equal : {false, true}) { auto opts = EqualOptions().nans_equal(nans_equal); ASSERT_TRUE(a->Equals(b, opts)); @@ -2255,6 +2300,11 @@ TEST(TestPrimitiveAdHoc, FloatingApproxEquals) { CheckApproxEquals(); } +TEST(TestPrimitiveAdHoc, FloatingApproxEqualsWithAtol) { + CheckFloatApproxEqualsWithAtol(); + 
CheckFloatApproxEqualsWithAtol(); +} + TEST(TestPrimitiveAdHoc, FloatingSliceApproxEquals) { CheckSliceApproxEquals(); CheckSliceApproxEquals(); @@ -2263,16 +2313,19 @@ TEST(TestPrimitiveAdHoc, FloatingSliceApproxEquals) { TEST(TestPrimitiveAdHoc, FloatingNanEquality) { CheckFloatingNanEquality(); CheckFloatingNanEquality(); + CheckFloatingNanEquality(); } TEST(TestPrimitiveAdHoc, FloatingInfinityEquality) { CheckFloatingInfinityEquality(); CheckFloatingInfinityEquality(); + CheckFloatingInfinityEquality(); } TEST(TestPrimitiveAdHoc, FloatingZeroEquality) { CheckFloatingZeroEquality(); CheckFloatingZeroEquality(); + CheckFloatingZeroEquality(); } // ---------------------------------------------------------------------- @@ -3857,6 +3910,9 @@ class TestArrayDataStatistics : public ::testing::Test { void SetUp() { valids_ = {1, 0, 1, 1}; null_count_ = std::count(valids_.begin(), valids_.end(), 0); + distinct_count_ = 3.0; + max_byte_width_ = 4.0; + average_byte_width_ = 4.0; null_buffer_ = *internal::BytesToBits(valids_); values_ = {1, 0, 3, -4}; min_ = *std::min_element(values_.begin(), values_.end()); @@ -3866,6 +3922,10 @@ class TestArrayDataStatistics : public ::testing::Test { null_count_); data_->statistics = std::make_shared(); data_->statistics->null_count = null_count_; + data_->statistics->distinct_count = distinct_count_; + data_->statistics->max_byte_width = max_byte_width_; + data_->statistics->average_byte_width = average_byte_width_; + data_->statistics->is_average_byte_width_exact = true; data_->statistics->min = min_; data_->statistics->is_min_exact = true; data_->statistics->max = max_; @@ -3875,6 +3935,9 @@ class TestArrayDataStatistics : public ::testing::Test { protected: std::vector valids_; size_t null_count_; + double distinct_count_; + double max_byte_width_; + double average_byte_width_; std::shared_ptr null_buffer_; std::vector values_; int64_t min_; @@ -3890,6 +3953,19 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) { 
ASSERT_TRUE(moved_data.statistics->null_count.has_value()); ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); + ASSERT_TRUE(moved_data.statistics->distinct_count.has_value()); + ASSERT_DOUBLE_EQ(distinct_count_, + std::get(moved_data.statistics->distinct_count.value())); + + ASSERT_TRUE(moved_data.statistics->max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(max_byte_width_, + std::get(moved_data.statistics->max_byte_width.value())); + + ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(average_byte_width_, + moved_data.statistics->average_byte_width.value()); + ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact); + ASSERT_TRUE(moved_data.statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(moved_data.statistics->min.value())); ASSERT_EQ(min_, std::get(moved_data.statistics->min.value())); @@ -3907,6 +3983,19 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) { ASSERT_TRUE(copied_data.statistics->null_count.has_value()); ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); + ASSERT_TRUE(copied_data.statistics->distinct_count.has_value()); + ASSERT_DOUBLE_EQ(distinct_count_, + std::get(copied_data.statistics->distinct_count.value())); + + ASSERT_TRUE(copied_data.statistics->max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(max_byte_width_, + std::get(copied_data.statistics->max_byte_width.value())); + + ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(average_byte_width_, + copied_data.statistics->average_byte_width.value()); + ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact); + ASSERT_TRUE(copied_data.statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(copied_data.statistics->min.value())); ASSERT_EQ(min_, std::get(copied_data.statistics->min.value())); @@ -3926,6 +4015,19 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) { ASSERT_TRUE(moved_data.statistics->null_count.has_value()); ASSERT_EQ(null_count_, 
moved_data.statistics->null_count.value()); + ASSERT_TRUE(moved_data.statistics->distinct_count.has_value()); + ASSERT_DOUBLE_EQ(distinct_count_, + std::get(moved_data.statistics->distinct_count.value())); + + ASSERT_TRUE(moved_data.statistics->max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(max_byte_width_, + std::get(moved_data.statistics->max_byte_width.value())); + + ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(average_byte_width_, + moved_data.statistics->average_byte_width.value()); + ASSERT_TRUE(moved_data.statistics->is_average_byte_width_exact); + ASSERT_TRUE(moved_data.statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(moved_data.statistics->min.value())); ASSERT_EQ(min_, std::get(moved_data.statistics->min.value())); @@ -3944,6 +4046,19 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) { ASSERT_TRUE(copied_data.statistics->null_count.has_value()); ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); + ASSERT_TRUE(copied_data.statistics->distinct_count.has_value()); + ASSERT_DOUBLE_EQ(distinct_count_, + std::get(copied_data.statistics->distinct_count.value())); + + ASSERT_TRUE(copied_data.statistics->max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(max_byte_width_, + std::get(copied_data.statistics->max_byte_width.value())); + + ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(average_byte_width_, + copied_data.statistics->average_byte_width.value()); + ASSERT_TRUE(copied_data.statistics->is_average_byte_width_exact); + ASSERT_TRUE(copied_data.statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(copied_data.statistics->min.value())); ASSERT_EQ(min_, std::get(copied_data.statistics->min.value())); @@ -4035,4 +4150,73 @@ TYPED_TEST(TestPrimitiveArray, IndexOperator) { } } +class TestHalfFloatBuilder : public ::testing::Test { + public: + void VerifyValue(const HalfFloatBuilder& builder, int64_t index, float expected) { + 
ASSERT_EQ(builder.GetValue(index), Float16(expected).bits()); + ASSERT_EQ(builder.GetValue(index), Float16(expected)); + ASSERT_EQ(builder.GetValue(index), Float16(expected).bits()); + ASSERT_EQ(builder[index], Float16(expected).bits()); + } +}; + +TEST_F(TestHalfFloatBuilder, TestAppend) { + HalfFloatBuilder builder; + ASSERT_OK(builder.Append(Float16(0.0f))); + ASSERT_OK(builder.Append(Float16(1.0f).bits())); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Reserve(3)); + builder.UnsafeAppend(Float16(3.0f)); + builder.UnsafeAppend(Float16(4.0f).bits()); + builder.UnsafeAppend(uint16_t{15872}); // 1.5f + + VerifyValue(builder, 0, 0.0f); + VerifyValue(builder, 1, 1.0f); + VerifyValue(builder, 3, 3.0f); + VerifyValue(builder, 4, 4.0f); + VerifyValue(builder, 5, 1.5f); +} + +TEST_F(TestHalfFloatBuilder, TestBulkAppend) { + HalfFloatBuilder builder; + + ASSERT_OK(builder.AppendValues(5, Float16(1.5))); + uint16_t val = Float16(2.0f).bits(); + ASSERT_OK(builder.AppendValues({val, val, val, val}, {0, 1, 0, 1})); + ASSERT_EQ(builder.length(), 9); + for (int i = 0; i < 5; i++) { + VerifyValue(builder, i, 1.5f); + } + + { + ASSERT_OK_AND_ASSIGN(auto array, builder.Finish()); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->null_count(), 2); + ASSERT_EQ(array->length(), 9); + auto comp = ArrayFromJSON(float16(), "[1.5,1.5,1.5,1.5,1.5,null,2,null,2]"); + AssertArraysEqual(*array, *comp); + } + + std::vector vals = {Float16(1.0f), Float16(2.0f), Float16(3.0f)}; + std::vector is_valid = {true, false, true}; + std::vector valid_bytes = {1, 0, 1}; + std::vector bitmap = {0b00000101}; + ASSERT_OK(builder.AppendValues(vals)); + ASSERT_OK(builder.AppendValues(vals, is_valid)); + ASSERT_OK(builder.AppendValues(vals.data(), vals.size(), is_valid)); + ASSERT_OK(builder.AppendValues(vals.data(), vals.size())); + ASSERT_OK(builder.AppendValues(vals.data(), vals.size(), valid_bytes.data())); + ASSERT_OK(builder.AppendValues(vals.data(), vals.size(), bitmap.data(), 0)); + + { 
+ ASSERT_OK_AND_ASSIGN(auto array, builder.Finish()); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->null_count(), 4); + ASSERT_EQ(array->length(), 18); + auto comp = + ArrayFromJSON(float16(), "[1,2,3,1,null,3,1,null,3,1,2,3,1,null,3,1,null,3]"); + AssertArraysEqual(*array, *comp); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 70561b5738a..19c4c2d523f 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -34,12 +34,14 @@ #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/logging_internal.h" #include "arrow/visit_data_inline.h" namespace arrow { using internal::checked_cast; +using internal::MultiplyWithOverflow; // ---------------------------------------------------------------------- // Binary/StringView @@ -162,7 +164,13 @@ void FixedSizeBinaryBuilder::Reset() { Status FixedSizeBinaryBuilder::Resize(int64_t capacity) { RETURN_NOT_OK(CheckCapacity(capacity)); - RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_)); + int64_t dest_capacity_bytes; + if (ARROW_PREDICT_FALSE( + MultiplyWithOverflow(capacity, byte_width_, &dest_capacity_bytes))) { + return Status::CapacityError("Resize: capacity overflows (requested: ", capacity, + ", byte_width: ", byte_width_, ")"); + } + RETURN_NOT_OK(byte_builder_.Resize(dest_capacity_bytes)); return ArrayBuilder::Resize(capacity); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 442e4a26320..d0e761ae968 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -855,11 +855,41 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { /// This pointer becomes invalid on the next modifying operation. const uint8_t* GetValue(int64_t i) const; - /// Temporary access to a value. 
+ /// Temporary mutable access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + uint8_t* GetMutableValue(int64_t i) { + uint8_t* data_ptr = byte_builder_.mutable_data(); + return data_ptr + i * byte_width_; + } + + /// Temporary mutable access to a value. /// /// This view becomes invalid on the next modifying operation. std::string_view GetView(int64_t i) const; + /// Advance builder without allocating nor writing any values + /// + /// The internal pointer is advanced by `length` values and the same number + /// of non-null entries are appended to the validity bitmap. + /// This method assumes that the `length` values were populated directly, + /// for example using `GetMutableValue`. + void UnsafeAdvance(int64_t length) { + byte_builder_.UnsafeAdvance(length * byte_width_); + UnsafeAppendToBitmap(length, true); + } + + /// Advance builder without allocating nor writing any values + /// + /// The internal pointer is advanced by `length` values and the same number + /// of validity bits are appended to the validity bitmap. + /// This method assumes that the `length` values were populated directly, + /// for example using `GetMutableValue`. + void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) { + byte_builder_.UnsafeAdvance(length * byte_width_); + UnsafeAppendToBitmap(validity, valid_bits_offset, length); + } + static constexpr int64_t memory_limit() { return std::numeric_limits::max() - 1; } @@ -872,14 +902,6 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { int32_t byte_width_; BufferBuilder byte_builder_; - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. 
- uint8_t* GetMutableValue(int64_t i) { - uint8_t* data_ptr = byte_builder_.mutable_data(); - return data_ptr + i * byte_width_; - } - void CheckValueSize(int64_t size); }; diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index d483e5ae08d..915fbfbf895 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -48,7 +48,7 @@ template class BaseListViewBuilder; // MapBuilder MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, - std::shared_ptr const& item_builder, + const std::shared_ptr& item_builder, const std::shared_ptr& type) : ArrayBuilder(pool), key_builder_(key_builder), item_builder_(item_builder) { auto map_type = internal::checked_cast(type.get()); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index d0e5b6d3c0e..fdbeb0cd7d1 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -51,7 +51,7 @@ class VarLengthListLikeBuilder : public ArrayBuilder { /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. VarLengthListLikeBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, + const std::shared_ptr& value_builder, const std::shared_ptr& type, int64_t alignment = kDefaultBufferAlignment) : ArrayBuilder(pool, alignment), @@ -60,7 +60,7 @@ class VarLengthListLikeBuilder : public ArrayBuilder { value_field_(type->field(0)->WithType(NULLPTR)) {} VarLengthListLikeBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, + const std::shared_ptr& value_builder, int64_t alignment = kDefaultBufferAlignment) : VarLengthListLikeBuilder(pool, value_builder, std::make_shared(value_builder->type()), @@ -647,13 +647,13 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { /// Use this constructor to define the built array's type explicitly. If value_builder /// has indeterminate type, this builder will also. 
FixedSizeListBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, + const std::shared_ptr& value_builder, int32_t list_size); /// Use this constructor to infer the built array's type. If value_builder has /// indeterminate type, this builder will also. FixedSizeListBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, + const std::shared_ptr& value_builder, const std::shared_ptr& type); Status Resize(int64_t capacity) override; diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index be9761fb46b..6d79d6e9649 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -26,6 +26,7 @@ #include "arrow/result.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/float16.h" namespace arrow { @@ -141,6 +142,10 @@ class NumericBuilder value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } + value_type* GetMutableValue(int64_t index) { + return &data_builder_.mutable_data()[index]; + } + void Reset() override { data_builder_.Reset(); ArrayBuilder::Reset(); @@ -319,6 +324,28 @@ class NumericBuilder data_builder_.UnsafeAppend(value_type{}); // zero } + /// Advance builder without allocating nor writing any values + /// + /// The internal pointer is advanced by `length` values and the same number + /// of non-null entries are appended to the validity bitmap. + /// This method assumes that the `length` values were populated directly, + /// for example using `GetMutableValue`. + void UnsafeAdvance(int64_t length) { + data_builder_.UnsafeAdvance(length); + UnsafeAppendToBitmap(length, true); + } + + /// Advance builder without allocating nor writing any values + /// + /// The internal pointer is advanced by `length` values and the same number + /// of validity bits are appended to the validity bitmap. + /// This method assumes that the `length` values were populated directly, + /// for example using `GetMutableValue`. 
+ void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) { + data_builder_.UnsafeAdvance(length); + UnsafeAppendToBitmap(validity, valid_bits_offset, length); + } + std::shared_ptr type() const override { return type_; } protected: @@ -338,7 +365,6 @@ using Int16Builder = NumericBuilder; using Int32Builder = NumericBuilder; using Int64Builder = NumericBuilder; -using HalfFloatBuilder = NumericBuilder; using FloatBuilder = NumericBuilder; using DoubleBuilder = NumericBuilder; @@ -358,6 +384,107 @@ using DurationBuilder = NumericBuilder; /// @} +/// \addtogroup numeric-builders +/// +/// @{ + +class ARROW_EXPORT HalfFloatBuilder : public NumericBuilder { + public: + using BaseClass = NumericBuilder; + using Float16 = arrow::util::Float16; + + using BaseClass::Append; + using BaseClass::AppendValues; + using BaseClass::BaseClass; + using BaseClass::GetValue; + using BaseClass::UnsafeAppend; + + /// Scalar append a arrow::util::Float16 + Status Append(const Float16 val) { return Append(val.bits()); } + + /// Scalar append a arrow::util::Float16, without checking for capacity + void UnsafeAppend(const Float16 val) { UnsafeAppend(val.bits()); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous array of arrow::util::Float16 + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const Float16* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + return BaseClass::AppendValues(reinterpret_cast(values), length, + valid_bytes); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous array of arrow::util::Float16 + /// \param[in] length the number of values to append + /// \param[in] bitmap a validity bitmap to copy (may be null) + /// \param[in] bitmap_offset an offset into the validity 
bitmap + /// \return Status + Status AppendValues(const Float16* values, int64_t length, const uint8_t* bitmap, + int64_t bitmap_offset) { + return BaseClass::AppendValues(reinterpret_cast(values), length, + bitmap, bitmap_offset); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous array of arrow::util::Float16 + /// \param[in] length the number of values to append + /// \param[in] is_valid a std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const Float16* values, int64_t length, + const std::vector& is_valid) { + return BaseClass::AppendValues(reinterpret_cast(values), length, + is_valid); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector + /// \param[in] is_valid a std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector + /// \return Status + Status AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); + } + + /// \brief Append one value many times in one shot + /// \param[in] length the number of values to append + /// \param[in] value a arrow::util::Float16 + Status AppendValues(int64_t length, Float16 value) { + RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, value.bits()); + ArrayBuilder::UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Get the value at a certain index + /// \param[in] index the zero-based index + /// @tparam T arrow::util::Float16 or value_type (uint16_t) + template + T GetValue(int64_t index) const { + static_assert(std::is_same_v || + std::is_same_v); + if constexpr (std::is_same_v) { + 
return BaseClass::GetValue(index); + } else { + return Float16::FromBits(BaseClass::GetValue(index)); + } + } +}; + +/// @} + class ARROW_EXPORT BooleanBuilder : public ArrayBuilder, public internal::ArrayBuilderExtraOps { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index aea53115752..c09a6b45a70 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -430,14 +430,14 @@ TEST_F(ConcatenateTest, DictionaryTypeDifferentSizeIndex) { auto bigger_dict_type = dictionary(uint16(), utf8()); auto dict_one = DictArrayFromJSON(dict_type, "[0]", "[\"A0\"]"); auto dict_two = DictArrayFromJSON(bigger_dict_type, "[0]", "[\"B0\"]"); - ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two}).status()); + ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two})); } TEST_F(ConcatenateTest, DictionaryTypeCantUnifyNullInDictionary) { auto dict_type = dictionary(uint8(), utf8()); auto dict_one = DictArrayFromJSON(dict_type, "[0, 1]", "[null, \"A\"]"); auto dict_two = DictArrayFromJSON(dict_type, "[0, 1]", "[null, \"B\"]"); - ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two}).status()); + ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two})); } TEST_F(ConcatenateTest, DictionaryTypeEnlargedIndices) { @@ -464,7 +464,7 @@ TEST_F(ConcatenateTest, DictionaryTypeEnlargedIndices) { auto dict_one = std::make_shared(dict_type, indices, dictionary_one); auto dict_two = std::make_shared(dict_type, indices, dictionary_two); - ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two}).status()); + ASSERT_RAISES(Invalid, Concatenate({dict_one, dict_two})); auto bigger_dict_type = dictionary(uint16(), uint16()); @@ -729,8 +729,7 @@ TEST_F(ConcatenateTest, OffsetOverflow) { fake_long_list->data()->child_data[0] = fake_long->data(); ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list, fake_long_list}, pool, - &suggested_cast) - .status()); + &suggested_cast)); 
ASSERT_TRUE(suggested_cast->Equals(*expected_suggestion)); } } @@ -740,8 +739,7 @@ TEST_F(ConcatenateTest, OffsetOverflow) { fake_long_list->data()->GetMutableValues(1)[1] = std::numeric_limits::max(); ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list, fake_long_list}, pool, - &suggested_cast) - .status()); + &suggested_cast)); ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_ty))); auto list_view_ty = list_view(null()); @@ -757,8 +755,7 @@ TEST_F(ConcatenateTest, OffsetOverflow) { mutable_sizes[0] = kInt32Max; } ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list_view, fake_long_list_view}, - pool, &suggested_cast) - .status()); + pool, &suggested_cast)); ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_view_ty))); } diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 2e55668fb96..1c56a485062 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -34,7 +34,7 @@ #include "arrow/type_traits.h" #include "arrow/util/binary_view_util.h" #include "arrow/util/bitmap_ops.h" -#include "arrow/util/dict_util.h" +#include "arrow/util/dict_util_internal.h" #include "arrow/util/logging_internal.h" #include "arrow/util/macros.h" #include "arrow/util/range.h" @@ -100,11 +100,15 @@ bool DictionaryMayHaveLogicalNulls(const ArrayData& data) { return ArraySpan(data).MayHaveLogicalNulls(); } +namespace { + BufferSpan PackVariadicBuffers(util::span> buffers) { return {const_cast(reinterpret_cast(buffers.data())), static_cast(buffers.size() * sizeof(std::shared_ptr))}; } +} // namespace + } // namespace internal std::shared_ptr ArrayData::Make(std::shared_ptr type, int64_t length, diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index eed7860a9f7..e921da86e15 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -64,32 +64,24 @@ constexpr int64_t kUnknownNullCount = -1; /// /// This data structure is a self-contained representation of the memory and /// 
metadata inside an Arrow array data structure (called vectors in Java). The -/// classes arrow::Array and its subclasses provide strongly-typed accessors +/// Array class and its concrete subclasses provide strongly-typed accessors /// with support for the visitor pattern and other affordances. /// /// This class is designed for easy internal data manipulation, analytical data -/// processing, and data transport to and from IPC messages. For example, we -/// could cast from int64 to float64 like so: +/// processing, and data transport to and from IPC messages. /// -/// Int64Array arr = GetMyData(); -/// auto new_data = arr.data()->Copy(); -/// new_data->type = arrow::float64(); -/// DoubleArray double_arr(new_data); +/// This class is also useful in an analytics setting where memory may be +/// efficiently reused. For example, computing the Abs of a numeric array +/// should return null iff the input is null: therefore, an Abs function can +/// reuse the validity bitmap (a Buffer) of its input as the validity bitmap +/// of its output. /// -/// This object is also useful in an analytics setting where memory may be -/// reused. For example, if we had a group of operations all returning doubles, -/// say: -/// -/// Log(Sqrt(Expr(arr))) -/// -/// Then the low-level implementations of each of these functions could have -/// the signatures -/// -/// void Log(const ArrayData& values, ArrayData* out); -/// -/// As another example a function may consume one or more memory buffers in an -/// input array and replace them with newly-allocated data, changing the output -/// data type as well. +/// This class is meant mostly for immutable data access. Any mutable access +/// (either to ArrayData members or to the contents of its Buffers) should take +/// into account the fact that ArrayData instances are typically wrapped in a +/// shared_ptr and can therefore have multiple owners at any given time. 
+/// Therefore, mutable access is discouraged except when initially populating +/// the ArrayData. struct ARROW_EXPORT ArrayData { ArrayData() = default; @@ -194,16 +186,18 @@ struct ARROW_EXPORT ArrayData { return *this; } + /// \brief Return a shallow copy of this ArrayData std::shared_ptr Copy() const { return std::make_shared(*this); } - /// \brief Copy all buffers and children recursively to destination MemoryManager + /// \brief Deep copy this ArrayData to destination memory manager /// - /// This utilizes MemoryManager::CopyBuffer to create a new ArrayData object - /// recursively copying the buffers and all child buffers to the destination - /// memory manager. This includes dictionaries if applicable. + /// Returns a new ArrayData object with buffers and all child buffers + /// copied to the destination memory manager. This includes dictionaries + /// if applicable. Result> CopyTo( const std::shared_ptr& to) const; - /// \brief View or Copy this ArrayData to destination memory manager. + + /// \brief View or copy this ArrayData to destination memory manager /// /// Tries to view the buffer contents on the given memory manager's device /// if possible (to avoid a copy) but falls back to copying if a no-copy view @@ -211,8 +205,19 @@ struct ARROW_EXPORT ArrayData { Result> ViewOrCopyTo( const std::shared_ptr& to) const; + /// \brief Return the null-ness of a given array element + /// + /// Calling `IsNull(i)` is the same as `!IsValid(i)`. bool IsNull(int64_t i) const { return !IsValid(i); } + /// \brief Return the validity of a given array element + /// + /// For most data types, this will simply query the validity bitmap. + /// For union and run-end-encoded arrays, the underlying child data is + /// queried instead. + /// For dictionary arrays, this reflects the validity of the dictionary + /// index, but the corresponding dictionary value might still be null. + /// For null arrays, this always returns false. 
bool IsValid(int64_t i) const { if (buffers[0] != NULLPTR) { return bit_util::GetBit(buffers[0]->data(), i + offset); @@ -230,7 +235,19 @@ struct ARROW_EXPORT ArrayData { return null_count.load() != length; } - // Access a buffer's data as a typed C pointer + /// \brief Access a buffer's data as a typed C pointer + /// + /// \param i the buffer index + /// \param absolute_offset the offset into the buffer + /// + /// If `absolute_offset` is non-zero, the type `T` must match the + /// layout of buffer number `i` for the array's data type; otherwise + /// offset computation would be incorrect. + /// + /// If the given buffer is bit-packed (such as a validity bitmap, or + /// the data buffer of a boolean array), then `absolute_offset` must be + /// zero for correct results, and any bit offset must be applied manually + /// by the caller. template inline const T* GetValues(int i, int64_t absolute_offset) const { if (buffers[i]) { @@ -240,13 +257,27 @@ struct ARROW_EXPORT ArrayData { } } + /// \brief Access a buffer's data as a typed C pointer + /// + /// \param i the buffer index + /// + /// This method uses the array's offset to index into buffer number `i`. + /// + /// Calling this method on a bit-packed buffer (such as a validity bitmap, or + /// the data buffer of a boolean array) will lead to incorrect results. + /// You should instead call `GetValues(i, 0)` and apply the bit offset manually. template inline const T* GetValues(int i) const { return GetValues(i, offset); } - // Like GetValues, but returns NULLPTR instead of aborting if the underlying - // buffer is not a CPU buffer. + /// \brief Access a buffer's data as a typed C pointer + /// + /// \param i the buffer index + /// \param absolute_offset the offset into the buffer + /// + /// Like `GetValues(i, absolute_offset)`, but returns nullptr if the given buffer + /// is not a CPU buffer. 
template inline const T* GetValuesSafe(int i, int64_t absolute_offset) const { if (buffers[i] && buffers[i]->is_cpu()) { @@ -256,12 +287,24 @@ struct ARROW_EXPORT ArrayData { } } + /// \brief Access a buffer's data as a typed C pointer + /// + /// \param i the buffer index + /// + /// Like `GetValues(i)`, but returns nullptr if the given buffer is not a CPU buffer. template inline const T* GetValuesSafe(int i) const { return GetValuesSafe(i, offset); } - // Access a buffer's data as a typed C pointer + /// \brief Access a buffer's data as a mutable typed C pointer + /// + /// \param i the buffer index + /// \param absolute_offset the offset into the buffer + /// + /// Like `GetValues(i, absolute_offset)`, but allows mutating buffer contents. + /// This should only be used when initially populating the ArrayData, before + /// it is attached to a Array instance. template inline T* GetMutableValues(int i, int64_t absolute_offset) { if (buffers[i]) { @@ -271,6 +314,13 @@ struct ARROW_EXPORT ArrayData { } } + /// \brief Access a buffer's data as a mutable typed C pointer + /// + /// \param i the buffer index + /// + /// Like `GetValues(i)`, but allows mutating buffer contents. + /// This should only be used when initially populating the ArrayData, before + /// it is attached to a Array instance. template inline T* GetMutableValues(int i) { return GetMutableValues(i, offset); @@ -278,36 +328,56 @@ struct ARROW_EXPORT ArrayData { /// \brief Construct a zero-copy slice of the data with the given offset and length /// - /// The associated `ArrayStatistics` is always discarded in a sliced - /// `ArrayData`. Because `ArrayStatistics` in the original - /// `ArrayData` may be invalid in a sliced `ArrayData`. If you want - /// to reuse statistics in the original `ArrayData`, you need to do - /// it by yourself. - /// - /// If the specified slice range has the same range as the original - /// `ArrayData`, we can reuse statistics in the original - /// `ArrayData`. 
Because it has the same data as the original - /// `ArrayData`. But the associated `ArrayStatistics` is discarded - /// in this case too. Use `Copy()` instead for the case. + /// This method applies the given slice to this ArrayData, taking into account + /// its existing offset and length. + /// If the given `length` is too large, the slice length is clamped so as not + /// to go past the offset end. + /// If the given `offset` is too large, or if either `offset` or `length` is negative, + /// behavior is undefined. + /// + /// The associated ArrayStatistics is always discarded in a sliced + /// ArrayData, even if the slice is trivially equal to the original ArrayData. + /// If you want to reuse the statistics from the original ArrayData, you must + /// explicitly reattach them. std::shared_ptr Slice(int64_t offset, int64_t length) const; - /// \brief Input-checking variant of Slice + /// \brief Construct a zero-copy slice of the data with the given offset and length /// - /// An Invalid Status is returned if the requested slice falls out of bounds. - /// Note that unlike Slice, `length` isn't clamped to the available buffer size. + /// Like `Slice(offset, length)`, but returns an error if the requested slice + /// falls out of bounds. + /// Unlike Slice, `length` isn't clamped to the available buffer size. Result> SliceSafe(int64_t offset, int64_t length) const; + /// \brief Set the cached physical null count + /// + /// \param v the number of nulls in the ArrayData + /// + /// This should only be used when initially populating the ArrayData, if + /// it is possible to compute the null count without visiting the entire validity + /// bitmap. In most cases, relying on `GetNullCount` is sufficient.
void SetNullCount(int64_t v) { null_count.store(v); } - /// \brief Return physical null count, or compute and set it if it's not known + /// \brief Return the physical null count + /// + /// This method returns the number of array elements for which `IsValid` would + /// return false. + /// + /// A cached value is returned if already available, otherwise it is first + /// computed and stored. + /// How it is computed depends on the data type, see `IsValid` for details. + /// + /// Note that this method is typically much faster than calling `IsValid` + /// for all elements. Therefore, it helps avoid per-element validity bitmap + /// lookups in the common cases where the array contains either no nulls or only nulls. int64_t GetNullCount() const; - /// \brief Return true if the data has a validity bitmap and the physical null - /// count is known to be non-zero or not yet known. + /// \brief Return true if the array may have nulls in its validity bitmap /// - /// Note that this is not the same as MayHaveLogicalNulls, which also checks - /// for the presence of nulls in child data for types like unions and run-end - /// encoded types. + /// This method returns true if the data has a validity bitmap, and the physical + /// null count is either known to be non-zero or not yet known. + /// + /// Unlike `MayHaveLogicalNulls`, this does not check for the presence of nulls + /// in child data for data types such as unions and run-end encoded types.
/// /// \see HasValidityBitmap /// \see MayHaveLogicalNulls @@ -317,18 +387,20 @@ struct ARROW_EXPORT ArrayData { return null_count.load() != 0 && buffers[0] != NULLPTR; } - /// \brief Return true if the data has a validity bitmap + /// \brief Return true if the array has a validity bitmap bool HasValidityBitmap() const { return buffers[0] != NULLPTR; } - /// \brief Return true if the validity bitmap may have 0's in it, or if the - /// child arrays (in the case of types without a validity bitmap) may have - /// nulls, or if the dictionary of dictionary array may have nulls. + /// \brief Return true if the array may have logical nulls /// - /// This is not a drop-in replacement for MayHaveNulls, as historically - /// MayHaveNulls() has been used to check for the presence of a validity - /// bitmap that needs to be checked. + /// Unlike `MayHaveNulls`, this method checks for null child values + /// for types without a validity bitmap, such as unions and run-end encoded + /// types, and for null dictionary values for dictionary types. /// - /// Code that previously used MayHaveNulls() and then dealt with the validity + /// This implies that `MayHaveLogicalNulls` may return true for arrays that + /// don't have a top-level validity bitmap. It is therefore necessary + /// to call `HasValidityBitmap` before accessing a top-level validity bitmap. + /// + /// Code that previously used MayHaveNulls and then dealt with the validity /// bitmap directly can be fixed to handle all types correctly without /// performance degradation when handling most types by adopting /// HasValidityBitmap and MayHaveLogicalNulls. 
@@ -373,13 +445,12 @@ struct ARROW_EXPORT ArrayData { return null_count.load() != 0; } - /// \brief Computes the logical null count for arrays of all types including - /// those that do not have a validity bitmap like union and run-end encoded - /// arrays + /// \brief Compute the logical null count for arrays of all types /// /// If the array has a validity bitmap, this function behaves the same as - /// GetNullCount. For types that have no validity bitmap, this function will - /// recompute the null count every time it is called. + /// GetNullCount. For arrays that have no validity bitmap but whose values + /// may be logically null (such as union arrays and run-end encoded arrays), + /// this function recomputes the null count every time it is called. /// /// \see GetNullCount int64_t ComputeLogicalNullCount() const; @@ -430,9 +501,14 @@ struct ARROW_EXPORT BufferSpan { } }; -/// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply -/// copyable and does not contain any shared_ptr objects. Do not use in public -/// APIs aside from compute kernels for now +/// \brief EXPERIMENTAL: A non-owning array data container +/// +/// Unlike ArrayData, this class doesn't own its referenced data type nor data buffers. +/// It is cheaply copyable and can therefore be suitable for use cases where +/// shared_ptr overhead is not acceptable. However, care should be taken to +/// keep alive the referenced objects and memory while the ArraySpan object is in use. +/// For this reason, this should not be exposed in most public APIs (apart from +/// compute kernel interfaces). 
struct ARROW_EXPORT ArraySpan { const DataType* type = NULLPTR; int64_t length = 0; diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 4a640e6b9c7..fd907e3c7b2 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -43,6 +43,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" #include "arrow/util/logging_internal.h" #include "arrow/util/range.h" #include "arrow/util/ree_util.h" @@ -57,6 +58,8 @@ using internal::checked_cast; using internal::checked_pointer_cast; using internal::MakeLazyRange; +namespace { + template auto GetView(const ArrayType& array, int64_t index) -> decltype(array.GetView(index)) { return array.GetView(index); @@ -92,11 +95,11 @@ struct UnitSlice { // FIXME(bkietz) this is inefficient; // StructArray's fields can be diffed independently then merged -static UnitSlice GetView(const StructArray& array, int64_t index) { +UnitSlice GetView(const StructArray& array, int64_t index) { return UnitSlice{&array, index}; } -static UnitSlice GetView(const UnionArray& array, int64_t index) { +UnitSlice GetView(const UnionArray& array, int64_t index) { return UnitSlice{&array, index}; } @@ -582,28 +585,6 @@ Result> NullDiff(const Array& base, const Array& ta {field("insert", boolean()), field("run_length", int64())}); } -Result> Diff(const Array& base, const Array& target, - MemoryPool* pool) { - if (!base.type()->Equals(target.type())) { - return Status::TypeError("only taking the diff of like-typed arrays is supported."); - } - - if (base.type()->id() == Type::NA) { - return NullDiff(base, target, pool); - } else if (base.type()->id() == Type::EXTENSION) { - auto base_storage = checked_cast(base).storage(); - auto target_storage = checked_cast(target).storage(); - return Diff(*base_storage, *target_storage, pool); - } else if (base.type()->id() == Type::DICTIONARY) { - return Status::NotImplemented("diffing arrays of type 
", *base.type()); - } else if (base.type()->id() == Type::LIST_VIEW || - base.type()->id() == Type::LARGE_LIST_VIEW) { - return Status::NotImplemented("diffing arrays of type ", *base.type()); - } else { - return QuadraticSpaceMyersDiff(base, target, pool).Diff(); - } -} - using Formatter = std::function; static Result MakeFormatter(const DataType& type); @@ -615,10 +596,6 @@ class MakeFormatterImpl { return std::move(impl_); } - private: - template - friend Status VisitTypeInline(const DataType&, VISITOR*, ARGS&&... args); - // factory implementation Status Visit(const BooleanType&) { impl_ = [](const Array& array, int64_t index, std::ostream* os) { @@ -627,6 +604,14 @@ class MakeFormatterImpl { return Status::OK(); } + Status Visit(const HalfFloatType&) { + impl_ = [](const Array& array, int64_t index, std::ostream* os) { + const auto& float16_arr = checked_cast(array); + *os << arrow::util::Float16::FromBits(float16_arr.Value(index)); + }; + return Status::OK(); + } + // format Numerics with std::ostream defaults template enable_if_number Visit(const T&) { @@ -913,48 +898,10 @@ class MakeFormatterImpl { Formatter impl_; }; -static Result MakeFormatter(const DataType& type) { +Result MakeFormatter(const DataType& type) { return MakeFormatterImpl{}.Make(type); } -Status VisitEditScript( - const Array& edits, - const std::function& visitor) { - static const auto edits_type = - struct_({field("insert", boolean()), field("run_length", int64())}); - DCHECK(edits.type()->Equals(*edits_type)); - DCHECK_GE(edits.length(), 1); - - auto insert = checked_pointer_cast( - checked_cast(edits).field(0)); - auto run_lengths = - checked_pointer_cast(checked_cast(edits).field(1)); - - DCHECK(!insert->Value(0)); - - auto length = run_lengths->Value(0); - int64_t base_begin, base_end, target_begin, target_end; - base_begin = base_end = target_begin = target_end = length; - for (int64_t i = 1; i < edits.length(); ++i) { - if (insert->Value(i)) { - ++target_end; - } else { - 
++base_end; - } - length = run_lengths->Value(i); - if (length != 0) { - RETURN_NOT_OK(visitor(base_begin, base_end, target_begin, target_end)); - base_begin = base_end = base_end + length; - target_begin = target_end = target_end + length; - } - } - if (length == 0) { - return visitor(base_begin, base_end, target_begin, target_end); - } - return Status::OK(); -} - class UnifiedDiffFormatter { public: UnifiedDiffFormatter(std::ostream* os, Formatter formatter) @@ -1004,6 +951,8 @@ class UnifiedDiffFormatter { Formatter formatter_; }; +} // namespace + Result> MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os) { if (type.id() == Type::NA) { @@ -1021,4 +970,64 @@ MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os) { return UnifiedDiffFormatter(os, std::move(formatter)); } +Status VisitEditScript( + const Array& edits, + const std::function& visitor) { + static const auto edits_type = + struct_({field("insert", boolean()), field("run_length", int64())}); + DCHECK(edits.type()->Equals(*edits_type)); + DCHECK_GE(edits.length(), 1); + + auto insert = checked_pointer_cast( + checked_cast(edits).field(0)); + auto run_lengths = + checked_pointer_cast(checked_cast(edits).field(1)); + + DCHECK(!insert->Value(0)); + + auto length = run_lengths->Value(0); + int64_t base_begin, base_end, target_begin, target_end; + base_begin = base_end = target_begin = target_end = length; + for (int64_t i = 1; i < edits.length(); ++i) { + if (insert->Value(i)) { + ++target_end; + } else { + ++base_end; + } + length = run_lengths->Value(i); + if (length != 0) { + RETURN_NOT_OK(visitor(base_begin, base_end, target_begin, target_end)); + base_begin = base_end = base_end + length; + target_begin = target_end = target_end + length; + } + } + if (length == 0) { + return visitor(base_begin, base_end, target_begin, target_end); + } + return Status::OK(); +} + +Result> Diff(const Array& base, const Array& target, + MemoryPool* pool) { + if 
(!base.type()->Equals(target.type())) { + return Status::TypeError("only taking the diff of like-typed arrays is supported."); + } + + if (base.type()->id() == Type::NA) { + return NullDiff(base, target, pool); + } else if (base.type()->id() == Type::EXTENSION) { + auto base_storage = checked_cast(base).storage(); + auto target_storage = checked_cast(target).storage(); + return Diff(*base_storage, *target_storage, pool); + } else if (base.type()->id() == Type::DICTIONARY) { + return Status::NotImplemented("diffing arrays of type ", *base.type()); + } else if (base.type()->id() == Type::LIST_VIEW || + base.type()->id() == Type::LARGE_LIST_VIEW) { + return Status::NotImplemented("diffing arrays of type ", *base.type()); + } else { + return QuadraticSpaceMyersDiff(base, target, pool).Diff(); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index 02bcf5bbb4c..76f4202992f 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -35,6 +35,7 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/float16.h" #include "arrow/util/logging.h" namespace arrow { @@ -75,7 +76,7 @@ class DiffTest : public ::testing::Test { void DoDiff() { auto edits = Diff(*base_, *target_, default_memory_pool()); - ASSERT_OK(edits.status()); + ASSERT_OK(edits); edits_ = edits.ValueOrDie(); ASSERT_OK(edits_->ValidateFull()); ASSERT_TRUE(edits_->type()->Equals(edits_type)); @@ -86,7 +87,7 @@ class DiffTest : public ::testing::Test { void DoDiffAndFormat(std::stringstream* out) { DoDiff(); auto formatter = MakeUnifiedDiffFormatter(*base_->type(), out); - ASSERT_OK(formatter.status()); + ASSERT_OK(formatter); ASSERT_OK(formatter.ValueOrDie()(*edits_, *base_, *target_)); } @@ -799,10 +800,10 @@ TEST_F(DiffTest, CompareRandomStruct) { auto type = struct_({field("i", int32()), field("s", utf8())}); auto base_res = StructArray::Make({int32_base, 
utf8_base}, type->fields()); - ASSERT_OK(base_res.status()); + ASSERT_OK(base_res); base_ = base_res.ValueOrDie(); auto target_res = StructArray::Make({int32_target, utf8_target}, type->fields()); - ASSERT_OK(target_res.status()); + ASSERT_OK(target_res); target_ = target_res.ValueOrDie(); std::stringstream formatted; @@ -815,4 +816,19 @@ TEST_F(DiffTest, CompareRandomStruct) { } } +TEST_F(DiffTest, CompareHalfFloat) { + auto first = ArrayFromJSON(float16(), "[1.1, 2.0, 2.5, 3.3]"); + auto second = ArrayFromJSON(float16(), "[1.1, 4.0, 3.5, 3.3]"); + auto expected_diff = R"( +@@ -1, +1 @@ +-2 +-2.5 ++4 ++3.5 +)"; + + auto diff = first->Diff(*second); + ASSERT_EQ(diff, expected_diff); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/meson.build b/cpp/src/arrow/array/meson.build new file mode 100644 index 00000000000..8abf4378047 --- /dev/null +++ b/cpp/src/arrow/array/meson.build @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +array_tests = ['concatenate_test', 'diff_test'] +foreach array_test : array_tests + test_name = 'arrow-@0@'.format(array_test.replace('_', '-')) + exc = executable( + test_name, + sources: '@0@.cc'.format(array_test), + dependencies: [arrow_test_dep], + ) + test(test_name, exc) +endforeach + +install_headers( + [ + 'array_base.h', + 'array_binary.h', + 'array_decimal.h', + 'array_dict.h', + 'array_nested.h', + 'array_primitive.h', + 'array_run_end.h', + 'builder_adaptive.h', + 'builder_base.h', + 'builder_binary.h', + 'builder_decimal.h', + 'builder_dict.h', + 'builder_nested.h', + 'builder_primitive.h', + 'builder_run_end.h', + 'builder_time.h', + 'builder_union.h', + 'concatenate.h', + 'data.h', + 'diff.h', + 'statistics.h', + 'util.h', + 'validate.h', + ], +) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 6ccd2f4766e..cbf0bc39e81 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -22,6 +22,7 @@ #include #include +#include "arrow/compare.h" #include "arrow/type.h" #include "arrow/util/visibility.h" @@ -38,6 +39,9 @@ struct ARROW_EXPORT ArrayStatistics { /// value exists, one of them is used. `std::nullopt` is used /// otherwise. 
using ValueType = std::variant; + using NumericType = std::variant; + using CountType = NumericType; + using SizeType = NumericType; static const std::shared_ptr& ValueToArrowType( const std::optional& value, @@ -60,6 +64,8 @@ struct ARROW_EXPORT ArrayStatistics { case Type::FIXED_SIZE_BINARY: case Type::LARGE_STRING: case Type::LARGE_BINARY: + case Type::BINARY_VIEW: + case Type::STRING_VIEW: return array_type; default: return utf8(); @@ -73,7 +79,20 @@ struct ARROW_EXPORT ArrayStatistics { std::optional null_count = std::nullopt; /// \brief The number of distinct values, may not be set - std::optional distinct_count = std::nullopt; + /// Note: when set to `int64_t`, it represents `exact_distinct_count`, + /// and when set to `double`, it represents `approximate_distinct_count`. + std::optional distinct_count = std::nullopt; + + /// \brief The maximum length in bytes of the rows in an array; may not be set + /// Note: when the type is `int64_t`, it represents `max_byte_width_exact`, + /// and when the type is `double`, it represents `max_byte_width_approximate`. + std::optional max_byte_width = std::nullopt; + + /// \brief The average size in bytes of a row in an array, may not be set. + std::optional average_byte_width = std::nullopt; + + /// \brief Whether the average size in bytes is exact or not. 
+ bool is_average_byte_width_exact = false; /// \brief The minimum value, may not be set std::optional min = std::nullopt; @@ -125,11 +144,17 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief Whether the maximum value is exact or not bool is_max_exact = false; - /// \brief Check two statistics for equality - bool Equals(const ArrayStatistics& other) const { - return null_count == other.null_count && distinct_count == other.distinct_count && - min == other.min && is_min_exact == other.is_min_exact && max == other.max && - is_max_exact == other.is_max_exact; + /// \brief Check two \ref arrow::ArrayStatistics for equality + /// + /// \param other The \ref arrow::ArrayStatistics instance to compare against. + /// + /// \param equal_options Options used to compare double values for equality. + /// + /// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise, + /// false. + bool Equals(const ArrayStatistics& other, + const EqualOptions& equal_options = EqualOptions::Defaults()) const { + return ArrayStatisticsEquals(*this, other, equal_options); } /// \brief Check two statistics for equality diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index cf15a5d3829..607ee8aa09f 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -15,13 +15,17 @@ // specific language governing permissions and limitations // under the License. 
+#include +#include + #include #include "arrow/array/statistics.h" +#include "arrow/compare.h" namespace arrow { -TEST(ArrayStatisticsTest, TestNullCount) { +TEST(TestArrayStatistics, NullCount) { ArrayStatistics statistics; ASSERT_FALSE(statistics.null_count.has_value()); statistics.null_count = 29; @@ -29,15 +33,50 @@ TEST(ArrayStatisticsTest, TestNullCount) { ASSERT_EQ(29, statistics.null_count.value()); } -TEST(ArrayStatisticsTest, TestDistinctCount) { +TEST(TestArrayStatistics, DistinctCountExact) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.distinct_count.has_value()); + statistics.distinct_count = static_cast(29); + ASSERT_TRUE(statistics.distinct_count.has_value()); + ASSERT_EQ(29, std::get(statistics.distinct_count.value())); +} + +TEST(TestArrayStatistics, DistinctCountApproximate) { ArrayStatistics statistics; ASSERT_FALSE(statistics.distinct_count.has_value()); - statistics.distinct_count = 29; + statistics.distinct_count = 29.0; ASSERT_TRUE(statistics.distinct_count.has_value()); - ASSERT_EQ(29, statistics.distinct_count.value()); + ASSERT_DOUBLE_EQ(29.0, std::get(statistics.distinct_count.value())); } -TEST(ArrayStatisticsTest, TestMin) { +TEST(TestArrayStatistics, MaxByteWidthExact) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.max_byte_width.has_value()); + statistics.max_byte_width = static_cast(5); + ASSERT_TRUE(statistics.max_byte_width.has_value()); + ASSERT_EQ(5, std::get(statistics.max_byte_width.value())); +} + +TEST(TestArrayStatistics, MaxByteWidthApproximate) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.max_byte_width.has_value()); + statistics.max_byte_width = 5.0; + ASSERT_TRUE(statistics.max_byte_width.has_value()); + ASSERT_DOUBLE_EQ(5.0, std::get(statistics.max_byte_width.value())); +} + +TEST(TestArrayStatistics, AverageByteWidth) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.average_byte_width.has_value()); + ASSERT_FALSE(statistics.is_average_byte_width_exact); + 
statistics.average_byte_width = 4.2; + ASSERT_TRUE(statistics.average_byte_width.has_value()); + ASSERT_DOUBLE_EQ(4.2, statistics.average_byte_width.value()); + statistics.is_average_byte_width_exact = true; + ASSERT_TRUE(statistics.is_average_byte_width_exact); +} + +TEST(TestArrayStatistics, Min) { ArrayStatistics statistics; ASSERT_FALSE(statistics.min.has_value()); ASSERT_FALSE(statistics.is_min_exact); @@ -49,7 +88,7 @@ TEST(ArrayStatisticsTest, TestMin) { ASSERT_TRUE(statistics.is_min_exact); } -TEST(ArrayStatisticsTest, TestMax) { +TEST(TestArrayStatistics, Max) { ArrayStatistics statistics; ASSERT_FALSE(statistics.max.has_value()); ASSERT_FALSE(statistics.is_max_exact); @@ -61,7 +100,7 @@ TEST(ArrayStatisticsTest, TestMax) { ASSERT_FALSE(statistics.is_max_exact); } -TEST(ArrayStatisticsTest, TestEquality) { +TEST(TestArrayStatistics, Equals) { ArrayStatistics statistics1; ArrayStatistics statistics2; @@ -72,9 +111,38 @@ TEST(ArrayStatisticsTest, TestEquality) { statistics2.null_count = 29; ASSERT_EQ(statistics1, statistics2); - statistics1.distinct_count = 2929; + // Test DISTINCT_COUNT_EXACT + statistics1.distinct_count = static_cast(2929); + ASSERT_NE(statistics1, statistics2); + statistics2.distinct_count = static_cast(2929); + ASSERT_EQ(statistics1, statistics2); + + // Test DISTINCT_COUNT_APPROXIMATE + statistics1.distinct_count = 2930.5; + ASSERT_NE(statistics1, statistics2); + statistics2.distinct_count = 2930.5; + ASSERT_EQ(statistics1, statistics2); + + // Test MAX_BYTE_WIDTH_EXACT + statistics1.max_byte_width = static_cast(5); ASSERT_NE(statistics1, statistics2); - statistics2.distinct_count = 2929; + statistics2.max_byte_width = static_cast(5); + ASSERT_EQ(statistics1, statistics2); + + // Test MAX_BYTE_WIDTH_APPROXIMATE + statistics1.max_byte_width = 5.0; + ASSERT_NE(statistics1, statistics2); + statistics2.max_byte_width = 5.0; + ASSERT_EQ(statistics1, statistics2); + + statistics1.average_byte_width = 2.9; + ASSERT_NE(statistics1, 
statistics2); + statistics2.average_byte_width = 2.9; + ASSERT_EQ(statistics1, statistics2); + + statistics1.is_average_byte_width_exact = true; + ASSERT_NE(statistics1, statistics2); + statistics2.is_average_byte_width_exact = true; ASSERT_EQ(statistics1, statistics2); statistics1.min = std::string("world"); @@ -96,6 +164,56 @@ TEST(ArrayStatisticsTest, TestEquality) { ASSERT_NE(statistics1, statistics2); statistics2.is_max_exact = true; ASSERT_EQ(statistics1, statistics2); + + // Test different ArrayStatistics::ValueType + statistics1.max = static_cast(29); + statistics2.max = static_cast(29); + ASSERT_NE(statistics1, statistics2); +} + +class TestArrayStatisticsEqualityDoubleValue : public ::testing::Test { + protected: + ArrayStatistics statistics1_; + ArrayStatistics statistics2_; + EqualOptions options_ = EqualOptions::Defaults(); +}; + +TEST_F(TestArrayStatisticsEqualityDoubleValue, ExactValue) { + statistics2_.min = 29.0; + statistics1_.min = 29.0; + ASSERT_EQ(statistics1_, statistics2_); + statistics2_.min = 30.0; + ASSERT_NE(statistics1_, statistics2_); +} + +TEST_F(TestArrayStatisticsEqualityDoubleValue, SignedZero) { + statistics1_.min = +0.0; + statistics2_.min = -0.0; + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); +} + +TEST_F(TestArrayStatisticsEqualityDoubleValue, Infinity) { + auto infinity = std::numeric_limits::infinity(); + statistics1_.min = infinity; + statistics2_.min = infinity; + ASSERT_EQ(statistics1_, statistics2_); + statistics1_.min = -infinity; + ASSERT_NE(statistics1_, statistics2_); +} + +TEST_F(TestArrayStatisticsEqualityDoubleValue, NaN) { + statistics1_.min = std::numeric_limits::quiet_NaN(); + statistics2_.min = std::numeric_limits::quiet_NaN(); + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.nans_equal(true))); + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.nans_equal(false))); +} + 
+TEST_F(TestArrayStatisticsEqualityDoubleValue, ApproximateEquals) { + statistics1_.max = 0.5001f; + statistics2_.max = 0.5; + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.atol(1e-3))); + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.atol(1e-3).use_atol(true))); } } // namespace arrow diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 3180b66a1f7..03d8c32c4e3 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -43,7 +43,7 @@ #include "arrow/util/decimal.h" #include "arrow/util/endian.h" #include "arrow/util/logging_internal.h" -#include "arrow/util/sort.h" +#include "arrow/util/sort_internal.h" #include "arrow/util/span.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" @@ -829,7 +829,19 @@ class RepeatedArrayFactory { } Status Visit(const ExtensionType& type) { - return Status::NotImplemented("construction from scalar of type ", *scalar_.type); + // Retrieve the underlying storage scalar from the ExtensionScalar + const auto& ext_scalar = checked_cast(scalar_); + const auto& storage_scalar = ext_scalar.value; + + // Create an array from the storage scalar + ARROW_ASSIGN_OR_RAISE(auto storage_array, + MakeArrayFromScalar(*storage_scalar, length_, pool_)); + + auto ext_type = std::static_pointer_cast(ext_scalar.type); + + out_ = type.WrapArray(ext_type, storage_array); + + return Status::OK(); } Result> CreateUnionTypeCodes(int8_t type_code) { diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 3c9148ebb29..bd0d00126d5 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -30,7 +30,7 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging_internal.h" #include "arrow/util/ree_util.h" -#include "arrow/util/sort.h" +#include "arrow/util/sort_internal.h" #include "arrow/util/string.h" #include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" diff --git 
a/cpp/src/arrow/arrow-compute.pc.in b/cpp/src/arrow/arrow-compute.pc.in new file mode 100644 index 00000000000..2da0986d612 --- /dev/null +++ b/cpp/src/arrow/arrow-compute.pc.in @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prefix=@CMAKE_INSTALL_PREFIX@ +includedir=@ARROW_PKG_CONFIG_INCLUDEDIR@ +libdir=@ARROW_PKG_CONFIG_LIBDIR@ + +Name: Apache Arrow Compute Kernels +Description: Apache Arrow's Compute Kernels. 
+Version: @ARROW_VERSION@ +Requires: arrow +Libs: -L${libdir} -larrow_compute +Cflags:@ARROW_COMPUTE_PC_CFLAGS@ +Cflags.private:@ARROW_COMPUTE_PC_CFLAGS_PRIVATE@ diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 2254b6e067f..ab20ce7fb9f 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -203,10 +203,6 @@ Result> AllocateEmptyBitmap(int64_t length, int64_t alig return std::shared_ptr(std::move(buf)); } -Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out) { - return AllocateEmptyBitmap(length).Value(out); -} - Result> ConcatenateBuffers( const std::vector>& buffers, MemoryPool* pool) { int64_t out_length = 0; diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index a84c98b6b24..e9177c656c0 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -67,7 +67,7 @@ class ARROW_EXPORT BufferBuilder { /// \brief Resize the buffer to the nearest multiple of 64 bytes /// - /// \param new_capacity the new capacity of the of the builder. Will be + /// \param new_capacity the new capacity of the builder. Will be /// rounded up to a multiple of 64 bytes for padding /// \param shrink_to_fit if new capacity is smaller than the existing, /// reallocate internal buffer. Set to false to avoid reallocations when @@ -295,6 +295,10 @@ class TypedBufferBuilder< return bytes_builder_.Advance(length * sizeof(T)); } + void UnsafeAdvance(const int64_t length) { + bytes_builder_.UnsafeAdvance(length * sizeof(T)); + } + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { return bytes_builder_.Finish(out, shrink_to_fit); } diff --git a/cpp/src/arrow/c/CMakeLists.txt b/cpp/src/arrow/c/CMakeLists.txt index 81a81cd3f11..a7f722aacc9 100644 --- a/cpp/src/arrow/c/CMakeLists.txt +++ b/cpp/src/arrow/c/CMakeLists.txt @@ -15,7 +15,28 @@ # specific language governing permissions and limitations # under the License. 
-add_arrow_test(bridge_test PREFIX "arrow-c") +# TODO(GH-37221): Remove compute dependency for REE requirements on bridge_test +set(ARROW_TEST_LINK_LIBS "") + +if(ARROW_TEST_LINKAGE STREQUAL "static") + list(APPEND ARROW_TEST_LINK_LIBS ${ARROW_TEST_STATIC_LINK_LIBS}) +else() + list(APPEND ARROW_TEST_LINK_LIBS ${ARROW_TEST_SHARED_LINK_LIBS}) +endif() + +if(ARROW_COMPUTE) + if(ARROW_TEST_LINKAGE STREQUAL "static") + list(APPEND ARROW_TEST_LINK_LIBS arrow_compute_static arrow_compute_testing) + else() + list(APPEND ARROW_TEST_LINK_LIBS arrow_compute_shared arrow_compute_testing) + endif() +endif() + +add_arrow_test(bridge_test + PREFIX + "arrow-c" + STATIC_LINK_LIBS + ${ARROW_TEST_LINK_LIBS}) add_arrow_test(dlpack_test) add_arrow_benchmark(bridge_benchmark) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 0c158322d0a..dd25ed299dd 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -586,8 +586,8 @@ struct ArrayExporter { ++buffers_begin; } - bool need_variadic_buffer_sizes = - data->type->id() == Type::BINARY_VIEW || data->type->id() == Type::STRING_VIEW; + bool need_variadic_buffer_sizes = data->type->storage_id() == Type::BINARY_VIEW || + data->type->storage_id() == Type::STRING_VIEW; if (need_variadic_buffer_sizes) { ++n_buffers; } @@ -713,6 +713,8 @@ Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out, ////////////////////////////////////////////////////////////////////////// // C device arrays +namespace { + Status ValidateDeviceInfo(const ArrayData& data, std::optional* device_type, int64_t* device_id) { @@ -753,6 +755,8 @@ Result, int64_t>> ValidateDeviceIn return std::make_pair(device_type, device_id); } +} // namespace + Status ExportDeviceArray(const Array& array, std::shared_ptr sync, struct ArrowDeviceArray* out, struct ArrowSchema* out_schema) { void* sync_event = sync ? 
sync->get_raw() : nullptr; @@ -1255,15 +1259,15 @@ struct SchemaImporter { return f_parser_.Invalid(); } if (prec_scale.size() == 2) { - type_ = decimal128(prec_scale[0], prec_scale[1]); + ARROW_ASSIGN_OR_RAISE(type_, Decimal128Type::Make(prec_scale[0], prec_scale[1])); } else if (prec_scale[2] == 32) { - type_ = decimal32(prec_scale[0], prec_scale[1]); + ARROW_ASSIGN_OR_RAISE(type_, Decimal32Type::Make(prec_scale[0], prec_scale[1])); } else if (prec_scale[2] == 64) { - type_ = decimal64(prec_scale[0], prec_scale[1]); + ARROW_ASSIGN_OR_RAISE(type_, Decimal64Type::Make(prec_scale[0], prec_scale[1])); } else if (prec_scale[2] == 128) { - type_ = decimal128(prec_scale[0], prec_scale[1]); + ARROW_ASSIGN_OR_RAISE(type_, Decimal128Type::Make(prec_scale[0], prec_scale[1])); } else if (prec_scale[2] == 256) { - type_ = decimal256(prec_scale[0], prec_scale[1]); + ARROW_ASSIGN_OR_RAISE(type_, Decimal256Type::Make(prec_scale[0], prec_scale[1])); } else { return f_parser_.Invalid(); } diff --git a/cpp/src/arrow/c/bridge_benchmark.cc b/cpp/src/arrow/c/bridge_benchmark.cc index 85e091704bf..2df31318ab6 100644 --- a/cpp/src/arrow/c/bridge_benchmark.cc +++ b/cpp/src/arrow/c/bridge_benchmark.cc @@ -22,7 +22,7 @@ #include "arrow/array.h" #include "arrow/c/bridge.h" #include "arrow/c/helpers.h" -#include "arrow/ipc/json_simple.h" +#include "arrow/json/from_string.h" #include "arrow/record_batch.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -79,7 +79,7 @@ static void ExportSchema(benchmark::State& state) { // NOLINT non-const referen static void ExportArray(benchmark::State& state) { // NOLINT non-const reference struct ArrowArray c_export; - auto array = ArrayFromJSON(utf8(), R"(["foo", "bar", null])"); + auto array = arrow::ArrayFromJSON(utf8(), R"(["foo", "bar", null])"); for (auto _ : state) { ABORT_NOT_OK(::arrow::ExportArray(*array, &c_export)); @@ -123,7 +123,7 @@ static void ExportImportSchema(benchmark::State& state) { // NOLINT non-const r static void 
ExportImportArray(benchmark::State& state) { // NOLINT non-const reference struct ArrowArray c_export; - auto array = ArrayFromJSON(utf8(), R"(["foo", "bar", null])"); + auto array = arrow::ArrayFromJSON(utf8(), R"(["foo", "bar", null])"); auto type = array->type(); for (auto _ : state) { diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 5848dd0b55b..c6a5e01e038 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -31,7 +31,6 @@ #include "arrow/c/bridge.h" #include "arrow/c/helpers.h" #include "arrow/c/util_internal.h" -#include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" #include "arrow/testing/builder.h" #include "arrow/testing/extension_type.h" @@ -581,8 +580,10 @@ struct ArrayExportChecker { --expected_n_buffers; ++expected_buffers; } - bool has_variadic_buffer_sizes = expected_data.type->id() == Type::STRING_VIEW || - expected_data.type->id() == Type::BINARY_VIEW; + + bool has_variadic_buffer_sizes = + expected_data.type->storage_id() == Type::BINARY_VIEW || + expected_data.type->storage_id() == Type::STRING_VIEW; ASSERT_EQ(c_export->n_buffers, expected_n_buffers + has_variadic_buffer_sizes); ASSERT_NE(c_export->buffers, nullptr); @@ -961,6 +962,13 @@ TEST_F(TestArrayExport, BinaryViewMultipleBuffers) { }); } +TEST_F(TestArrayExport, BinaryViewExtensionWithMultipleBuffers) { + TestPrimitive([&] { + auto storage = MakeBinaryViewArrayWithMultipleDataBuffers(); + return BinaryViewExtensionType::WrapArray(binary_view_extension_type(), storage); + }); +} + TEST_F(TestArrayExport, Null) { TestPrimitive(null(), "[null, null, null]"); TestPrimitive(null(), "[]"); @@ -2349,6 +2357,34 @@ TEST_F(TestSchemaImport, DictionaryError) { CheckImportError(); } +TEST_F(TestSchemaImport, DecimalError) { + // Decimal precision out of bounds + FillPrimitive("d:0,10"); + CheckImportError(); + FillPrimitive("d:39,10"); + CheckImportError(); + + FillPrimitive("d:0,4,32"); + CheckImportError(); + 
FillPrimitive("d:10,4,32"); + CheckImportError(); + + FillPrimitive("d:0,4,64"); + CheckImportError(); + FillPrimitive("d:19,4,64"); + CheckImportError(); + + FillPrimitive("d:0,10,128"); + CheckImportError(); + FillPrimitive("d:39,10,128"); + CheckImportError(); + + FillPrimitive("d:0,4,256"); + CheckImportError(); + FillPrimitive("d:77,4,256"); + CheckImportError(); +} + TEST_F(TestSchemaImport, ExtensionError) { ExtensionTypeGuard guard(uuid()); diff --git a/cpp/src/arrow/c/dlpack.cc b/cpp/src/arrow/c/dlpack.cc index 13ee2761b0c..94d7816d78e 100644 --- a/cpp/src/arrow/c/dlpack.cc +++ b/cpp/src/arrow/c/dlpack.cc @@ -20,6 +20,7 @@ #include "arrow/array/array_base.h" #include "arrow/c/dlpack_abi.h" #include "arrow/device.h" +#include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -66,7 +67,7 @@ struct ManagerCtx { } // namespace Result ExportArray(const std::shared_ptr& arr) { - // Define DLDevice struct nad check if array type is supported + // Define DLDevice struct and check if array type is supported // by the DLPack protocol at the same time. Raise TypeError if not. // Supported data types: int, uint, float with no validity buffer. 
ARROW_ASSIGN_OR_RAISE(auto device, ExportDevice(arr)) @@ -77,7 +78,7 @@ Result ExportArray(const std::shared_ptr& arr) { ARROW_ASSIGN_OR_RAISE(auto dlpack_type, GetDLDataType(type)); // Create ManagerCtx that will serve as the owner of the DLManagedTensor - std::unique_ptr ctx(new ManagerCtx); + auto ctx = std::make_unique(); // Define the data pointer to the DLTensor // If array is of length 0, data pointer should be NULL @@ -130,4 +131,71 @@ Result ExportDevice(const std::shared_ptr& arr) { } } +struct TensorManagerCtx { + std::shared_ptr t; + std::vector strides; + std::vector shape; + DLManagedTensor tensor; +}; + +Result ExportTensor(const std::shared_ptr& t) { + // Define the DLDataType struct + const DataType& type = *t->type(); + ARROW_ASSIGN_OR_RAISE(auto dlpack_type, GetDLDataType(type)); + + // Define DLDevice struct + ARROW_ASSIGN_OR_RAISE(auto device, ExportDevice(t)) + + // Create TensorManagerCtx that will serve as the owner of the DLManagedTensor + auto ctx = std::make_unique(); + + // Define the data pointer to the DLTensor + // If tensor is of length 0, data pointer should be NULL + if (t->size() == 0) { + ctx->tensor.dl_tensor.data = NULL; + } else { + ctx->tensor.dl_tensor.data = t->raw_mutable_data(); + } + + ctx->tensor.dl_tensor.device = device; + ctx->tensor.dl_tensor.ndim = t->ndim(); + ctx->tensor.dl_tensor.dtype = dlpack_type; + ctx->tensor.dl_tensor.byte_offset = 0; + + std::vector& shape_arr = ctx->shape; + shape_arr.reserve(t->ndim()); + for (auto i : t->shape()) { + shape_arr.emplace_back(i); + } + ctx->tensor.dl_tensor.shape = shape_arr.data(); + + std::vector& strides_arr = ctx->strides; + strides_arr.reserve(t->ndim()); + auto byte_width = t->type()->byte_width(); + for (auto i : t->strides()) { + strides_arr.emplace_back(i / byte_width); + } + ctx->tensor.dl_tensor.strides = strides_arr.data(); + + ctx->t = std::move(t); + ctx->tensor.manager_ctx = ctx.get(); + ctx->tensor.deleter = [](struct DLManagedTensor* self) { + delete 
reinterpret_cast(self->manager_ctx); + }; + return &ctx.release()->tensor; +} + +Result ExportDevice(const std::shared_ptr& t) { + // Define DLDevice struct + DLDevice device; + if (t->data()->device_type() == DeviceAllocationType::kCPU) { + device.device_id = 0; + device.device_type = DLDeviceType::kDLCPU; + return device; + } else { + return Status::NotImplemented( + "DLPack support is implemented only for buffers on CPU device."); + } +} + } // namespace arrow::dlpack diff --git a/cpp/src/arrow/c/dlpack.h b/cpp/src/arrow/c/dlpack.h index d11ccfc1fd7..65da38423c2 100644 --- a/cpp/src/arrow/c/dlpack.h +++ b/cpp/src/arrow/c/dlpack.h @@ -39,6 +39,9 @@ namespace arrow::dlpack { ARROW_EXPORT Result ExportArray(const std::shared_ptr& arr); +ARROW_EXPORT +Result ExportTensor(const std::shared_ptr& t); + /// \brief Get DLDevice with enumerator specifying the /// type of the device data is stored on and index of the /// device which is 0 by default for CPU. @@ -48,4 +51,7 @@ Result ExportArray(const std::shared_ptr& arr); ARROW_EXPORT Result ExportDevice(const std::shared_ptr& arr); +ARROW_EXPORT +Result ExportDevice(const std::shared_ptr& t); + } // namespace arrow::dlpack diff --git a/cpp/src/arrow/c/dlpack_test.cc b/cpp/src/arrow/c/dlpack_test.cc index 3136506bf39..f0119e8aef7 100644 --- a/cpp/src/arrow/c/dlpack_test.cc +++ b/cpp/src/arrow/c/dlpack_test.cc @@ -21,6 +21,7 @@ #include "arrow/c/dlpack.h" #include "arrow/c/dlpack_abi.h" #include "arrow/memory_pool.h" +#include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" namespace arrow::dlpack { @@ -48,7 +49,6 @@ void CheckDLTensor(const std::shared_ptr& arr, ASSERT_EQ(1, dltensor.ndim); ASSERT_EQ(dlpack_type, dltensor.dtype.code); - ASSERT_EQ(arrow_type->bit_width(), dltensor.dtype.bits); ASSERT_EQ(1, dltensor.dtype.lanes); ASSERT_EQ(DLDeviceType::kDLCPU, dltensor.device.device_type); @@ -126,4 +126,93 @@ TEST_F(TestExportArray, TestErrors) { arrow::dlpack::ExportDevice(array_boolean)); } +class 
TestExportTensor : public ::testing::Test { + public: + void SetUp() {} +}; + +void CheckDLTensor(const std::shared_ptr& t, + const std::shared_ptr& tensor_type, + DLDataTypeCode dlpack_type, std::vector shape, + std::vector strides) { + ASSERT_OK_AND_ASSIGN(auto dlmtensor, arrow::dlpack::ExportTensor(t)); + auto dltensor = dlmtensor->dl_tensor; + + ASSERT_EQ(t->data()->data(), dltensor.data); + ASSERT_EQ(t->ndim(), dltensor.ndim); + ASSERT_EQ(0, dltensor.byte_offset); + for (int i = 0; i < t->ndim(); i++) { + ASSERT_EQ(shape.data()[i], dltensor.shape[i]); + ASSERT_EQ(strides.data()[i], dltensor.strides[i]); + } + + ASSERT_EQ(dlpack_type, dltensor.dtype.code); + ASSERT_EQ(tensor_type->bit_width(), dltensor.dtype.bits); + ASSERT_EQ(1, dltensor.dtype.lanes); + ASSERT_EQ(DLDeviceType::kDLCPU, dltensor.device.device_type); + ASSERT_EQ(0, dltensor.device.device_id); + + ASSERT_OK_AND_ASSIGN(auto device, arrow::dlpack::ExportDevice(t)); + ASSERT_EQ(DLDeviceType::kDLCPU, device.device_type); + ASSERT_EQ(0, device.device_id); + + dlmtensor->deleter(dlmtensor); +} + +TEST_F(TestExportTensor, TestTensor) { + const std::vector, DLDataTypeCode>> cases = { + {int8(), DLDataTypeCode::kDLInt}, + {uint8(), DLDataTypeCode::kDLUInt}, + { + int16(), + DLDataTypeCode::kDLInt, + }, + {uint16(), DLDataTypeCode::kDLUInt}, + { + int32(), + DLDataTypeCode::kDLInt, + }, + {uint32(), DLDataTypeCode::kDLUInt}, + { + int64(), + DLDataTypeCode::kDLInt, + }, + {uint64(), DLDataTypeCode::kDLUInt}, + {float16(), DLDataTypeCode::kDLFloat}, + {float32(), DLDataTypeCode::kDLFloat}, + {float64(), DLDataTypeCode::kDLFloat}}; + + const auto allocated_bytes = arrow::default_memory_pool()->bytes_allocated(); + + for (auto [arrow_type, dlpack_type] : cases) { + std::vector shape = {3, 6}; + std::vector dlpack_strides = {6, 1}; + std::shared_ptr tensor = TensorFromJSON( + arrow_type, "[1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9]", shape); + + CheckDLTensor(tensor, arrow_type, dlpack_type, shape, 
dlpack_strides); + } + + ASSERT_EQ(allocated_bytes, arrow::default_memory_pool()->bytes_allocated()); +} + +TEST_F(TestExportTensor, TestTensorStrided) { + std::vector shape = {2, 2, 2}; + std::vector strides = {sizeof(float) * 4, sizeof(float) * 2, + sizeof(float) * 1}; + std::vector dlpack_strides = {4, 2, 1}; + std::shared_ptr tensor = + TensorFromJSON(float32(), "[1, 2, 3, 4, 5, 6, 1, 1]", shape, strides); + + CheckDLTensor(tensor, float32(), DLDataTypeCode::kDLFloat, shape, dlpack_strides); + + std::vector f_strides = {sizeof(float) * 1, sizeof(float) * 2, + sizeof(float) * 4}; + std::vector f_dlpack_strides = {1, 2, 4}; + std::shared_ptr f_tensor = + TensorFromJSON(float32(), "[1, 2, 3, 4, 5, 6, 1, 1]", shape, f_strides); + + CheckDLTensor(f_tensor, float32(), DLDataTypeCode::kDLFloat, shape, f_dlpack_strides); +} + } // namespace arrow::dlpack diff --git a/cpp/src/arrow/c/meson.build b/cpp/src/arrow/c/meson.build index b5713a6e083..519e7f5910a 100644 --- a/cpp/src/arrow/c/meson.build +++ b/cpp/src/arrow/c/meson.build @@ -15,10 +15,16 @@ # specific language governing permissions and limitations # under the License. +if needs_compute + arrow_c_bridge_deps = [arrow_compute_test_dep] +else + arrow_c_bridge_deps = [arrow_test_dep] +endif + exc = executable( 'arrow-c-bridge-test', sources: ['bridge_test.cc'], - dependencies: [arrow_test_dep], + dependencies: arrow_c_bridge_deps, ) test('arrow-c-bridge-test', exc) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 988fc148632..0fa174c1759 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -98,8 +98,34 @@ DeviceAllocationTypeSet ChunkedArray::device_types() const { } return set; } +namespace { + +// Check whether the type or any of its children is a float type. +bool ContainsFloatType(const DataType& type) { + if (is_floating(type.id())) { + return true; + } else { + // Check if any nested field contains a float type. 
+ for (const auto& field : type.fields()) { + if (ContainsFloatType(*field->type())) { + return true; + } + } + } + // No float types are observed + return false; +} + +} // namespace bool ChunkedArray::Equals(const ChunkedArray& other, const EqualOptions& opts) const { + if (this == &other) { + if (opts.nans_equal()) { + return true; + } else if (!ContainsFloatType(*type_)) { + return true; + } + } if (length_ != other.length()) { return false; } @@ -125,59 +151,17 @@ bool ChunkedArray::Equals(const ChunkedArray& other, const EqualOptions& opts) c .ok(); } -namespace { - -bool mayHaveNaN(const arrow::DataType& type) { - if (type.num_fields() == 0) { - return is_floating(type.id()); - } else { - for (const auto& field : type.fields()) { - if (mayHaveNaN(*field->type())) { - return true; - } - } - } - return false; -} - -} // namespace - bool ChunkedArray::Equals(const std::shared_ptr& other, const EqualOptions& opts) const { if (!other) { return false; } - if (this == other.get() && !mayHaveNaN(*type_)) { - return true; - } return Equals(*other.get(), opts); } bool ChunkedArray::ApproxEquals(const ChunkedArray& other, const EqualOptions& equal_options) const { - if (length_ != other.length()) { - return false; - } - if (null_count_ != other.null_count()) { - return false; - } - // We cannot toggle check_metadata here yet, so we don't check it - if (!type_->Equals(*other.type_, /*check_metadata=*/false)) { - return false; - } - - // Check contents of the underlying arrays. This checks for equality of - // the underlying data independently of the chunk size. 
- return internal::ApplyBinaryChunked( - *this, other, - [&](const Array& left_piece, const Array& right_piece, - int64_t ARROW_ARG_UNUSED(position)) { - if (!left_piece.ApproxEquals(right_piece, equal_options)) { - return Status::Invalid("Unequal piece"); - } - return Status::OK(); - }) - .ok(); + return Equals(other, equal_options.use_atol(true)); } Result> ChunkedArray::GetScalar(int64_t index) const { diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index b3944fd1b19..326eb24d083 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -153,33 +153,57 @@ TEST_F(TestChunkedArray, EqualsDifferingMetadata) { ASSERT_TRUE(left.Equals(right)); } -TEST_F(TestChunkedArray, EqualsSameAddressWithNaNs) { - auto chunk_with_nan1 = ArrayFromJSON(float64(), "[0, 1, 2, NaN]"); - auto chunk_without_nan1 = ArrayFromJSON(float64(), "[3, 4, 5]"); - ArrayVector chunks1 = {chunk_with_nan1, chunk_without_nan1}; - ASSERT_OK_AND_ASSIGN(auto chunked_array_with_nan1, ChunkedArray::Make(chunks1)); - ASSERT_FALSE(chunked_array_with_nan1->Equals(chunked_array_with_nan1)); - - auto chunk_without_nan2 = ArrayFromJSON(float64(), "[6, 7, 8, 9]"); - ArrayVector chunks2 = {chunk_without_nan1, chunk_without_nan2}; - ASSERT_OK_AND_ASSIGN(auto chunked_array_without_nan1, ChunkedArray::Make(chunks2)); - ASSERT_TRUE(chunked_array_without_nan1->Equals(chunked_array_without_nan1)); +class TestChunkedArrayEqualsSameAddress : public TestChunkedArray {}; +TEST_F(TestChunkedArrayEqualsSameAddress, NonFloatType) { auto int32_array = ArrayFromJSON(int32(), "[0, 1, 2]"); - auto float64_array_with_nan = ArrayFromJSON(float64(), "[0, 1, NaN]"); - ArrayVector arrays1 = {int32_array, float64_array_with_nan}; - std::vector fieldnames = {"Int32Type", "Float64Type"}; - ASSERT_OK_AND_ASSIGN(auto struct_with_nan, StructArray::Make(arrays1, fieldnames)); - ArrayVector chunks3 = {struct_with_nan}; - ASSERT_OK_AND_ASSIGN(auto 
chunked_array_with_nan2, ChunkedArray::Make(chunks3)); - ASSERT_FALSE(chunked_array_with_nan2->Equals(chunked_array_with_nan2)); - - auto float64_array_without_nan = ArrayFromJSON(float64(), "[0, 1, 2]"); - ArrayVector arrays2 = {int32_array, float64_array_without_nan}; - ASSERT_OK_AND_ASSIGN(auto struct_without_nan, StructArray::Make(arrays2, fieldnames)); - ArrayVector chunks4 = {struct_without_nan}; - ASSERT_OK_AND_ASSIGN(auto chunked_array_without_nan2, ChunkedArray::Make(chunks4)); - ASSERT_TRUE(chunked_array_without_nan2->Equals(chunked_array_without_nan2)); + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({int32_array})); + ASSERT_TRUE(chunked_array->Equals(chunked_array)); +} + +TEST_F(TestChunkedArrayEqualsSameAddress, NestedTypeWithoutFloat) { + auto int32_array = ArrayFromJSON(int32(), "[0, 1]"); + ASSERT_OK_AND_ASSIGN(auto struct_array, + StructArray::Make({int32_array}, {"Int32Type"})); + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({struct_array})); + + ASSERT_TRUE(chunked_array->Equals(chunked_array)); +} + +TEST_F(TestChunkedArrayEqualsSameAddress, FloatType) { + auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, NaN]"); + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({float64_array})); + + ASSERT_FALSE(chunked_array->Equals(chunked_array)); + + // Assert when EqualOptions::nans_equal_ is set + ASSERT_TRUE( + chunked_array->Equals(chunked_array, EqualOptions::Defaults().nans_equal(true))); +} + +TEST_F(TestChunkedArrayEqualsSameAddress, NestedTypeWithFloat) { + auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, NaN]"); + ASSERT_OK_AND_ASSIGN(auto struct_array, + StructArray::Make({float64_array}, {"Float64Type"})); + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({struct_array})); + + ASSERT_FALSE(chunked_array->Equals(chunked_array)); + + // Assert when EqualOptions::nans_equal_ is set + ASSERT_TRUE( + chunked_array->Equals(chunked_array, 
EqualOptions::Defaults().nans_equal(true))); +} + +TEST_F(TestChunkedArray, ApproxEquals) { + auto chunk_1 = ArrayFromJSON(float64(), R"([0.0, 0.1, 0.5])"); + auto chunk_2 = ArrayFromJSON(float64(), R"([0.0, 0.1, 0.5001])"); + ASSERT_OK_AND_ASSIGN(auto chunked_array_1, ChunkedArray::Make({chunk_1})); + ASSERT_OK_AND_ASSIGN(auto chunked_array_2, ChunkedArray::Make({chunk_2})); + auto options = EqualOptions::Defaults().atol(1e-3); + + ASSERT_FALSE(chunked_array_1->Equals(chunked_array_2)); + ASSERT_TRUE(chunked_array_1->Equals(chunked_array_2, options.use_atol(true))); + ASSERT_TRUE(chunked_array_1->ApproxEquals(*chunked_array_2, options)); } TEST_F(TestChunkedArray, SliceEquals) { diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 3b64a8fd09f..a86d8ba6734 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -24,13 +24,16 @@ #include #include #include +#include #include #include #include +#include #include #include "arrow/array.h" #include "arrow/array/diff.h" +#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/scalar.h" #include "arrow/sparse_tensor.h" @@ -48,8 +51,9 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging_internal.h" #include "arrow/util/macros.h" -#include "arrow/util/memory.h" +#include "arrow/util/memory_internal.h" #include "arrow/util/ree_util.h" +#include "arrow/util/unreachable.h" #include "arrow/visit_scalar_inline.h" #include "arrow/visit_type_inline.h" @@ -106,7 +110,7 @@ struct FloatingEquality { bool operator()(uint16_t x, uint16_t y) const { Float16 f_x = Float16::FromBits(x); Float16 f_y = Float16::FromBits(y); - if (x == y) { + if (f_x == f_y) { return Flags::signed_zeros_equal || (f_x.signbit() == f_y.signbit()); } if (Flags::nans_equal && f_x.is_nan() && f_y.is_nan()) { @@ -167,7 +171,8 @@ void VisitFloatingEquality(const EqualOptions& options, bool floating_approximat } inline bool IdentityImpliesEqualityNansNotEqual(const DataType& type) { - if 
(type.id() == Type::FLOAT || type.id() == Type::DOUBLE) { + if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE || + type.id() == Type::HALF_FLOAT) { return false; } for (const auto& child : type.fields()) { @@ -1152,9 +1157,8 @@ bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& o bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx, int64_t left_end_idx, int64_t right_start_idx, const EqualOptions& options) { - const bool floating_approximate = false; return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx, - options, floating_approximate); + options, options.use_atol()); } bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t left_start_idx, @@ -1166,8 +1170,7 @@ bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t left_ } bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts) { - const bool floating_approximate = false; - return ArrayEquals(left, right, opts, floating_approximate); + return ArrayEquals(left, right, opts, opts.use_atol()); } bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& opts) { @@ -1176,8 +1179,7 @@ bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions } bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options) { - const bool floating_approximate = false; - return ScalarEquals(left, right, options, floating_approximate); + return ScalarEquals(left, right, options, options.use_atol()); } bool ScalarApproxEquals(const Scalar& left, const Scalar& right, @@ -1523,4 +1525,63 @@ bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata } } +namespace { + +bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) { + bool result; + auto visitor = [&](auto&& compare_func) { result = compare_func(left, right); }; + VisitFloatingEquality(options, 
options.use_atol(), std::move(visitor)); + return result; +} + +template +bool ArrayStatisticsOptionalValueEquals(const std::optional& left, + const std::optional& right, + const EqualOptions& options) { + if (!left.has_value() || !right.has_value()) { + return left.has_value() == right.has_value(); + } else if constexpr (std::is_same_v) { + return DoubleEquals(left.value(), right.value(), options); + } else if (left->index() != right->index()) { + return false; + } else { + auto EqualsVisitor = [&](const auto& v1, const auto& v2) { + using type_1 = std::decay_t; + using type_2 = std::decay_t; + if constexpr (std::conjunction_v, + std::is_same>) { + return DoubleEquals(v1, v2, options); + } else if constexpr (std::is_same_v) { + return v1 == v2; + } + Unreachable("The types are different."); + return false; + }; + return std::visit(EqualsVisitor, left.value(), right.value()); + } +} + +bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& equal_options) { + return left.null_count == right.null_count && + ArrayStatisticsOptionalValueEquals(left.distinct_count, right.distinct_count, + equal_options) && + ArrayStatisticsOptionalValueEquals(left.max_byte_width, right.max_byte_width, + equal_options) && + left.is_average_byte_width_exact == right.is_average_byte_width_exact && + left.is_min_exact == right.is_min_exact && + left.is_max_exact == right.is_max_exact && + ArrayStatisticsOptionalValueEquals(left.average_byte_width, + right.average_byte_width, equal_options) && + ArrayStatisticsOptionalValueEquals(left.min, right.min, equal_options) && + ArrayStatisticsOptionalValueEquals(left.max, right.max, equal_options); +} + +} // namespace + +bool ArrayStatisticsEquals(const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& options) { + return ArrayStatisticsEqualsImpl(left, right, options); +} + } // namespace arrow diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 
6dbacfa86af..2198495d7d2 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -27,6 +27,7 @@ namespace arrow { +struct ArrayStatistics; class Array; class DataType; class Tensor; @@ -58,7 +59,21 @@ class EqualOptions { return res; } + /// Whether the "atol" property is used in the comparison. + /// + /// This option only affects the Equals methods + /// and has no effect on ApproxEquals methods. + bool use_atol() const { return use_atol_; } + + /// Return a new EqualOptions object with the "use_atol" property changed. + EqualOptions use_atol(bool v) const { + auto res = EqualOptions(*this); + res.use_atol_ = v; + return res; + } + /// The absolute tolerance for approximate comparisons of floating-point values. + /// Note that this option is ignored if "use_atol" is set to false. double atol() const { return atol_; } /// Return a new EqualOptions object with the "atol" property changed. @@ -68,6 +83,38 @@ class EqualOptions { return res; } + /// Whether the \ref arrow::Schema property is used in the comparison. + /// + /// This option only affects the Equals methods + /// and has no effect on ApproxEquals methods. + bool use_schema() const { return use_schema_; } + + /// Return a new EqualOptions object with the "use_schema" property changed. + /// + /// Setting this option to false makes the value of \ref EqualOptions::use_metadata + /// ignored. + EqualOptions use_schema(bool v) const { + auto res = EqualOptions(*this); + res.use_schema_ = v; + return res; + } + + /// Whether the "metadata" in \ref arrow::Schema is used in the comparison. + /// + /// This option only affects the Equals methods + /// and has no effect on the ApproxEquals methods. + /// + /// Note: This option is only considered when \ref arrow::EqualOptions::use_schema is + /// set to true. + bool use_metadata() const { return use_metadata_; } + + /// Return a new EqualOptions object with the "use_metadata" property changed. 
+ EqualOptions use_metadata(bool v) const { + auto res = EqualOptions(*this); + res.use_metadata_ = v; + return res; + } + /// The ostream to which a diff will be formatted if arrays disagree. /// If this is null (the default) no diff will be formatted. std::ostream* diff_sink() const { return diff_sink_; } @@ -87,26 +134,37 @@ class EqualOptions { double atol_ = kDefaultAbsoluteTolerance; bool nans_equal_ = false; bool signed_zeros_equal_ = true; + bool use_atol_ = false; + bool use_schema_ = true; + bool use_metadata_ = false; std::ostream* diff_sink_ = NULLPTR; }; /// Returns true if the arrays are exactly equal +/// +/// Note that arrow::ArrayStatistics is not included in the comparison. ARROW_EXPORT bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& = EqualOptions::Defaults()); /// Returns true if the arrays are approximately equal. For non-floating point /// types, this is equivalent to ArrayEquals(left, right) +/// +/// Note that arrow::ArrayStatistics is not included in the comparison. ARROW_EXPORT bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& = EqualOptions::Defaults()); /// Returns true if indicated equal-length segment of arrays are exactly equal +/// +/// Note that arrow::ArrayStatistics is not included in the comparison. ARROW_EXPORT bool ArrayRangeEquals(const Array& left, const Array& right, int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const EqualOptions& = EqualOptions::Defaults()); /// Returns true if indicated equal-length segment of arrays are approximately equal +/// +/// Note that arrow::ArrayStatistics is not included in the comparison. 
ARROW_EXPORT bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t start_idx, int64_t end_idx, int64_t other_start_idx, @@ -127,6 +185,16 @@ ARROW_EXPORT bool SparseTensorEquals(const SparseTensor& left, const SparseTenso ARROW_EXPORT bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata = true); +/// \brief Check two \ref arrow::ArrayStatistics for equality +/// \param[in] left an \ref arrow::ArrayStatistics +/// \param[in] right an \ref arrow::ArrayStatistics +/// \param[in] options Options used to compare double values for equality. +/// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise, +/// false. +ARROW_EXPORT bool ArrayStatisticsEquals( + const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& options = EqualOptions::Defaults()); + /// Returns true if scalars are equal /// \param[in] left a Scalar /// \param[in] right a Scalar diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index 6deb2cbad8c..6c530a76e18 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -19,21 +19,33 @@ add_custom_target(arrow-compute-tests) arrow_install_all_headers("arrow/compute") -if(ARROW_COMPUTE) - # pkg-config support - arrow_add_pkg_config("arrow-compute") -endif() - # # Unit tests # +if(ARROW_TEST_LINKAGE STREQUAL "static") + set(ARROW_COMPUTE_TEST_LINK_LIBS arrow_compute_static ${ARROW_TEST_STATIC_LINK_LIBS}) +else() + set(ARROW_COMPUTE_TEST_LINK_LIBS arrow_compute_shared ${ARROW_TEST_SHARED_LINK_LIBS}) +endif() -# Define arrow_compute_testing object library for common test files +# Define arrow_compute_core_testing object library for common test files requiring +# only core compute. No extra kernels are required. 
if(ARROW_TESTING) - add_library(arrow_compute_testing OBJECT test_util_internal.cc) + add_library(arrow_compute_core_testing OBJECT test_util_internal.cc) # Even though this is still just an object library we still need to "link" our # dependencies so that include paths are configured correctly - target_link_libraries(arrow_compute_testing PUBLIC ${ARROW_GTEST_GMOCK}) + target_link_libraries(arrow_compute_core_testing PUBLIC ${ARROW_GTEST_GMOCK}) +endif() + +# Define arrow_compute_testing object library for test files requiring extra kernels. +if(ARROW_TESTING AND ARROW_COMPUTE) + set(ARROW_COMPUTE_TESTING_SRCS test_env.cc) + add_library(arrow_compute_testing OBJECT ${ARROW_COMPUTE_TESTING_SRCS}) + # Even though this is still just an object library we still need to "link" + # arrow_compute_core_testing so that is also included correctly + target_link_libraries(arrow_compute_testing + PUBLIC $ + PUBLIC ${ARROW_GTEST_GTEST}) endif() set(ARROW_COMPUTE_TEST_PREFIX "arrow-compute") @@ -86,9 +98,54 @@ function(ADD_ARROW_COMPUTE_TEST REL_TEST_NAME) ${PREFIX} LABELS ${LABELS} + STATIC_LINK_LIBS + ${ARROW_COMPUTE_TEST_LINK_LIBS} ${ARG_UNPARSED_ARGUMENTS}) endfunction() +# This function is used to add a custom main to the benchmarks in order +# to initialize the compute kernels registry before running them. +# This is necessary for benchmarks that use compute kernels that are not +# part of libarrow. +# It will also link the compute libraries to the benchmark target. 
+function(add_arrow_compute_benchmark REL_TEST_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args EXTRA_SOURCES EXTRA_LINK_LIBS) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + if(ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "arrow-compute") + endif() + set(EXTRA_SOURCES "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/benchmark_main.cc") + if(ARG_EXTRA_SOURCES) + list(APPEND EXTRA_SOURCES ${ARG_EXTRA_SOURCES}) + endif() + if(ARROW_TEST_LINKAGE STREQUAL "static") + set(EXTRA_LINK_LIBS arrow_compute_static) + else() + set(EXTRA_LINK_LIBS arrow_compute_shared) + endif() + if(ARG_EXTRA_LINK_LIBS) + list(APPEND EXTRA_LINK_LIBS ${ARG_EXTRA_LINK_LIBS}) + endif() + add_benchmark(${REL_TEST_NAME} + PREFIX + ${PREFIX} + LABELS + "arrow-benchmarks" + EXTRA_SOURCES + ${EXTRA_SOURCES} + EXTRA_LINK_LIBS + ${EXTRA_LINK_LIBS} + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + add_arrow_test(internals_test ${ARROW_COMPUTE_TEST_ARGS} SOURCES @@ -97,7 +154,7 @@ add_arrow_test(internals_test kernel_test.cc registry_test.cc EXTRA_LINK_LIBS - arrow_compute_testing) + arrow_compute_core_testing) add_arrow_compute_test(expression_test SOURCES @@ -117,7 +174,7 @@ add_arrow_compute_test(row_test EXTRA_LINK_LIBS arrow_compute_testing) -add_arrow_benchmark(function_benchmark PREFIX "arrow-compute") +add_arrow_compute_benchmark(function_benchmark) add_subdirectory(kernels) diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h index b701d992869..343e30643cf 100644 --- a/cpp/src/arrow/compute/api.h +++ b/cpp/src/arrow/compute/api.h @@ -34,6 +34,7 @@ #include "arrow/compute/cast.h" // IWYU pragma: export #include "arrow/compute/function.h" // IWYU pragma: export #include "arrow/compute/function_options.h" // IWYU pragma: export +#include "arrow/compute/initialize.h" // IWYU pragma: export #include "arrow/compute/kernel.h" // IWYU pragma: export #include "arrow/compute/registry.h" // IWYU pragma: export 
#include "arrow/datum.h" // IWYU pragma: export diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index b2ed64dc59d..0ed5eb88b79 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -20,6 +20,7 @@ #include "arrow/compute/exec.h" #include "arrow/compute/function_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging_internal.h" diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 8930d04de53..d31e0a73156 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -48,7 +48,7 @@ class ExecContext; class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { public: explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1); - static constexpr char const kTypeName[] = "ScalarAggregateOptions"; + static constexpr const char kTypeName[] = "ScalarAggregateOptions"; static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; } /// If true (the default), null values are ignored. 
Otherwise, if any value is null, @@ -72,7 +72,7 @@ class ARROW_EXPORT CountOptions : public FunctionOptions { ALL, }; explicit CountOptions(CountMode mode = CountMode::ONLY_VALID); - static constexpr char const kTypeName[] = "CountOptions"; + static constexpr const char kTypeName[] = "CountOptions"; static CountOptions Defaults() { return CountOptions{}; } CountMode mode; @@ -85,7 +85,7 @@ class ARROW_EXPORT CountOptions : public FunctionOptions { class ARROW_EXPORT ModeOptions : public FunctionOptions { public: explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0); - static constexpr char const kTypeName[] = "ModeOptions"; + static constexpr const char kTypeName[] = "ModeOptions"; static ModeOptions Defaults() { return ModeOptions{}; } int64_t n = 1; @@ -103,7 +103,7 @@ class ARROW_EXPORT ModeOptions : public FunctionOptions { class ARROW_EXPORT VarianceOptions : public FunctionOptions { public: explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0); - static constexpr char const kTypeName[] = "VarianceOptions"; + static constexpr const char kTypeName[] = "VarianceOptions"; static VarianceOptions Defaults() { return VarianceOptions{}; } int ddof = 0; @@ -119,7 +119,7 @@ class ARROW_EXPORT SkewOptions : public FunctionOptions { public: explicit SkewOptions(bool skip_nulls = true, bool biased = true, uint32_t min_count = 0); - static constexpr char const kTypeName[] = "SkewOptions"; + static constexpr const char kTypeName[] = "SkewOptions"; static SkewOptions Defaults() { return SkewOptions{}; } /// If true (the default), null values are ignored. 
Otherwise, if any value is null, @@ -154,7 +154,7 @@ class ARROW_EXPORT QuantileOptions : public FunctionOptions { enum Interpolation interpolation = LINEAR, bool skip_nulls = true, uint32_t min_count = 0); - static constexpr char const kTypeName[] = "QuantileOptions"; + static constexpr const char kTypeName[] = "QuantileOptions"; static QuantileOptions Defaults() { return QuantileOptions{}; } /// probability level of quantile must be between 0 and 1 inclusive @@ -178,7 +178,7 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions { explicit TDigestOptions(std::vector q, uint32_t delta = 100, uint32_t buffer_size = 500, bool skip_nulls = true, uint32_t min_count = 0); - static constexpr char const kTypeName[] = "TDigestOptions"; + static constexpr const char kTypeName[] = "TDigestOptions"; static TDigestOptions Defaults() { return TDigestOptions{}; } /// probability level of quantile must be between 0 and 1 inclusive @@ -268,7 +268,7 @@ class ARROW_EXPORT PivotWiderOptions : public FunctionOptions { UnexpectedKeyBehavior unexpected_key_behavior = kIgnore); // Default constructor for serialization PivotWiderOptions(); - static constexpr char const kTypeName[] = "PivotWiderOptions"; + static constexpr const char kTypeName[] = "PivotWiderOptions"; static PivotWiderOptions Defaults() { return PivotWiderOptions{}; } /// The values expected in the pivot key column @@ -283,7 +283,7 @@ class ARROW_EXPORT IndexOptions : public FunctionOptions { explicit IndexOptions(std::shared_ptr value); // Default constructor for serialization IndexOptions(); - static constexpr char const kTypeName[] = "IndexOptions"; + static constexpr const char kTypeName[] = "IndexOptions"; std::shared_ptr value; }; diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 117426af44c..b43eca542f3 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -26,6 +26,7 @@ #include "arrow/compute/exec.h" #include 
"arrow/compute/function_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -346,6 +347,9 @@ static auto kNullOptionsType = GetFunctionOptionsType( static auto kPadOptionsType = GetFunctionOptionsType( DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding), DataMember("lean_left_on_odd_padding", &PadOptions::lean_left_on_odd_padding)); +static auto kZeroFillOptionsType = GetFunctionOptionsType( + DataMember("width", &ZeroFillOptions::width), + DataMember("padding", &ZeroFillOptions::padding)); static auto kReplaceSliceOptionsType = GetFunctionOptionsType( DataMember("start", &ReplaceSliceOptions::start), DataMember("stop", &ReplaceSliceOptions::stop), @@ -498,6 +502,13 @@ PadOptions::PadOptions(int64_t width, std::string padding, bool lean_left_on_odd PadOptions::PadOptions() : PadOptions(0, " ") {} constexpr char PadOptions::kTypeName[]; +ZeroFillOptions::ZeroFillOptions(int64_t width, std::string padding) + : FunctionOptions(internal::kZeroFillOptionsType), + width(width), + padding(std::move(padding)) {} +ZeroFillOptions::ZeroFillOptions() : ZeroFillOptions(0, "0") {} +constexpr char ZeroFillOptions::kTypeName[]; + ReplaceSliceOptions::ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement) : FunctionOptions(internal::kReplaceSliceOptionsType), @@ -701,6 +712,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kNullOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kZeroFillOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType)); 
DCHECK_OK(registry->AddFunctionOptionsType(kRoundBinaryOptionsType)); diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 492ea05f6d5..8b341e865a1 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -41,14 +41,14 @@ namespace compute { class ARROW_EXPORT ArithmeticOptions : public FunctionOptions { public: explicit ArithmeticOptions(bool check_overflow = false); - static constexpr char const kTypeName[] = "ArithmeticOptions"; + static constexpr const char kTypeName[] = "ArithmeticOptions"; bool check_overflow; }; class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions { public: explicit ElementWiseAggregateOptions(bool skip_nulls = true); - static constexpr char const kTypeName[] = "ElementWiseAggregateOptions"; + static constexpr const char kTypeName[] = "ElementWiseAggregateOptions"; static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; } bool skip_nulls; }; @@ -83,7 +83,7 @@ class ARROW_EXPORT RoundOptions : public FunctionOptions { public: explicit RoundOptions(int64_t ndigits = 0, RoundMode round_mode = RoundMode::HALF_TO_EVEN); - static constexpr char const kTypeName[] = "RoundOptions"; + static constexpr const char kTypeName[] = "RoundOptions"; static RoundOptions Defaults() { return RoundOptions(); } /// Rounding precision (number of digits to round to) int64_t ndigits; @@ -94,7 +94,7 @@ class ARROW_EXPORT RoundOptions : public FunctionOptions { class ARROW_EXPORT RoundBinaryOptions : public FunctionOptions { public: explicit RoundBinaryOptions(RoundMode round_mode = RoundMode::HALF_TO_EVEN); - static constexpr char const kTypeName[] = "RoundBinaryOptions"; + static constexpr const char kTypeName[] = "RoundBinaryOptions"; static RoundBinaryOptions Defaults() { return RoundBinaryOptions(); } /// Rounding and tie-breaking mode RoundMode round_mode; @@ -120,7 +120,7 @@ class ARROW_EXPORT RoundTemporalOptions : public FunctionOptions { 
bool week_starts_monday = true, bool ceil_is_strictly_greater = false, bool calendar_based_origin = false); - static constexpr char const kTypeName[] = "RoundTemporalOptions"; + static constexpr const char kTypeName[] = "RoundTemporalOptions"; static RoundTemporalOptions Defaults() { return RoundTemporalOptions(); } /// Number of units to round to @@ -156,7 +156,7 @@ class ARROW_EXPORT RoundToMultipleOptions : public FunctionOptions { RoundMode round_mode = RoundMode::HALF_TO_EVEN); explicit RoundToMultipleOptions(std::shared_ptr multiple, RoundMode round_mode = RoundMode::HALF_TO_EVEN); - static constexpr char const kTypeName[] = "RoundToMultipleOptions"; + static constexpr const char kTypeName[] = "RoundToMultipleOptions"; static RoundToMultipleOptions Defaults() { return RoundToMultipleOptions(); } /// Rounding scale (multiple to round to). /// @@ -182,7 +182,7 @@ class ARROW_EXPORT JoinOptions : public FunctionOptions { }; explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL, std::string null_replacement = ""); - static constexpr char const kTypeName[] = "JoinOptions"; + static constexpr const char kTypeName[] = "JoinOptions"; static JoinOptions Defaults() { return JoinOptions(); } NullHandlingBehavior null_handling; std::string null_replacement; @@ -192,7 +192,7 @@ class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { public: explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false); MatchSubstringOptions(); - static constexpr char const kTypeName[] = "MatchSubstringOptions"; + static constexpr const char kTypeName[] = "MatchSubstringOptions"; /// The exact substring (or regex, depending on kernel) to look for inside input values. 
std::string pattern; @@ -203,7 +203,7 @@ class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { class ARROW_EXPORT SplitOptions : public FunctionOptions { public: explicit SplitOptions(int64_t max_splits = -1, bool reverse = false); - static constexpr char const kTypeName[] = "SplitOptions"; + static constexpr const char kTypeName[] = "SplitOptions"; /// Maximum number of splits allowed, or unlimited when -1 int64_t max_splits; @@ -216,7 +216,7 @@ class ARROW_EXPORT SplitPatternOptions : public FunctionOptions { explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1, bool reverse = false); SplitPatternOptions(); - static constexpr char const kTypeName[] = "SplitPatternOptions"; + static constexpr const char kTypeName[] = "SplitPatternOptions"; /// The exact substring to split on. std::string pattern; @@ -230,7 +230,7 @@ class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions { public: explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement); ReplaceSliceOptions(); - static constexpr char const kTypeName[] = "ReplaceSliceOptions"; + static constexpr const char kTypeName[] = "ReplaceSliceOptions"; /// Index to start slicing at int64_t start; @@ -245,7 +245,7 @@ class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions { explicit ReplaceSubstringOptions(std::string pattern, std::string replacement, int64_t max_replacements = -1); ReplaceSubstringOptions(); - static constexpr char const kTypeName[] = "ReplaceSubstringOptions"; + static constexpr const char kTypeName[] = "ReplaceSubstringOptions"; /// Pattern to match, literal, or regular expression depending on which kernel is used std::string pattern; @@ -259,7 +259,7 @@ class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions { public: explicit ExtractRegexOptions(std::string pattern); ExtractRegexOptions(); - static constexpr char const kTypeName[] = "ExtractRegexOptions"; + static constexpr const char kTypeName[] = 
"ExtractRegexOptions"; /// Regular expression with named capture fields std::string pattern; @@ -269,7 +269,7 @@ class ARROW_EXPORT ExtractRegexSpanOptions : public FunctionOptions { public: explicit ExtractRegexSpanOptions(std::string pattern); ExtractRegexSpanOptions(); - static constexpr char const kTypeName[] = "ExtractRegexSpanOptions"; + static constexpr const char kTypeName[] = "ExtractRegexSpanOptions"; /// Regular expression with named capture fields std::string pattern; @@ -303,7 +303,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { // DEPRECATED(will be removed after removing of skip_nulls) explicit SetLookupOptions(Datum value_set, bool skip_nulls); - static constexpr char const kTypeName[] = "SetLookupOptions"; + static constexpr const char kTypeName[] = "SetLookupOptions"; /// The set of values to look up input values into. Datum value_set; @@ -330,7 +330,7 @@ class ARROW_EXPORT StructFieldOptions : public FunctionOptions { explicit StructFieldOptions(std::initializer_list); explicit StructFieldOptions(FieldRef field_ref); StructFieldOptions(); - static constexpr char const kTypeName[] = "StructFieldOptions"; + static constexpr const char kTypeName[] = "StructFieldOptions"; /// The FieldRef specifying what to extract from struct or union. FieldRef field_ref; @@ -341,7 +341,7 @@ class ARROW_EXPORT StrptimeOptions : public FunctionOptions { explicit StrptimeOptions(std::string format, TimeUnit::type unit, bool error_is_null = false); StrptimeOptions(); - static constexpr char const kTypeName[] = "StrptimeOptions"; + static constexpr const char kTypeName[] = "StrptimeOptions"; /// The desired format string. 
std::string format; @@ -356,7 +356,7 @@ class ARROW_EXPORT StrftimeOptions : public FunctionOptions { explicit StrftimeOptions(std::string format, std::string locale = "C"); StrftimeOptions(); - static constexpr char const kTypeName[] = "StrftimeOptions"; + static constexpr const char kTypeName[] = "StrftimeOptions"; static constexpr const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%S"; @@ -371,7 +371,7 @@ class ARROW_EXPORT PadOptions : public FunctionOptions { explicit PadOptions(int64_t width, std::string padding = " ", bool lean_left_on_odd_padding = true); PadOptions(); - static constexpr char const kTypeName[] = "PadOptions"; + static constexpr const char kTypeName[] = "PadOptions"; /// The desired string length. int64_t width; @@ -383,11 +383,23 @@ class ARROW_EXPORT PadOptions : public FunctionOptions { bool lean_left_on_odd_padding = true; }; +class ARROW_EXPORT ZeroFillOptions : public FunctionOptions { + public: + explicit ZeroFillOptions(int64_t width, std::string padding = "0"); + ZeroFillOptions(); + static constexpr const char kTypeName[] = "ZeroFillOptions"; + + /// The desired string length. + int64_t width; + /// What to pad the string with. Should be one codepoint (Unicode). + std::string padding; +}; + class ARROW_EXPORT TrimOptions : public FunctionOptions { public: explicit TrimOptions(std::string characters); TrimOptions(); - static constexpr char const kTypeName[] = "TrimOptions"; + static constexpr const char kTypeName[] = "TrimOptions"; /// The individual characters to be trimmed from the string. 
std::string characters; @@ -398,7 +410,7 @@ class ARROW_EXPORT SliceOptions : public FunctionOptions { explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits::max(), int64_t step = 1); SliceOptions(); - static constexpr char const kTypeName[] = "SliceOptions"; + static constexpr const char kTypeName[] = "SliceOptions"; int64_t start, stop, step; }; @@ -408,7 +420,7 @@ class ARROW_EXPORT ListSliceOptions : public FunctionOptions { int64_t step = 1, std::optional return_fixed_size_list = std::nullopt); ListSliceOptions(); - static constexpr char const kTypeName[] = "ListSliceOptions"; + static constexpr const char kTypeName[] = "ListSliceOptions"; /// The start of list slicing. int64_t start; /// Optional stop of list slicing. If not set, then slice to end. (NotImplemented) @@ -424,7 +436,7 @@ class ARROW_EXPORT ListSliceOptions : public FunctionOptions { class ARROW_EXPORT NullOptions : public FunctionOptions { public: explicit NullOptions(bool nan_is_null = false); - static constexpr char const kTypeName[] = "NullOptions"; + static constexpr const char kTypeName[] = "NullOptions"; static NullOptions Defaults() { return NullOptions{}; } bool nan_is_null; @@ -451,7 +463,7 @@ class ARROW_EXPORT MakeStructOptions : public FunctionOptions { std::vector> m); explicit MakeStructOptions(std::vector n); MakeStructOptions(); - static constexpr char const kTypeName[] = "MakeStructOptions"; + static constexpr const char kTypeName[] = "MakeStructOptions"; /// Names for wrapped columns std::vector field_names; @@ -466,7 +478,7 @@ class ARROW_EXPORT MakeStructOptions : public FunctionOptions { struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions { public: explicit DayOfWeekOptions(bool count_from_zero = true, uint32_t week_start = 1); - static constexpr char const kTypeName[] = "DayOfWeekOptions"; + static constexpr const char kTypeName[] = "DayOfWeekOptions"; static DayOfWeekOptions Defaults() { return DayOfWeekOptions(); } /// Number days from 0 if 
true and from 1 if false @@ -499,7 +511,7 @@ struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions { Ambiguous ambiguous = AMBIGUOUS_RAISE, Nonexistent nonexistent = NONEXISTENT_RAISE); AssumeTimezoneOptions(); - static constexpr char const kTypeName[] = "AssumeTimezoneOptions"; + static constexpr const char kTypeName[] = "AssumeTimezoneOptions"; /// Timezone to convert timestamps from std::string timezone; @@ -514,7 +526,7 @@ struct ARROW_EXPORT WeekOptions : public FunctionOptions { public: explicit WeekOptions(bool week_starts_monday = true, bool count_from_zero = false, bool first_week_is_fully_in_year = false); - static constexpr char const kTypeName[] = "WeekOptions"; + static constexpr const char kTypeName[] = "WeekOptions"; static WeekOptions Defaults() { return WeekOptions{}; } static WeekOptions ISODefaults() { return WeekOptions{/*week_starts_monday*/ true, @@ -543,7 +555,7 @@ struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions { explicit Utf8NormalizeOptions(Form form = NFC); static Utf8NormalizeOptions Defaults() { return Utf8NormalizeOptions(); } - static constexpr char const kTypeName[] = "Utf8NormalizeOptions"; + static constexpr const char kTypeName[] = "Utf8NormalizeOptions"; /// The Unicode normalization form to apply Form form; @@ -558,7 +570,7 @@ class ARROW_EXPORT RandomOptions : public FunctionOptions { RandomOptions(Initializer initializer, uint64_t seed); RandomOptions(); - static constexpr char const kTypeName[] = "RandomOptions"; + static constexpr const char kTypeName[] = "RandomOptions"; static RandomOptions Defaults() { return RandomOptions(); } /// The type of initialization for random number generation - system or provided seed. 
@@ -582,7 +594,7 @@ class ARROW_EXPORT MapLookupOptions : public FunctionOptions { explicit MapLookupOptions(std::shared_ptr query_key, Occurrence occurrence); MapLookupOptions(); - constexpr static char const kTypeName[] = "MapLookupOptions"; + constexpr static const char kTypeName[] = "MapLookupOptions"; /// The key to lookup in the map std::shared_ptr query_key; diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 06e6cf6c1ad..538cdccaf2b 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -30,6 +30,7 @@ #include "arrow/compute/function_internal.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/datum.h" #include "arrow/record_batch.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 69e4b243c97..b1676219b16 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -44,7 +44,7 @@ class ARROW_EXPORT FilterOptions : public FunctionOptions { }; explicit FilterOptions(NullSelectionBehavior null_selection = DROP); - static constexpr char const kTypeName[] = "FilterOptions"; + static constexpr const char kTypeName[] = "FilterOptions"; static FilterOptions Defaults() { return FilterOptions(); } NullSelectionBehavior null_selection_behavior = DROP; @@ -53,7 +53,7 @@ class ARROW_EXPORT FilterOptions : public FunctionOptions { class ARROW_EXPORT TakeOptions : public FunctionOptions { public: explicit TakeOptions(bool boundscheck = true); - static constexpr char const kTypeName[] = "TakeOptions"; + static constexpr const char kTypeName[] = "TakeOptions"; static TakeOptions BoundsCheck() { return TakeOptions(true); } static TakeOptions NoBoundsCheck() { return TakeOptions(false); } static TakeOptions Defaults() { return BoundsCheck(); } @@ -73,7 +73,7 @@ class ARROW_EXPORT 
DictionaryEncodeOptions : public FunctionOptions { }; explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK); - static constexpr char const kTypeName[] = "DictionaryEncodeOptions"; + static constexpr const char kTypeName[] = "DictionaryEncodeOptions"; static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } NullEncodingBehavior null_encoding_behavior = MASK; @@ -83,7 +83,7 @@ class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions { class ARROW_EXPORT RunEndEncodeOptions : public FunctionOptions { public: explicit RunEndEncodeOptions(std::shared_ptr run_end_type = int32()); - static constexpr char const kTypeName[] = "RunEndEncodeOptions"; + static constexpr const char kTypeName[] = "RunEndEncodeOptions"; static RunEndEncodeOptions Defaults() { return RunEndEncodeOptions(); } std::shared_ptr run_end_type; @@ -93,7 +93,7 @@ class ARROW_EXPORT ArraySortOptions : public FunctionOptions { public: explicit ArraySortOptions(SortOrder order = SortOrder::Ascending, NullPlacement null_placement = NullPlacement::AtEnd); - static constexpr char const kTypeName[] = "ArraySortOptions"; + static constexpr const char kTypeName[] = "ArraySortOptions"; static ArraySortOptions Defaults() { return ArraySortOptions(); } /// Sorting order @@ -107,7 +107,7 @@ class ARROW_EXPORT SortOptions : public FunctionOptions { explicit SortOptions(std::vector sort_keys = {}, NullPlacement null_placement = NullPlacement::AtEnd); explicit SortOptions(const Ordering& ordering); - static constexpr char const kTypeName[] = "SortOptions"; + static constexpr const char kTypeName[] = "SortOptions"; static SortOptions Defaults() { return SortOptions(); } /// Convenience constructor to create an ordering from SortOptions /// @@ -127,7 +127,7 @@ class ARROW_EXPORT SortOptions : public FunctionOptions { class ARROW_EXPORT SelectKOptions : public FunctionOptions { public: explicit SelectKOptions(int64_t k = -1, std::vector sort_keys = {}); - static 
constexpr char const kTypeName[] = "SelectKOptions"; + static constexpr const char kTypeName[] = "SelectKOptions"; static SelectKOptions Defaults() { return SelectKOptions(); } static SelectKOptions TopKDefault(int64_t k, std::vector key_names = {}) { @@ -184,7 +184,7 @@ class ARROW_EXPORT RankOptions : public FunctionOptions { Tiebreaker tiebreaker = RankOptions::First) : RankOptions({SortKey("", order)}, null_placement, tiebreaker) {} - static constexpr char const kTypeName[] = "RankOptions"; + static constexpr const char kTypeName[] = "RankOptions"; static RankOptions Defaults() { return RankOptions(); } /// Column key(s) to order by and how to order by these sort keys. @@ -205,7 +205,7 @@ class ARROW_EXPORT RankQuantileOptions : public FunctionOptions { NullPlacement null_placement = NullPlacement::AtEnd) : RankQuantileOptions({SortKey("", order)}, null_placement) {} - static constexpr char const kTypeName[] = "RankQuantileOptions"; + static constexpr const char kTypeName[] = "RankQuantileOptions"; static RankQuantileOptions Defaults() { return RankQuantileOptions(); } /// Column key(s) to order by and how to order by these sort keys. @@ -220,7 +220,7 @@ class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { explicit PartitionNthOptions(int64_t pivot, NullPlacement null_placement = NullPlacement::AtEnd); PartitionNthOptions() : PartitionNthOptions(0) {} - static constexpr char const kTypeName[] = "PartitionNthOptions"; + static constexpr const char kTypeName[] = "PartitionNthOptions"; /// The index into the equivalent sorted array of the partition pivot element. 
int64_t pivot; @@ -232,7 +232,7 @@ class ARROW_EXPORT WinsorizeOptions : public FunctionOptions { public: WinsorizeOptions(double lower_limit, double upper_limit); WinsorizeOptions() : WinsorizeOptions(0, 1) {} - static constexpr char const kTypeName[] = "WinsorizeOptions"; + static constexpr const char kTypeName[] = "WinsorizeOptions"; /// The quantile below which all values are replaced with the quantile's value. /// @@ -254,7 +254,7 @@ class ARROW_EXPORT CumulativeOptions : public FunctionOptions { explicit CumulativeOptions(bool skip_nulls = false); explicit CumulativeOptions(double start, bool skip_nulls = false); explicit CumulativeOptions(std::shared_ptr start, bool skip_nulls = false); - static constexpr char const kTypeName[] = "CumulativeOptions"; + static constexpr const char kTypeName[] = "CumulativeOptions"; static CumulativeOptions Defaults() { return CumulativeOptions(); } /// Optional starting value for cumulative operation computation, default depends on the @@ -276,7 +276,7 @@ using CumulativeSumOptions = CumulativeOptions; // For backward compatibility class ARROW_EXPORT PairwiseOptions : public FunctionOptions { public: explicit PairwiseOptions(int64_t periods = 1); - static constexpr char const kTypeName[] = "PairwiseOptions"; + static constexpr const char kTypeName[] = "PairwiseOptions"; static PairwiseOptions Defaults() { return PairwiseOptions(); } /// Periods to shift for applying the binary operation, accepts negative values. 
@@ -287,7 +287,7 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { public: explicit ListFlattenOptions(bool recursive = false); - static constexpr char const kTypeName[] = "ListFlattenOptions"; + static constexpr const char kTypeName[] = "ListFlattenOptions"; static ListFlattenOptions Defaults() { return ListFlattenOptions(); } /// \brief If true, the list is flattened recursively until a non-list @@ -300,7 +300,7 @@ class ARROW_EXPORT InversePermutationOptions : public FunctionOptions { public: explicit InversePermutationOptions(int64_t max_index = -1, std::shared_ptr output_type = NULLPTR); - static constexpr char const kTypeName[] = "InversePermutationOptions"; + static constexpr const char kTypeName[] = "InversePermutationOptions"; static InversePermutationOptions Defaults() { return InversePermutationOptions(); } /// \brief The max value in the input indices to allow. The length of the function's @@ -319,7 +319,7 @@ class ARROW_EXPORT InversePermutationOptions : public FunctionOptions { class ARROW_EXPORT ScatterOptions : public FunctionOptions { public: explicit ScatterOptions(int64_t max_index = -1); - static constexpr char const kTypeName[] = "ScatterOptions"; + static constexpr const char kTypeName[] = "ScatterOptions"; static ScatterOptions Defaults() { return ScatterOptions(); } /// \brief The max value in the input indices to allow. The length of the function's diff --git a/cpp/src/arrow/compute/benchmark_main.cc b/cpp/src/arrow/compute/benchmark_main.cc new file mode 100644 index 00000000000..2c54d697824 --- /dev/null +++ b/cpp/src/arrow/compute/benchmark_main.cc @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include "arrow/compute/initialize.h" +#include "arrow/testing/gtest_util.h" + +int main(int argc, char** argv) { + // Initialize compute functions before any benchmarks run + ABORT_NOT_OK(arrow::compute::Initialize()); + + // Initialize and run benchmarks + ::benchmark::Initialize(&argc, argv); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + return 0; +} diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index 268873788de..4a8bca3f1d2 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -31,6 +31,7 @@ #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/logging_internal.h" #include "arrow/util/reflection_internal.h" diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index 18e56092dda..ec5818239ac 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -45,7 +45,7 @@ class ARROW_EXPORT CastOptions : public FunctionOptions { public: explicit CastOptions(bool safe = true); - static constexpr char const kTypeName[] = "CastOptions"; + static constexpr const char kTypeName[] = "CastOptions"; static CastOptions Safe(TypeHolder 
to_type = {}) { CastOptions safe(true); safe.to_type = std::move(to_type); diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index 597fdaf997b..3c2ec100402 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -219,7 +219,7 @@ void PrintTo(const Expression& expr, std::ostream* os) { } bool Expression::Equals(const Expression& other) const { - if (Identical(*this, other)) return true; + if (Expression::Identical(*this, other)) return true; if (impl_ == nullptr || other.impl_ == nullptr) return false; @@ -260,7 +260,9 @@ bool Expression::Equals(const Expression& other) const { return false; } -bool Identical(const Expression& l, const Expression& r) { return l.impl_ == r.impl_; } +bool Expression::Identical(const Expression& l, const Expression& r) { + return l.impl_ == r.impl_; +} size_t Expression::hash() const { if (auto lit = literal()) { @@ -543,67 +545,61 @@ Result BindNonRecursive(Expression::Call call, bool insert_implicit_ std::vector types = GetTypes(call.arguments); ARROW_ASSIGN_OR_RAISE(call.function, GetFunction(call, exec_context)); - auto FinishBind = [&] { - compute::KernelContext kernel_context(exec_context, call.kernel); - if (call.kernel->init) { - const FunctionOptions* options = - call.options ? 
call.options.get() : call.function->default_options(); - ARROW_ASSIGN_OR_RAISE( - call.kernel_state, - call.kernel->init(&kernel_context, {call.kernel, types, options})); - - kernel_context.SetState(call.kernel_state.get()); - } - - ARROW_ASSIGN_OR_RAISE( - call.type, call.kernel->signature->out_type().Resolve(&kernel_context, types)); - return Status::OK(); - }; - // First try and bind exactly Result maybe_exact_match = call.function->DispatchExact(types); if (maybe_exact_match.ok()) { call.kernel = *maybe_exact_match; - if (FinishBind().ok()) { - return Expression(std::move(call)); + } else { + if (!insert_implicit_casts) { + return maybe_exact_match.status(); } - } - if (!insert_implicit_casts) { - return maybe_exact_match.status(); - } + // If exact binding fails, and we are allowed to cast, then prefer casting literals + // first. Since DispatchBest generally prefers up-casting the best way to do this is + // first down-cast the literals as much as possible + types = GetTypesWithSmallestLiteralRepresentation(call.arguments); + ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&types)); - // If exact binding fails, and we are allowed to cast, then prefer casting literals - // first. 
Since DispatchBest generally prefers up-casting the best way to do this is - // first down-cast the literals as much as possible - types = GetTypesWithSmallestLiteralRepresentation(call.arguments); - ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&types)); + for (size_t i = 0; i < types.size(); ++i) { + if (types[i] == call.arguments[i].type()) continue; - for (size_t i = 0; i < types.size(); ++i) { - if (types[i] == call.arguments[i].type()) continue; + if (const Datum* lit = call.arguments[i].literal()) { + ARROW_ASSIGN_OR_RAISE(Datum new_lit, + compute::Cast(*lit, types[i].GetSharedPtr())); + call.arguments[i] = literal(std::move(new_lit)); + continue; + } - if (const Datum* lit = call.arguments[i].literal()) { - ARROW_ASSIGN_OR_RAISE(Datum new_lit, compute::Cast(*lit, types[i].GetSharedPtr())); - call.arguments[i] = literal(std::move(new_lit)); - continue; - } + // construct an implicit cast Expression with which to replace this argument + Expression::Call implicit_cast; + implicit_cast.function_name = "cast"; + implicit_cast.arguments = {std::move(call.arguments[i])}; - // construct an implicit cast Expression with which to replace this argument - Expression::Call implicit_cast; - implicit_cast.function_name = "cast"; - implicit_cast.arguments = {std::move(call.arguments[i])}; + // TODO(wesm): Use TypeHolder in options + implicit_cast.options = std::make_shared( + compute::CastOptions::Safe(types[i].GetSharedPtr())); - // TODO(wesm): Use TypeHolder in options - implicit_cast.options = std::make_shared( - compute::CastOptions::Safe(types[i].GetSharedPtr())); + ARROW_ASSIGN_OR_RAISE( + call.arguments[i], + BindNonRecursive(std::move(implicit_cast), + /*insert_implicit_casts=*/false, exec_context)); + } + } + compute::KernelContext kernel_context(exec_context, call.kernel); + if (call.kernel->init) { + const FunctionOptions* options = + call.options ? 
call.options.get() : call.function->default_options(); ARROW_ASSIGN_OR_RAISE( - call.arguments[i], - BindNonRecursive(std::move(implicit_cast), - /*insert_implicit_casts=*/false, exec_context)); + call.kernel_state, + call.kernel->init(&kernel_context, {call.kernel, types, options})); + + kernel_context.SetState(call.kernel_state.get()); } - RETURN_NOT_OK(FinishBind()); + ARROW_ASSIGN_OR_RAISE( + call.type, call.kernel->signature->out_type().Resolve(&kernel_context, types)); + return Expression(std::move(call)); } @@ -793,8 +789,7 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, types, options})); compute::detail::DatumAccumulator listener; - RETURN_NOT_OK( - executor->Execute(ExecBatch(std::move(arguments), input_length), &listener)); + RETURN_NOT_OK(executor->Execute(ExecBatch(arguments, input_length), &listener)); const auto out = executor->WrapResults(arguments, listener.values()); #ifndef NDEBUG DCHECK_OK(executor->CheckResultType(out, call->function_name.c_str())); @@ -1265,6 +1260,15 @@ struct Inequality { auto options = checked_pointer_cast(is_in_call->options); + // The maximum number of values in the is_in expression set of values + // in order to use the simplification. 
+ // If the set is large there are performance implications, see: + // https://github.com/apache/arrow/issues/46777 + constexpr int16_t kIsInSimplificationMaxValueSet = 50; + if (options->value_set.length() > kIsInSimplificationMaxValueSet) { + return std::nullopt; + } + const auto& lhs = Comparison::StripOrderPreservingCasts(is_in_call->arguments[0]); if (!lhs.field_ref()) return std::nullopt; if (*lhs.field_ref() != guarantee.target) return std::nullopt; @@ -1452,7 +1456,7 @@ Result SimplifyWithGuarantee(Expression expr, return inequality->Simplify(std::move(expr)); })); - if (Identical(simplified, expr)) continue; + if (Expression::Identical(simplified, expr)) continue; expr = std::move(simplified); RETURN_NOT_OK(CanonicalizeAndFoldConstants()); @@ -1463,7 +1467,7 @@ Result SimplifyWithGuarantee(Expression expr, auto simplified, SimplifyIsValidGuarantee(std::move(expr), *CallNotNull(guarantee))); - if (Identical(simplified, expr)) continue; + if (Expression::Identical(simplified, expr)) continue; expr = std::move(simplified); RETURN_NOT_OK(CanonicalizeAndFoldConstants()); diff --git a/cpp/src/arrow/compute/expression.h b/cpp/src/arrow/compute/expression.h index 9a36a6d3368..b8ce50675c8 100644 --- a/cpp/src/arrow/compute/expression.h +++ b/cpp/src/arrow/compute/expression.h @@ -132,11 +132,11 @@ class ARROW_EXPORT Expression { explicit Expression(Datum literal); explicit Expression(Parameter parameter); + static bool Identical(const Expression& l, const Expression& r); + private: using Impl = std::variant; std::shared_ptr impl_; - - ARROW_FRIEND_EXPORT friend bool Identical(const Expression& l, const Expression& r); }; inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); } diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index dcc0f2e5c0d..bbab57feebb 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -67,8 +67,6 @@ const 
std::shared_ptr kBoringSchema = schema({ field("ts_s_utc", timestamp(TimeUnit::SECOND, "UTC")), }); -#define EXPECT_OK ARROW_EXPECT_OK - Expression cast(Expression argument, std::shared_ptr to_type) { return call("cast", {std::move(argument)}, compute::CastOptions::Safe(std::move(to_type))); @@ -82,6 +80,16 @@ Expression add(Expression l, Expression r) { return call("add", {std::move(l), std::move(r)}); } +std::string make_range_json(int start, int end) { + std::string result = "["; + for (int i = start; i <= end; ++i) { + if (i > start) result += ","; + result += std::to_string(i); + } + result += "]"; + return result; +} + const auto no_change = std::nullopt; TEST(ExpressionUtils, Comparison) { @@ -614,6 +622,45 @@ TEST(Expression, BindCall) { add(cast(field_ref("i32"), float32()), literal(3.5F))); } +static Status RegisterInvalidInit() { + const std::string name = "invalid_init"; + struct CastableFunction : public ScalarFunction { + using ScalarFunction::ScalarFunction; + + Result DispatchBest(std::vector* types) const override { + return Status::Invalid("Shouldn't call DispatchBest on this function"); + } + }; + auto func = + std::make_shared(name, Arity::Unary(), FunctionDoc::Empty()); + + auto func_exec = [](KernelContext*, const ExecSpan&, ExecResult*) -> Status { + return Status::OK(); + }; + auto func_init = [](KernelContext*, + const KernelInitArgs&) -> Result> { + return Status::Invalid("Invalid Init"); + }; + + ScalarKernel kernel({int64()}, int64(), func_exec, func_init); + ARROW_RETURN_NOT_OK(func->AddKernel(kernel)); + + auto registry = GetFunctionRegistry(); + ARROW_RETURN_NOT_OK(registry->AddFunction(std::move(func))); + + return Status::OK(); +} + +// GH-47268: The bad status in call binding is discarded. 
+TEST(Expression, BindCallError) { + ASSERT_OK(RegisterInvalidInit()); + auto expr = call("invalid_init", {field_ref("i64")}); + EXPECT_FALSE(expr.IsBound()); + + ASSERT_RAISES_WITH_MESSAGE(Invalid, "Invalid: Invalid Init", + expr.Bind(*kBoringSchema).status()); +} + TEST(Expression, BindWithAliasCasts) { auto fm = GetFunctionRegistry(); EXPECT_OK(fm->AddAlias("alias_cast", "cast")); @@ -626,20 +673,78 @@ TEST(Expression, BindWithAliasCasts) { } TEST(Expression, BindWithDecimalArithmeticOps) { - for (std::string arith_op : {"add", "subtract", "multiply", "divide"}) { - auto expr = call(arith_op, {field_ref("d1"), field_ref("d2")}); - EXPECT_FALSE(expr.IsBound()); - - static const std::vector> scales = {{3, 9}, {6, 6}, {9, 3}}; - for (auto s : scales) { - auto schema = arrow::schema( - {field("d1", decimal256(30, s.first)), field("d2", decimal256(20, s.second))}); - ExpectBindsTo(expr, no_change, &expr, *schema); + static const std::vector> scales = {{3, 9}, {6, 6}, {9, 3}}; + + for (std::string suffix : {"", "_checked"}) { + for (std::string arith_op : {"add", "subtract", "multiply", "divide"}) { + std::string name = arith_op + suffix; + SCOPED_TRACE(name); + + for (auto s : scales) { + auto schema = arrow::schema({field("d1", decimal256(30, s.first)), + field("d2", decimal256(20, s.second))}); + auto expr = call(name, {field_ref("d1"), field_ref("d2")}); + EXPECT_FALSE(expr.IsBound()); + ExpectBindsTo(expr, no_change, &expr, *schema); + } } } } +TEST(Expression, BindWithDecimalDivision) { + auto expect_decimal_division_type = [](std::string name, + std::shared_ptr dividend, + std::shared_ptr divisor, + std::shared_ptr expected) { + auto schema = arrow::schema({field("dividend", dividend), field("divisor", divisor)}); + auto expr = call(name, {field_ref("dividend"), field_ref("divisor")}); + ASSERT_OK_AND_ASSIGN(auto bound, expr.Bind(*schema)); + EXPECT_TRUE(bound.IsBound()); + EXPECT_TRUE(bound.type()->Equals(expected)); + }; + + for (std::string name : {"divide", 
"divide_checked"}) { + SCOPED_TRACE(name); + + expect_decimal_division_type(name, int64(), arrow::decimal128(1, 0), + decimal128(23, 4)); + expect_decimal_division_type(name, arrow::decimal128(1, 0), int64(), + decimal128(21, 20)); + + expect_decimal_division_type(name, decimal128(2, 1), decimal128(2, 1), + decimal128(6, 4)); + expect_decimal_division_type(name, decimal256(2, 1), decimal256(2, 1), + decimal256(6, 4)); + expect_decimal_division_type(name, decimal128(2, 1), decimal256(2, 1), + decimal256(6, 4)); + expect_decimal_division_type(name, decimal256(2, 1), decimal128(2, 1), + decimal256(6, 4)); + + expect_decimal_division_type(name, decimal128(2, 0), decimal128(2, 1), + decimal128(7, 4)); + expect_decimal_division_type(name, decimal128(2, 1), decimal128(2, 0), + decimal128(5, 4)); + + // GH-39875: Expression call to decimal(3 ,2) / decimal(15, 2) wrong result type. + // decimal128(3, 2) / decimal128(15, 2) + // -> decimal128(19, 18) / decimal128(15, 2) = decimal128(19, 16) + expect_decimal_division_type(name, decimal128(3, 2), decimal128(15, 2), + decimal128(19, 16)); + + // GH-40911: Expression call to decimal(7 ,2) / decimal(6, 1) wrong result type. 
+ // decimal128(7, 2) / decimal128(6, 1) + // -> decimal128(14, 9) / decimal128(6, 1) = decimal128(14, 8) + expect_decimal_division_type(name, decimal128(7, 2), decimal128(6, 1), + decimal128(14, 8)); + } +} + TEST(Expression, BindWithImplicitCasts) { + auto exciting_schema = schema( + {field("i64", int64()), field("dec128_3_2", decimal128(3, 2)), + field("dec128_4_2", decimal128(4, 2)), field("dec128_5_3", decimal128(5, 3)), + field("dec256_3_2", decimal256(3, 2)), field("dec256_4_2", decimal256(4, 2)), + field("dec256_5_3", decimal256(5, 3))}); for (auto cmp : {equal, not_equal, less, less_equal, greater, greater_equal}) { // cast arguments to common numeric type ExpectBindsTo(cmp(field_ref("i64"), field_ref("i32")), @@ -700,6 +805,82 @@ TEST(Expression, BindWithImplicitCasts) { ExpectBindsTo(cmp(field_ref("i32"), literal(std::make_shared(10.0))), cmp(cast(field_ref("i32"), float32()), literal(std::make_shared(10.0f)))); + + // decimal int + ExpectBindsTo(cmp(field_ref("dec128_3_2"), field_ref("i64")), + cmp(field_ref("dec128_3_2"), cast(field_ref("i64"), decimal128(21, 2))), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(cmp(field_ref("i64"), field_ref("dec128_3_2")), + cmp(cast(field_ref("i64"), decimal128(21, 2)), field_ref("dec128_3_2")), + /*bound_out=*/nullptr, *exciting_schema); + + // decimal decimal with different widths different precisions but same scale + ExpectBindsTo( + cmp(field_ref("dec128_3_2"), field_ref("dec256_4_2")), + cmp(cast(field_ref("dec128_3_2"), decimal256(3, 2)), field_ref("dec256_4_2")), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo( + cmp(field_ref("dec256_4_2"), field_ref("dec128_3_2")), + cmp(field_ref("dec256_4_2"), cast(field_ref("dec128_3_2"), decimal256(3, 2))), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo( + cmp(field_ref("dec128_4_2"), field_ref("dec256_3_2")), + cmp(cast(field_ref("dec128_4_2"), decimal256(4, 2)), field_ref("dec256_3_2")), + /*bound_out=*/nullptr, *exciting_schema); 
+ ExpectBindsTo( + cmp(field_ref("dec256_3_2"), field_ref("dec128_4_2")), + cmp(field_ref("dec256_3_2"), cast(field_ref("dec128_4_2"), decimal256(4, 2))), + /*bound_out=*/nullptr, *exciting_schema); + + // decimal decimal with different widths different scales + ExpectBindsTo( + cmp(field_ref("dec128_3_2"), field_ref("dec256_5_3")), + cmp(cast(field_ref("dec128_3_2"), decimal256(4, 3)), field_ref("dec256_5_3")), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo( + cmp(field_ref("dec256_5_3"), field_ref("dec128_3_2")), + cmp(field_ref("dec256_5_3"), cast(field_ref("dec128_3_2"), decimal256(4, 3))), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(cmp(field_ref("dec128_5_3"), field_ref("dec256_3_2")), + cmp(cast(field_ref("dec128_5_3"), decimal256(5, 3)), + cast(field_ref("dec256_3_2"), decimal256(4, 3))), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(cmp(field_ref("dec256_3_2"), field_ref("dec128_5_3")), + cmp(cast(field_ref("dec256_3_2"), decimal256(4, 3)), + cast(field_ref("dec128_5_3"), decimal256(5, 3))), + /*bound_out=*/nullptr, *exciting_schema); + + // decimal decimal with same width same precision but different scales (no cast) + ExpectBindsTo(cmp(field_ref("dec128_3_2"), field_ref("dec128_4_2")), + cmp(field_ref("dec128_3_2"), field_ref("dec128_4_2")), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(cmp(field_ref("dec128_4_2"), field_ref("dec128_3_2")), + cmp(field_ref("dec128_4_2"), field_ref("dec128_3_2")), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(cmp(field_ref("dec256_3_2"), field_ref("dec256_4_2")), + cmp(field_ref("dec256_3_2"), field_ref("dec256_4_2")), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(cmp(field_ref("dec256_4_2"), field_ref("dec256_3_2")), + cmp(field_ref("dec256_4_2"), field_ref("dec256_3_2")), + /*bound_out=*/nullptr, *exciting_schema); + + // decimal decimal with same width but different scales + ExpectBindsTo( + cmp(field_ref("dec128_3_2"), 
field_ref("dec128_5_3")), + cmp(cast(field_ref("dec128_3_2"), decimal128(4, 3)), field_ref("dec128_5_3")), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo( + cmp(field_ref("dec128_5_3"), field_ref("dec128_3_2")), + cmp(field_ref("dec128_5_3"), cast(field_ref("dec128_3_2"), decimal128(4, 3))), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo( + cmp(field_ref("dec256_3_2"), field_ref("dec256_5_3")), + cmp(cast(field_ref("dec256_3_2"), decimal256(4, 3)), field_ref("dec256_5_3")), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo( + cmp(field_ref("dec256_5_3"), field_ref("dec256_3_2")), + cmp(field_ref("dec256_5_3"), cast(field_ref("dec256_3_2"), decimal256(4, 3))), + /*bound_out=*/nullptr, *exciting_schema); } compute::SetLookupOptions in_a{ArrayFromJSON(utf8(), R"(["a"])")}; @@ -709,6 +890,51 @@ TEST(Expression, BindWithImplicitCasts) { call("is_in", {cast(field_ref("dict_str"), utf8())}, in_a)); } +TEST(Expression, BindWithImplicitCastsForCaseWhenOnDecimal) { + auto exciting_schema = schema( + {field("a", struct_({field("", boolean())})), + field("dec128_20_3", decimal128(20, 3)), field("dec128_21_3", decimal128(21, 3)), + field("dec128_20_1", decimal128(20, 1)), field("dec128_21_1", decimal128(21, 1)), + field("dec256_20_3", decimal256(20, 3)), field("dec256_21_3", decimal256(21, 3)), + field("dec256_20_1", decimal256(20, 1)), field("dec256_21_1", decimal256(21, 1))}); + ExpectBindsTo(call("case_when", {field_ref("a"), field_ref("dec128_20_3"), + field_ref("dec128_21_3")}), + call("case_when", + {field_ref("a"), cast(field_ref("dec128_20_3"), decimal128(21, 3)), + field_ref("dec128_21_3")}), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(call("case_when", {field_ref("a"), field_ref("dec128_20_1"), + field_ref("dec128_21_3")}), + call("case_when", + {field_ref("a"), cast(field_ref("dec128_20_1"), decimal128(22, 3)), + cast(field_ref("dec128_21_3"), decimal128(22, 3))}), + /*bound_out=*/nullptr, *exciting_schema); + 
ExpectBindsTo(call("case_when", {field_ref("a"), field_ref("dec128_20_3"), + field_ref("dec128_21_1")}), + call("case_when", + {field_ref("a"), cast(field_ref("dec128_20_3"), decimal128(23, 3)), + cast(field_ref("dec128_21_1"), decimal128(23, 3))}), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(call("case_when", {field_ref("a"), field_ref("dec128_20_3"), + field_ref("dec256_21_3")}), + call("case_when", + {field_ref("a"), cast(field_ref("dec128_20_3"), decimal256(21, 3)), + field_ref("dec256_21_3")}), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(call("case_when", {field_ref("a"), field_ref("dec256_20_1"), + field_ref("dec128_21_3")}), + call("case_when", + {field_ref("a"), cast(field_ref("dec256_20_1"), decimal256(22, 3)), + cast(field_ref("dec128_21_3"), decimal256(22, 3))}), + /*bound_out=*/nullptr, *exciting_schema); + ExpectBindsTo(call("case_when", {field_ref("a"), field_ref("dec256_20_3"), + field_ref("dec256_21_1")}), + call("case_when", + {field_ref("a"), cast(field_ref("dec256_20_3"), decimal256(23, 3)), + cast(field_ref("dec256_21_1"), decimal256(23, 3))}), + /*bound_out=*/nullptr, *exciting_schema); +} + TEST(Expression, BindNestedCall) { auto expr = add(field_ref("a"), call("subtract", {call("multiply", {field_ref("b"), field_ref("c")}), @@ -945,13 +1171,13 @@ TEST(Expression, ExecuteChunkedArray) { ExecBatch batch{inputs, 3}; ASSERT_OK_AND_ASSIGN(Datum res, ExecuteScalarExpression(expr, batch)); + ASSERT_TRUE(res.is_chunked_array()); - AssertDatumsEqual(res, ArrayFromJSON(float64(), - R"([ + AssertDatumsEqual(res, ChunkedArrayFromJSON(float64(), {R"([ 9.5, 1, 3.75 - ])")); + ])"})); } TEST(Expression, ExecuteDictionaryTransparent) { @@ -992,7 +1218,7 @@ TEST(Expression, ExecuteDictionaryTransparent) { void ExpectIdenticalIfUnchanged(Expression modified, Expression original) { if (modified == original) { // no change -> must be identical - EXPECT_TRUE(Identical(modified, original)) << " " << original.ToString(); + 
EXPECT_TRUE(Expression::Identical(modified, original)) << " " << original.ToString(); } } @@ -1681,6 +1907,16 @@ TEST(Expression, SimplifyIsIn) { Simplify{is_in(field_ref("u32"), int64(), "[1,3,5,7,9]", null_matching)} .WithGuarantee(greater(field_ref("u32"), literal(3))) .Expect(is_in(field_ref("u32"), int64(), "[5,7,9]", null_matching)); + + Simplify{is_in(field_ref("u32"), int64(), make_range_json(1, 40), null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(10))) + .Expect(is_in(field_ref("u32"), int64(), make_range_json(11, 40), null_matching)); + + // For large ranges we don't do any simplification, see + // `kIsInSimplificationMaxValueSet` in expression.cc. + Simplify{is_in(field_ref("u32"), int64(), make_range_json(1, 100), null_matching)} + .WithGuarantee(greater(field_ref("u32"), literal(3))) + .ExpectUnchanged(); } Simplify{ diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index d7842f14eff..b0b12a690f8 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -410,14 +410,15 @@ Status Function::Validate() const { } Status ScalarFunction::AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init) { + ArrayKernelExec exec, KernelInit init, + std::shared_ptr constraint) { RETURN_NOT_OK(CheckArity(in_types.size())); if (arity_.is_varargs && in_types.size() != 1) { return Status::Invalid("VarArgs signatures must have exactly one input type"); } - auto sig = - KernelSignature::Make(std::move(in_types), std::move(out_type), arity_.is_varargs); + auto sig = KernelSignature::Make(std::move(in_types), std::move(out_type), + arity_.is_varargs, std::move(constraint)); kernels_.emplace_back(std::move(sig), exec, init); return Status::OK(); } diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 2b86f642166..399081e2a73 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -308,7 +308,8 @@ 
class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { /// initialization, preallocation for fixed-width types, and default null /// handling (intersect validity bitmaps of inputs). Status AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init = NULLPTR); + ArrayKernelExec exec, KernelInit init = NULLPTR, + std::shared_ptr constraint = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. diff --git a/cpp/src/arrow/compute/initialize.cc b/cpp/src/arrow/compute/initialize.cc new file mode 100644 index 00000000000..d88835da04a --- /dev/null +++ b/cpp/src/arrow/compute/initialize.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "arrow/compute/initialize.h" + +#include "arrow/compute/registry_internal.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/status.h" + +namespace arrow::compute { +namespace { + +Status RegisterComputeKernels() { + auto registry = GetFunctionRegistry(); + + // Register additional kernels on libarrow_compute + // Scalar functions + internal::RegisterScalarArithmetic(registry); + internal::RegisterScalarBoolean(registry); + internal::RegisterScalarComparison(registry); + internal::RegisterScalarIfElse(registry); + internal::RegisterScalarNested(registry); + internal::RegisterScalarRandom(registry); // Nullary + internal::RegisterScalarRoundArithmetic(registry); + internal::RegisterScalarSetLookup(registry); + internal::RegisterScalarStringAscii(registry); + internal::RegisterScalarStringUtf8(registry); + internal::RegisterScalarTemporalBinary(registry); + internal::RegisterScalarTemporalUnary(registry); + internal::RegisterScalarValidity(registry); + + // Vector functions + internal::RegisterVectorArraySort(registry); + internal::RegisterVectorCumulativeSum(registry); + internal::RegisterVectorNested(registry); + internal::RegisterVectorRank(registry); + internal::RegisterVectorReplace(registry); + internal::RegisterVectorSelectK(registry); + internal::RegisterVectorSort(registry); + internal::RegisterVectorRunEndEncode(registry); + internal::RegisterVectorRunEndDecode(registry); + internal::RegisterVectorPairwise(registry); + internal::RegisterVectorStatistics(registry); + + // Aggregate functions + internal::RegisterHashAggregateBasic(registry); + internal::RegisterHashAggregateNumeric(registry); + internal::RegisterHashAggregatePivot(registry); + internal::RegisterScalarAggregateBasic(registry); + internal::RegisterScalarAggregateMode(registry); + internal::RegisterScalarAggregatePivot(registry); + internal::RegisterScalarAggregateQuantile(registry); + internal::RegisterScalarAggregateTDigest(registry); + 
internal::RegisterScalarAggregateVariance(registry); + + return Status::OK(); +} + +} // namespace + +Status Initialize() { + static auto st = RegisterComputeKernels(); + return st; +} + +} // namespace arrow::compute diff --git a/cpp/src/arrow/compute/initialize.h b/cpp/src/arrow/compute/initialize.h new file mode 100644 index 00000000000..db5e231325b --- /dev/null +++ b/cpp/src/arrow/compute/initialize.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/compute/visibility.h" +#include "arrow/status.h" + +namespace arrow::compute { + +/// \brief Initialize the compute module. +/// +/// Register the compute kernel functions to be available on the +/// global FunctionRegistry. +/// This function will only be available if ARROW_COMPUTE is enabled. 
+ARROW_COMPUTE_EXPORT Status Initialize(); + +} // namespace arrow::compute diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc index 4211593adf8..addbb29edd2 100644 --- a/cpp/src/arrow/compute/kernel.cc +++ b/cpp/src/arrow/compute/kernel.cc @@ -475,23 +475,69 @@ std::string OutputType::ToString() const { return "computed"; } +// ---------------------------------------------------------------------- +// MatchConstraint + +std::shared_ptr MatchConstraint::Make( + std::function&)> matches) { + class FunctionMatchConstraint : public MatchConstraint { + public: + explicit FunctionMatchConstraint( + std::function&)> matches) + : matches_(std::move(matches)) {} + + bool Matches(const std::vector& types) const override { + return matches_(types); + } + + private: + std::function&)> matches_; + }; + + return std::make_shared(std::move(matches)); +} + +std::shared_ptr DecimalsHaveSameScale() { + class DecimalsHaveSameScaleConstraint : public MatchConstraint { + public: + bool Matches(const std::vector& types) const override { + DCHECK_GE(types.size(), 2); + DCHECK(std::all_of(types.begin(), types.end(), + [](const TypeHolder& type) { return is_decimal(type.id()); })); + const auto& ty0 = checked_cast(*types[0].type); + auto s0 = ty0.scale(); + for (size_t i = 1; i < types.size(); ++i) { + const auto& ty = checked_cast(*types[i].type); + if (ty.scale() != s0) { + return false; + } + } + return true; + } + }; + static auto instance = std::make_shared(); + return instance; +} + // ---------------------------------------------------------------------- // KernelSignature KernelSignature::KernelSignature(std::vector in_types, OutputType out_type, - bool is_varargs) + bool is_varargs, + std::shared_ptr constraint) : in_types_(std::move(in_types)), out_type_(std::move(out_type)), is_varargs_(is_varargs), + constraint_(std::move(constraint)), hash_code_(0) { DCHECK(!is_varargs || (is_varargs && (in_types_.size() >= 1))); } -std::shared_ptr 
KernelSignature::Make(std::vector in_types, - OutputType out_type, - bool is_varargs) { +std::shared_ptr KernelSignature::Make( + std::vector in_types, OutputType out_type, bool is_varargs, + std::shared_ptr constraint) { return std::make_shared(std::move(in_types), std::move(out_type), - is_varargs); + is_varargs, std::move(constraint)); } bool KernelSignature::Equals(const KernelSignature& other) const { @@ -526,6 +572,9 @@ bool KernelSignature::MatchesInputs(const std::vector& types) const } } } + if (constraint_ && !constraint_->Matches(types)) { + return false; + } return true; } diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index cfb6265f129..0d4f9d6ff43 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -348,7 +348,24 @@ class ARROW_EXPORT OutputType { Resolver resolver_ = NULLPTR; }; -/// \brief Holds the input types and output type of the kernel. +/// \brief Additional constraints to apply to the input types of a kernel when matching a +/// specific kernel signature. +class ARROW_EXPORT MatchConstraint { + public: + virtual ~MatchConstraint() = default; + + /// \brief Return true if the input types satisfy the constraint. + virtual bool Matches(const std::vector& types) const = 0; + + /// \brief Convenience function to create a MatchConstraint from a match function. + static std::shared_ptr Make( + std::function&)> matches); +}; + +/// \brief Constraint that all input types are decimal types and have the same scale. +ARROW_EXPORT std::shared_ptr DecimalsHaveSameScale(); + +/// \brief Holds the input types, optional match constraint and output type of the kernel. /// /// VarArgs functions with minimum N arguments should pass up to N input types to be /// used to validate the input types of a function invocation. 
The first N-1 types @@ -357,15 +374,16 @@ class ARROW_EXPORT OutputType { class ARROW_EXPORT KernelSignature { public: KernelSignature(std::vector in_types, OutputType out_type, - bool is_varargs = false); + bool is_varargs = false, + std::shared_ptr constraint = NULLPTR); /// \brief Convenience ctor since make_shared can be awkward - static std::shared_ptr Make(std::vector in_types, - OutputType out_type, - bool is_varargs = false); + static std::shared_ptr Make( + std::vector in_types, OutputType out_type, bool is_varargs = false, + std::shared_ptr constraint = NULLPTR); - /// \brief Return true if the signature if compatible with the list of input - /// value descriptors. + /// \brief Return true if the signature is compatible with the list of input + /// value descriptors and satisfies the match constraint, if any. bool MatchesInputs(const std::vector& types) const; /// \brief Returns true if the input types of each signature are @@ -401,6 +419,7 @@ class ARROW_EXPORT KernelSignature { std::vector in_types_; OutputType out_type_; bool is_varargs_; + std::shared_ptr constraint_; // For caching the hash code after it's computed the first time mutable uint64_t hash_code_; diff --git a/cpp/src/arrow/compute/kernel_test.cc b/cpp/src/arrow/compute/kernel_test.cc index e9664b104d7..9317ae7a42d 100644 --- a/cpp/src/arrow/compute/kernel_test.cc +++ b/cpp/src/arrow/compute/kernel_test.cc @@ -307,6 +307,40 @@ TEST(OutputType, Resolve) { ASSERT_EQ(result, int32()); } +// ---------------------------------------------------------------------- +// MatchConstraint + +TEST(MatchConstraint, ConvenienceMaker) { + { + auto always_match = + MatchConstraint::Make([](const std::vector& types) { return true; }); + + ASSERT_TRUE(always_match->Matches({})); + ASSERT_TRUE(always_match->Matches({int8(), int16(), int32()})); + } + + { + auto always_false = + MatchConstraint::Make([](const std::vector& types) { return false; }); + + ASSERT_FALSE(always_false->Matches({})); + 
ASSERT_FALSE(always_false->Matches({int8(), int16(), int32()})); + } +} + +TEST(MatchConstraint, DecimalsHaveSameScale) { + auto c = DecimalsHaveSameScale(); + constexpr int32_t precision = 12, scale = 2; + ASSERT_TRUE(c->Matches({decimal128(precision, scale), decimal128(precision, scale)})); + ASSERT_TRUE(c->Matches({decimal128(precision, scale), decimal256(precision, scale)})); + ASSERT_TRUE(c->Matches({decimal256(precision, scale), decimal128(precision, scale)})); + ASSERT_TRUE(c->Matches({decimal256(precision, scale), decimal256(precision, scale)})); + ASSERT_FALSE( + c->Matches({decimal128(precision, scale), decimal128(precision, scale + 1)})); + ASSERT_FALSE(c->Matches({decimal128(precision, scale), decimal128(precision, scale), + decimal128(precision, scale + 1)})); +} + // ---------------------------------------------------------------------- // KernelSignature @@ -419,6 +453,34 @@ TEST(KernelSignature, VarArgsMatchesInputs) { } } +TEST(KernelSignature, MatchesInputsWithConstraint) { + auto precisions = {12, 22}, scales = {2, 3}; + for (auto p1 : precisions) { + for (auto s1 : scales) { + auto d1 = decimal128(p1, s1); + for (auto p2 : precisions) { + for (auto s2 : scales) { + auto d2 = decimal128(p2, s2); + + { + // No constraint. + KernelSignature sig_no_constraint({Type::DECIMAL128, Type::DECIMAL128}, + boolean()); + ASSERT_TRUE(sig_no_constraint.MatchesInputs({d1, d2})); + } + + { + // All decimal types must have the same scale. 
+ KernelSignature sig({Type::DECIMAL128, Type::DECIMAL128}, boolean(), + /*is_varargs=*/false, DecimalsHaveSameScale()); + ASSERT_EQ(sig.MatchesInputs({d1, d2}), s1 == s2); + } + } + } + } + } +} + TEST(KernelSignature, ToString) { std::vector in_types = {InputType(int8()), InputType(Type::DECIMAL), InputType(utf8())}; diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 81b7adeb4aa..15955b5ef88 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +arrow_install_all_headers("arrow/compute/kernels") + # ---------------------------------------------------------------------- # Tests that don't require the full kernel library @@ -32,7 +34,7 @@ add_arrow_test(scalar_cast_test scalar_cast_test.cc EXTRA_LINK_LIBS arrow_compute_kernels_testing - arrow_compute_testing) + arrow_compute_core_testing) # ---------------------------------------------------------------------- # Scalar kernels @@ -82,17 +84,18 @@ add_arrow_compute_test(scalar_utility_test arrow_compute_kernels_testing arrow_compute_testing) -add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_cast_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_if_else_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_list_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_random_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_round_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(scalar_temporal_benchmark PREFIX "arrow-compute") +# The 
following benchmarks require compute kernels initialization. +add_arrow_compute_benchmark(scalar_arithmetic_benchmark) +add_arrow_compute_benchmark(scalar_boolean_benchmark) +add_arrow_compute_benchmark(scalar_compare_benchmark) +add_arrow_compute_benchmark(scalar_if_else_benchmark) +add_arrow_compute_benchmark(scalar_list_benchmark) +add_arrow_compute_benchmark(scalar_random_benchmark) +add_arrow_compute_benchmark(scalar_round_benchmark) +add_arrow_compute_benchmark(scalar_set_lookup_benchmark) +add_arrow_compute_benchmark(scalar_string_benchmark) +add_arrow_compute_benchmark(scalar_temporal_benchmark) # ---------------------------------------------------------------------- # Vector kernels @@ -133,11 +136,12 @@ add_arrow_compute_test(vector_swizzle_test arrow_compute_testing) add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(vector_partition_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(vector_topk_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(vector_replace_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(vector_selection_benchmark PREFIX "arrow-compute") +# The following benchmarks require compute kernels initialization. 
+add_arrow_compute_benchmark(vector_sort_benchmark) +add_arrow_compute_benchmark(vector_partition_benchmark) +add_arrow_compute_benchmark(vector_topk_benchmark) +add_arrow_compute_benchmark(vector_replace_benchmark) +add_arrow_compute_benchmark(vector_selection_benchmark) # ---------------------------------------------------------------------- # Aggregate kernels diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 68b1ac7c03c..03fba53ac02 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -18,8 +18,10 @@ #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_basic_internal.h" #include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/cpu_info.h" #include "arrow/util/hashing.h" @@ -367,9 +369,9 @@ struct ProductImpl : public ScalarAggregator { Status Finalize(KernelContext*, Datum* out) override { if ((!options.skip_nulls && this->nulls_observed) || (this->count < options.min_count)) { - out->value = std::make_shared(out_type); + out->value = std::make_shared(this->out_type); } else { - out->value = std::make_shared(this->product, out_type); + out->value = std::make_shared(this->product, this->out_type); } return Status::OK(); } @@ -851,6 +853,8 @@ void AddBasicAggKernels(KernelInit init, } } +namespace { + void AddScalarAggKernels(KernelInit init, const std::vector>& types, std::shared_ptr out_ty, @@ -870,16 +874,12 @@ void AddArrayScalarAggKernels(KernelInit init, AddScalarAggKernels(init, types, out_ty, func); } -namespace { - Result MinMaxType(KernelContext*, const std::vector& types) { // T -> struct auto ty = types.front().GetSharedPtr(); return struct_({field("min", ty), field("max", 
ty)}); } -} // namespace - Result FirstLastType(KernelContext*, const std::vector& types) { auto ty = types.front().GetSharedPtr(); return struct_({field("first", ty), field("last", ty)}); @@ -899,6 +899,8 @@ void AddFirstLastKernels(KernelInit init, } } +} // namespace + void AddMinMaxKernel(KernelInit init, internal::detail::GetTypeId get_id, ScalarAggregateFunction* func, SimdLevel::type simd_level) { auto sig = KernelSignature::Make({InputType(get_id.id)}, MinMaxType); @@ -1048,10 +1050,10 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { func = std::make_shared("sum", Arity::Unary(), sum_doc, &default_scalar_aggregate_options); AddArrayScalarAggKernels(SumInit, {boolean()}, uint64(), func.get()); - AddAggKernel(KernelSignature::Make({Type::DECIMAL128}, FirstType), SumInit, func.get(), - SimdLevel::NONE); - AddAggKernel(KernelSignature::Make({Type::DECIMAL256}, FirstType), SumInit, func.get(), - SimdLevel::NONE); + AddAggKernel(KernelSignature::Make({Type::DECIMAL128}, MaxPrecisionDecimalType), + SumInit, func.get(), SimdLevel::NONE); + AddAggKernel(KernelSignature::Make({Type::DECIMAL256}, MaxPrecisionDecimalType), + SumInit, func.get(), SimdLevel::NONE); AddArrayScalarAggKernels(SumInit, SignedIntTypes(), int64(), func.get()); AddArrayScalarAggKernels(SumInit, UnsignedIntTypes(), uint64(), func.get()); AddArrayScalarAggKernels(SumInit, FloatingPointTypes(), float64(), func.get()); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc index 49010d182cd..7f2bce4063d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc @@ -95,9 +95,9 @@ struct SumImpl : public ScalarAggregator { Status Finalize(KernelContext*, Datum* out) override { if ((!options.skip_nulls && this->nulls_observed) || (this->count < options.min_count)) { - out->value = std::make_shared(out_type); + out->value = 
std::make_shared(this->out_type); } else { - out->value = std::make_shared(this->sum, out_type); + out->value = std::make_shared(this->sum, this->out_type); } return Status::OK(); } @@ -168,6 +168,14 @@ struct SumLikeInit { const ScalarAggregateOptions& options) : ctx(ctx), type(type), options(options) {} + // If this returns true, then the aggregator will promote a decimal to the maximum + // precision for that type. For instance, a decimal128(3, 2) will be promoted to a + // decimal128(38, 2) + // + // TODO: Ideally this should be configurable via the function options with an enum + // PrecisionPolicy { PROMOTE_TO_MAX, DEMOTE_TO_DOUBLE, NO_PROMOTION } + virtual bool PromoteDecimal() const { return true; } + Status Visit(const DataType&) { return Status::NotImplemented("No sum implemented"); } Status Visit(const HalfFloatType&) { @@ -187,10 +195,18 @@ struct SumLikeInit { return Status::OK(); } + // By default, we widen the decimal to max precision for SumLikes + // However, this may not be the desired behaviour (see, e.g., MeanKernelInit) template enable_if_decimal Visit(const Type&) { - state.reset(new KernelClass(type, options)); - return Status::OK(); + if (PromoteDecimal()) { + ARROW_ASSIGN_OR_RAISE(auto ty, WidenDecimalToMaxPrecision(type)); + state.reset(new KernelClass(ty, options)); + return Status::OK(); + } else { + state.reset(new KernelClass(type, options)); + return Status::OK(); + } } virtual Status Visit(const NullType&) { @@ -275,6 +291,8 @@ struct MeanKernelInit : public SumLikeInit { const ScalarAggregateOptions& options) : SumLikeInit(ctx, type, options) {} + bool PromoteDecimal() const override { return false; } + Status Visit(const NullType&) override { this->state.reset(new NullSumImpl(this->options)); return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index e9723cef7b0..fbafa663b86 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ 
b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -23,6 +23,7 @@ #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/result.h" #include "arrow/stl_allocator.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/compute/kernels/aggregate_pivot.cc b/cpp/src/arrow/compute/kernels/aggregate_pivot.cc index f4b8f5ea0b6..504c7cdd26d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_pivot.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_pivot.cc @@ -19,6 +19,7 @@ #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/pivot_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/scalar.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/logging_internal.h" diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index 4355c32cfa2..4e3894d2f20 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -22,6 +22,7 @@ #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/stl_allocator.h" #include "arrow/util/logging_internal.h" diff --git a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc index 83d01091b3c..7ebc8594113 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc @@ -18,8 +18,9 @@ #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include 
"arrow/util/bit_run_reader.h" -#include "arrow/util/tdigest.h" +#include "arrow/util/tdigest_internal.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index d821fc7e2c5..171aa17cc8b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -495,51 +495,67 @@ TEST_F(TestSumKernelRoundOff, Basics) { } TEST(TestDecimalSumKernel, SimpleSum) { - for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { + std::vector> init_types = {decimal128(3, 2), + decimal256(3, 2)}; + std::vector> out_types = {decimal128(38, 2), + decimal256(76, 2)}; + + for (size_t i = 0; i < init_types.size(); ++i) { + const auto& ty = init_types[i]; + const auto& out_ty = out_types[i]; + EXPECT_THAT(Sum(ArrayFromJSON(ty, R"([])")), - ResultWith(ScalarFromJSON(ty, R"(null)"))); + ResultWith(ScalarFromJSON(out_ty, R"(null)"))); EXPECT_THAT(Sum(ArrayFromJSON(ty, R"([null])")), - ResultWith(ScalarFromJSON(ty, R"(null)"))); + ResultWith(ScalarFromJSON(out_ty, R"(null)"))); EXPECT_THAT( Sum(ArrayFromJSON(ty, R"(["0.00", "1.01", "2.02", "3.03", "4.04", "5.05"])")), - ResultWith(ScalarFromJSON(ty, R"("15.15")"))); + ResultWith(ScalarFromJSON(out_ty, R"("15.15")"))); Datum chunks = ChunkedArrayFromJSON(ty, {R"(["0.00", "1.01", "2.02", "3.03", "4.04", "5.05"])"}); - EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(ty, R"("15.15")"))); + EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(out_ty, R"("15.15")"))); chunks = ChunkedArrayFromJSON( ty, {R"(["0.00", "1.01", "2.02"])", R"(["3.03", "4.04", "5.05"])"}); - EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(ty, R"("15.15")"))); + EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(out_ty, R"("15.15")"))); chunks = ChunkedArrayFromJSON( ty, {R"(["0.00", "1.01", "2.02"])", "[]", R"(["3.03", "4.04", "5.05"])"}); - EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(ty, R"("15.15")"))); + 
EXPECT_THAT(Sum(chunks), ResultWith(ScalarFromJSON(out_ty, R"("15.15")"))); ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0); EXPECT_THAT(Sum(ArrayFromJSON(ty, R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"("0.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("0.00")"))); EXPECT_THAT(Sum(ArrayFromJSON(ty, R"([null])"), options), - ResultWith(ScalarFromJSON(ty, R"("0.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("0.00")"))); chunks = ChunkedArrayFromJSON(ty, {}); - EXPECT_THAT(Sum(chunks, options), ResultWith(ScalarFromJSON(ty, R"("0.00")"))); + EXPECT_THAT(Sum(chunks, options), ResultWith(ScalarFromJSON(out_ty, R"("0.00")"))); EXPECT_THAT( Sum(ArrayFromJSON(ty, R"(["1.01", null, "3.03", null, "5.05", null, "7.07"])"), options), - ResultWith(ScalarFromJSON(ty, R"("16.16")"))); + ResultWith(ScalarFromJSON(out_ty, R"("16.16")"))); EXPECT_THAT(Sum(ScalarFromJSON(ty, R"("5.05")")), - ResultWith(ScalarFromJSON(ty, R"("5.05")"))); + ResultWith(ScalarFromJSON(out_ty, R"("5.05")"))); EXPECT_THAT(Sum(ScalarFromJSON(ty, R"(null)")), - ResultWith(ScalarFromJSON(ty, R"(null)"))); + ResultWith(ScalarFromJSON(out_ty, R"(null)"))); EXPECT_THAT(Sum(ScalarFromJSON(ty, R"(null)"), options), - ResultWith(ScalarFromJSON(ty, R"("0.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("0.00")"))); } } TEST(TestDecimalSumKernel, ScalarAggregateOptions) { - for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { - Datum null = ScalarFromJSON(ty, R"(null)"); - Datum zero = ScalarFromJSON(ty, R"("0.00")"); - Datum result = ScalarFromJSON(ty, R"("14.14")"); + std::vector> init_types = {decimal128(3, 2), + decimal256(3, 2)}; + std::vector> out_types = {decimal128(38, 2), + decimal256(76, 2)}; + + for (size_t i = 0; i < init_types.size(); ++i) { + auto& ty = init_types[i]; + auto& out_ty = out_types[i]; + + Datum null = ScalarFromJSON(out_ty, R"(null)"); + Datum zero = ScalarFromJSON(out_ty, R"("0.00")"); + Datum result = ScalarFromJSON(out_ty, R"("14.14")"); 
Datum arr = ArrayFromJSON(ty, R"(["1.01", null, "3.03", null, "3.03", null, "7.07"])"); @@ -579,7 +595,7 @@ TEST(TestDecimalSumKernel, ScalarAggregateOptions) { EXPECT_THAT(Sum(ScalarFromJSON(ty, R"("5.05")"), ScalarAggregateOptions(/*skip_nulls=*/false)), - ResultWith(ScalarFromJSON(ty, R"("5.05")"))); + ResultWith(ScalarFromJSON(out_ty, R"("5.05")"))); EXPECT_THAT(Sum(ScalarFromJSON(ty, R"("5.05")"), ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/2)), ResultWith(null)); @@ -712,49 +728,64 @@ TYPED_TEST(TestNumericProductKernel, ScalarAggregateOptions) { } TEST(TestDecimalProductKernel, SimpleProduct) { - for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { - Datum null = ScalarFromJSON(ty, R"(null)"); + std::vector> init_types = {decimal128(3, 2), + decimal256(3, 2)}; + std::vector> out_types = {decimal128(3, 2), decimal256(3, 2)}; + + for (size_t i = 0; i < init_types.size(); ++i) { + auto& ty = init_types[i]; + auto& out_ty = out_types[i]; + + Datum null = ScalarFromJSON(out_ty, R"(null)"); EXPECT_THAT(Product(ArrayFromJSON(ty, R"([])")), ResultWith(null)); EXPECT_THAT(Product(ArrayFromJSON(ty, R"([null])")), ResultWith(null)); EXPECT_THAT( Product(ArrayFromJSON(ty, R"(["0.00", "1.00", "2.00", "3.00", "4.00", "5.00"])")), - ResultWith(ScalarFromJSON(ty, R"("0.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("0.00")"))); Datum chunks = ChunkedArrayFromJSON(ty, {R"(["1.00", "2.00", "3.00", "4.00", "5.00"])"}); - EXPECT_THAT(Product(chunks), ResultWith(ScalarFromJSON(ty, R"("120.00")"))); + EXPECT_THAT(Product(chunks), ResultWith(ScalarFromJSON(out_ty, R"("120.00")"))); chunks = ChunkedArrayFromJSON(ty, {R"(["1.00", "2.00"])", R"(["-3.00", "4.00", "5.00"])"}); - EXPECT_THAT(Product(chunks), ResultWith(ScalarFromJSON(ty, R"("-120.00")"))); + EXPECT_THAT(Product(chunks), ResultWith(ScalarFromJSON(out_ty, R"("-120.00")"))); chunks = ChunkedArrayFromJSON( ty, {R"(["1.00", "2.00"])", R"([])", R"(["-3.00", "4.00", "-5.00"])"}); - 
EXPECT_THAT(Product(chunks), ResultWith(ScalarFromJSON(ty, R"("120.00")"))); + EXPECT_THAT(Product(chunks), ResultWith(ScalarFromJSON(out_ty, R"("120.00")"))); const ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0); EXPECT_THAT(Product(ArrayFromJSON(ty, R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"("1.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("1.00")"))); EXPECT_THAT(Product(ArrayFromJSON(ty, R"([null])"), options), - ResultWith(ScalarFromJSON(ty, R"("1.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("1.00")"))); chunks = ChunkedArrayFromJSON(ty, {}); - EXPECT_THAT(Product(chunks, options), ResultWith(ScalarFromJSON(ty, R"("1.00")"))); + EXPECT_THAT(Product(chunks, options), + ResultWith(ScalarFromJSON(out_ty, R"("1.00")"))); EXPECT_THAT(Product(ArrayFromJSON( ty, R"(["1.00", null, "-3.00", null, "3.00", null, "7.00"])"), options), - ResultWith(ScalarFromJSON(ty, R"("-63.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("-63.00")"))); EXPECT_THAT(Product(ScalarFromJSON(ty, R"("5.00")")), - ResultWith(ScalarFromJSON(ty, R"("5.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("5.00")"))); EXPECT_THAT(Product(null), ResultWith(null)); } } TEST(TestDecimalProductKernel, ScalarAggregateOptions) { - for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { - Datum null = ScalarFromJSON(ty, R"(null)"); - Datum one = ScalarFromJSON(ty, R"("1.00")"); - Datum result = ScalarFromJSON(ty, R"("63.00")"); + std::vector> init_types = {decimal128(3, 2), + decimal256(3, 2)}; + std::vector> out_types = {decimal128(3, 2), decimal256(3, 2)}; + + for (size_t i = 0; i < init_types.size(); ++i) { + auto& ty = init_types[i]; + auto& out_ty = out_types[i]; + + Datum null = ScalarFromJSON(out_ty, R"(null)"); + Datum one = ScalarFromJSON(out_ty, R"("1.00")"); + Datum result = ScalarFromJSON(out_ty, R"("63.00")"); Datum empty = ArrayFromJSON(ty, R"([])"); Datum null_arr = ArrayFromJSON(ty, R"([null])"); @@ -806,7 +837,7 @@ TEST(TestDecimalProductKernel, 
ScalarAggregateOptions) { EXPECT_THAT(Product(ScalarFromJSON(ty, R"("5.00")"), ScalarAggregateOptions(/*skip_nulls=*/false)), - ResultWith(ScalarFromJSON(ty, R"("5.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("5.00")"))); EXPECT_THAT(Product(ScalarFromJSON(ty, R"("5.00")"), ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/2)), ResultWith(null)); @@ -1336,94 +1367,116 @@ TYPED_TEST(TestRandomNumericMeanKernel, RandomArrayMeanOverflow) { TEST(TestDecimalMeanKernel, SimpleMean) { ScalarAggregateOptions options(/*skip_nulls=*/true, /*min_count=*/0); + std::vector> init_types = {decimal128(3, 2), + decimal256(3, 2)}; + std::vector> out_types = {decimal128(3, 2), decimal256(3, 2)}; + + for (size_t i = 0; i < init_types.size(); ++i) { + auto& ty = init_types[i]; + auto& out_ty = out_types[i]; - for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { // Decimal doesn't have NaN EXPECT_THAT(Mean(ArrayFromJSON(ty, R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"(null)"))); + ResultWith(ScalarFromJSON(out_ty, R"(null)"))); EXPECT_THAT(Mean(ArrayFromJSON(ty, R"([null])"), options), - ResultWith(ScalarFromJSON(ty, R"(null)"))); + ResultWith(ScalarFromJSON(out_ty, R"(null)"))); EXPECT_THAT(Mean(ArrayFromJSON(ty, R"([])")), - ResultWith(ScalarFromJSON(ty, R"(null)"))); + ResultWith(ScalarFromJSON(out_ty, R"(null)"))); EXPECT_THAT(Mean(ArrayFromJSON(ty, R"([null])")), - ResultWith(ScalarFromJSON(ty, R"(null)"))); + ResultWith(ScalarFromJSON(out_ty, R"(null)"))); EXPECT_THAT(Mean(ArrayFromJSON(ty, R"(["1.01", null, "1.01"])")), - ResultWith(ScalarFromJSON(ty, R"("1.01")"))); + ResultWith(ScalarFromJSON(out_ty, R"("1.01")"))); // Check rounding EXPECT_THAT( Mean(ArrayFromJSON( ty, R"(["1.01", "2.02", "3.03", "4.04", "5.05", "6.06", "7.07", "8.08"])")), // 4.545 unrounded - ResultWith(ScalarFromJSON(ty, R"("4.55")"))); + ResultWith(ScalarFromJSON(out_ty, R"("4.55")"))); EXPECT_THAT( Mean(ArrayFromJSON( ty, R"(["-1.01", "-2.02", "-3.03", "-4.04", "-5.05", 
"-6.06", "-7.07", "-8.08"])")), // -4.545 unrounded - ResultWith(ScalarFromJSON(ty, R"("-4.55")"))); + ResultWith(ScalarFromJSON(out_ty, R"("-4.55")"))); EXPECT_THAT( Mean(ArrayFromJSON( ty, R"(["1.01", "2.02", "3.00", "4.04", "5.05", "6.06", "7.07", "8.08"])")), // 4.54125 unrounded - ResultWith(ScalarFromJSON(ty, R"("4.54")"))); + ResultWith(ScalarFromJSON(out_ty, R"("4.54")"))); EXPECT_THAT( Mean(ArrayFromJSON( ty, R"(["-1.01", "-2.02", "-3.00", "-4.04", "-5.05", "-6.06", "-7.07", "-8.08"])")), // -4.54125 unrounded - ResultWith(ScalarFromJSON(ty, R"("-4.54")"))); + ResultWith(ScalarFromJSON(out_ty, R"("-4.54")"))); EXPECT_THAT( Mean(ArrayFromJSON( ty, R"(["0.00", "0.00", "0.00", "0.00", "0.00", "0.00", "0.00", "0.00"])")), - ResultWith(ScalarFromJSON(ty, R"("0.00")"))); + ResultWith(ScalarFromJSON(out_ty, R"("0.00")"))); EXPECT_THAT( Mean(ArrayFromJSON( ty, R"(["1.01", "1.01", "1.01", "1.01", "1.01", "1.01", "1.01", "1.01"])")), - ResultWith(ScalarFromJSON(ty, R"("1.01")"))); + ResultWith(ScalarFromJSON(out_ty, R"("1.01")"))); EXPECT_THAT(Mean(ScalarFromJSON(ty, R"("5.05")")), - ResultWith(ScalarFromJSON(ty, R"("5.05")"))); + ResultWith(ScalarFromJSON(out_ty, R"("5.05")"))); EXPECT_THAT(Mean(ScalarFromJSON(ty, R"(null)")), - ResultWith(ScalarFromJSON(ty, R"(null)"))); + ResultWith(ScalarFromJSON(out_ty, R"(null)"))); } - for (const auto& ty : {decimal128(3, -2), decimal256(3, -2)}) { + init_types = {decimal128(3, -2), decimal256(3, -2)}; + out_types = {decimal128(3, -2), decimal256(3, -2)}; + + for (size_t i = 0; i < init_types.size(); ++i) { + auto& ty = init_types[i]; + auto& out_ty = out_types[i]; + // Check rounding + // + // N.B. 
In what follows, the additional Cast is due to the implementation of + // DecimalScalarFromJSON, which will try to construct a decimal with too big precision EXPECT_THAT( Mean(DecimalArrayFromJSON( ty, R"(["101E2", "202E2", "303E2", "404E2", "505E2", "606E2", "707E2", "808E2"])")), // 45450 unrounded - ResultWith(DecimalScalarFromJSON(ty, R"("455E2")"))); + ResultWith(Cast(DecimalScalarFromJSON(ty, R"("455E2")"), out_ty))); EXPECT_THAT( Mean(DecimalArrayFromJSON( ty, R"(["-101E2", "-202E2", "-303E2", "-404E2", "-505E2", "-606E2", "-707E2", "-808E2"])")), // -45450 unrounded - ResultWith(DecimalScalarFromJSON(ty, R"("-455E2")"))); + ResultWith(Cast(DecimalScalarFromJSON(ty, R"("-455E2")"), out_ty))); EXPECT_THAT( Mean(DecimalArrayFromJSON( ty, R"(["101E2", "202E2", "300E2", "404E2", "505E2", "606E2", "707E2", "808E2"])")), // 45412.5 unrounded - ResultWith(DecimalScalarFromJSON(ty, R"("454E2")"))); + ResultWith(Cast(DecimalScalarFromJSON(ty, R"("454E2")"), out_ty))); EXPECT_THAT( Mean(DecimalArrayFromJSON( ty, R"(["-101E2", "-202E2", "-300E2", "-404E2", "-505E2", "-606E2", "-707E2", "-808E2"])")), // -45412.5 unrounded - ResultWith(DecimalScalarFromJSON(ty, R"("-454E2")"))); + ResultWith(Cast(DecimalScalarFromJSON(ty, R"("-454E2")"), out_ty))); } } TEST(TestDecimalMeanKernel, ScalarAggregateOptions) { - for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) { - Datum result = ScalarFromJSON(ty, R"("3.03")"); - Datum null = ScalarFromJSON(ty, R"(null)"); + std::vector> init_types = {decimal128(3, 2), + decimal256(3, 2)}; + std::vector> out_types = {decimal128(3, 2), decimal256(3, 2)}; + + for (size_t i = 0; i < init_types.size(); ++i) { + auto& ty = init_types[i]; + auto& out_ty = out_types[i]; + + Datum result = ScalarFromJSON(out_ty, R"("3.03")"); + Datum null = ScalarFromJSON(out_ty, R"(null)"); Datum arr = ArrayFromJSON(ty, R"(["1.01", null, "2.02", "2.02", null, "7.07"])"); EXPECT_THAT(Mean(ArrayFromJSON(ty, "[]"), @@ -1481,8 +1534,8 @@ 
TEST(TestDecimalMeanKernel, ScalarAggregateOptions) { EXPECT_THAT(Mean(ScalarFromJSON(ty, R"("5.05")"), ScalarAggregateOptions(/*skip_nulls=*/false)), - ResultWith(ScalarFromJSON(ty, R"("5.05")"))); - EXPECT_THAT(Mean(ScalarFromJSON(ty, R"("5.05")"), + ResultWith(ScalarFromJSON(out_ty, R"("5.05")"))); + EXPECT_THAT(Mean(ScalarFromJSON(out_ty, R"("5.05")"), ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/2)), ResultWith(null)); EXPECT_THAT(Mean(null, ScalarAggregateOptions(/*skip_nulls=*/false)), diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc index 021ca712c51..9a43f188647 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -23,6 +23,7 @@ #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/aggregate_var_std_internal.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int128_internal.h" diff --git a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h index 26c44a8ff07..960ba59892f 100644 --- a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h +++ b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h @@ -24,7 +24,9 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/int_util_overflow.h" +#include "arrow/util/logging.h" #include "arrow/util/macros.h" namespace arrow { @@ -34,6 +36,7 @@ using internal::DivideWithOverflow; using internal::MultiplyWithOverflow; using internal::NegateWithOverflow; using internal::SubtractWithOverflow; +using util::Float16; namespace compute { namespace internal { @@ -470,6 +473,11 @@ struct Negate { return -arg; } + template + static constexpr 
enable_if_half_float_value Call(KernelContext*, Arg arg, Status*) { + return -arg; + } + template static constexpr enable_if_unsigned_integer_value Call(KernelContext*, Arg arg, Status*) { @@ -517,6 +525,12 @@ struct NegateChecked { return -arg; } + template + static constexpr enable_if_half_float_value Call(KernelContext*, Arg arg, Status*) { + static_assert(std::is_same::value, ""); + return -arg; + } + template static constexpr enable_if_decimal_value Call(KernelContext*, Arg arg, Status*) { @@ -633,6 +647,15 @@ struct Sign { return std::isnan(arg) ? arg : ((arg == 0) ? 0 : (std::signbit(arg) ? -1 : 1)); } + template + static constexpr enable_if_half_float_value Call(KernelContext*, Arg arg, + Status*) { + return arg.is_nan() + ? arg + : (arg.is_zero() ? Float16::zero() + : (arg.signbit() ? -Float16::one() : Float16::one())); + } + template static constexpr enable_if_unsigned_integer_value Call(KernelContext*, Arg arg, Status*) { @@ -705,7 +728,13 @@ struct Identity; template <> struct Identity { template - static constexpr Value value{0}; + static constexpr Value value() { + if constexpr (std::is_same_v) { + return Float16::zero(); + } else { + return 0; + } + } }; template <> @@ -714,7 +743,13 @@ struct Identity : Identity {}; template <> struct Identity { template - static constexpr Value value{1}; + static constexpr Value value() { + if constexpr (std::is_same_v) { + return Float16::one(); + } else { + return 1; + } + } }; template <> @@ -723,13 +758,17 @@ struct Identity : Identity {}; template <> struct Identity { template - static constexpr Value value{std::numeric_limits::min()}; + static constexpr Value value() { + return std::numeric_limits::min(); + } }; template <> struct Identity { template - static constexpr Value value{std::numeric_limits::max()}; + static constexpr Value value() { + return std::numeric_limits::max(); + } }; } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h 
b/cpp/src/arrow/compute/kernels/chunked_internal.h index 5bc8233016f..330bd185f25 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -27,6 +27,7 @@ #include "arrow/chunk_resolver.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/util/span.h" +#include "arrow/util/visibility.h" namespace arrow::compute::internal { @@ -120,11 +121,11 @@ class ChunkedArrayResolver { } }; -std::vector GetArrayPointers(const ArrayVector& arrays); +ARROW_EXPORT std::vector GetArrayPointers(const ArrayVector& arrays); // A class that turns logical (linear) indices into physical (chunked) indices, // and vice-versa. -class ChunkedIndexMapper { +class ARROW_EXPORT ChunkedIndexMapper { public: ChunkedIndexMapper(const std::vector& chunks, uint64_t* indices_begin, uint64_t* indices_end) diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 5f3efedec35..10ed9344d97 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -77,6 +77,11 @@ Result ListValuesType(KernelContext* ctx, return value_type; } +Result MaxPrecisionDecimalType(KernelContext*, + const std::vector& args) { + return WidenDecimalToMaxPrecision(args[0].GetSharedPtr()); +} + void EnsureDictionaryDecoded(std::vector* types) { EnsureDictionaryDecoded(types->data(), types->size()); } @@ -528,6 +533,28 @@ Status CastDecimalArgs(TypeHolder* begin, size_t count) { return Status::OK(); } +Result> WidenDecimalToMaxPrecision( + std::shared_ptr type) { + DCHECK(is_decimal(type->id())); + auto cast_type = checked_pointer_cast(type); + switch (type->id()) { + case Type::DECIMAL32: + return Decimal32Type::Make(Decimal32Type::kMaxPrecision, cast_type->scale()); + case Type::DECIMAL64: + return Decimal64Type::Make(Decimal64Type::kMaxPrecision, cast_type->scale()); + case Type::DECIMAL128: + return 
Decimal128Type::Make(Decimal128Type::kMaxPrecision, cast_type->scale()); + case Type::DECIMAL256: + return Decimal256Type::Make(Decimal256Type::kMaxPrecision, cast_type->scale()); + default: + DCHECK(false) << "An unknown DecimalType was passed to WidenDecimalToMaxPrecision: " + << type->ToString(); + return Status::TypeError( + "An unknown DecimalType was passed to WidenDecimalToMaxPrecision: " + + type->ToString()); + } +} + bool HasDecimal(const std::vector& types) { for (const auto& th : types) { if (is_decimal(th.id())) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 94677de9440..15a946fbdbb 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -44,6 +44,7 @@ #include "arrow/util/bitmap_writer.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -55,11 +56,13 @@ using internal::BinaryBitBlockCounter; using internal::BitBlockCount; using internal::BitmapReader; using internal::checked_cast; +using internal::checked_pointer_cast; using internal::FirstTimeBitmapWriter; using internal::GenerateBitsUnrolled; using internal::VisitBitBlocks; using internal::VisitBitBlocksVoid; using internal::VisitTwoBitBlocksVoid; +using util::Float16; namespace compute { namespace internal { @@ -131,6 +134,14 @@ struct GetViewType> { static T LogicalValue(PhysicalType value) { return value; } }; +template <> +struct GetViewType { + using T = Float16; + using PhysicalType = uint16_t; + + static T LogicalValue(PhysicalType value) { return T::FromBits(value); } +}; + template struct GetViewType::value || is_fixed_size_binary_type::value || @@ -197,6 +208,11 @@ struct GetOutputType> { using T = typename Type::c_type; }; +template <> +struct GetOutputType { + using T = Float16; +}; + template struct 
GetOutputType::value>> { using T = std::string; @@ -257,6 +273,9 @@ using enable_if_floating_value = enable_if_t::value, R template using enable_if_not_floating_value = enable_if_t::value, R>; +template +using enable_if_half_float_value = enable_if_t, R>; + template using enable_if_decimal_value = enable_if_t::value || std::is_same::value || @@ -285,6 +304,15 @@ struct ArrayIterator> { T operator()() { return *values++; } }; +template <> +struct ArrayIterator { + using T = Float16; + const T* values; + + explicit ArrayIterator(const ArraySpan& arr) : values(arr.GetValues(1)) {} + T operator()() { return *values++; } +}; + template struct ArrayIterator> { BitmapReader reader; @@ -381,6 +409,14 @@ struct UnboxScalar> { } }; +template <> +struct UnboxScalar { + using T = Float16; + static T Unbox(const Scalar& val) { + return T(checked_cast(val).value); + } +}; + template struct UnboxScalar> { using T = std::string_view; @@ -474,10 +510,14 @@ static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& ar // ---------------------------------------------------------------------- // Reusable type resolvers -Result FirstType(KernelContext*, const std::vector& types); -Result LastType(KernelContext*, const std::vector& types); -Result ListValuesType(KernelContext* ctx, - const std::vector& types); +ARROW_EXPORT Result FirstType(KernelContext*, + const std::vector& types); +ARROW_EXPORT Result LastType(KernelContext*, + const std::vector& types); +ARROW_EXPORT Result ListValuesType(KernelContext* ctx, + const std::vector& types); +ARROW_EXPORT Result MaxPrecisionDecimalType( + KernelContext*, const std::vector& types); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to @@ -557,7 +597,8 @@ struct OutputAdapter> { template struct OutputAdapter> { - using T = typename TypeTraits::ScalarType::ValueType; + using T = std::conditional_t, Float16, + typename 
TypeTraits::ScalarType::ValueType>; template static Status Write(KernelContext*, ArraySpan* out, Generator&& generator) { @@ -1224,6 +1265,7 @@ KernelType GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) { return Generator::Exec; case Type::UINT16: case Type::INT16: + case Type::HALF_FLOAT: return Generator::Exec; case Type::UINT32: case Type::INT32: @@ -1442,6 +1484,12 @@ Status CastBinaryDecimalArgs(DecimalPromotion promotion, std::vector ARROW_EXPORT Status CastDecimalArgs(TypeHolder* begin, size_t count); +/// Given a DataType, if it is a DecimalType, return a DecimalType with the same scale +/// and the maximum precision for that DecimalType. +ARROW_EXPORT +Result> WidenDecimalToMaxPrecision( + std::shared_ptr type); + ARROW_EXPORT bool HasDecimal(const std::vector& types); diff --git a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc index 6bb5568d2ff..8aa90823c1d 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc @@ -129,6 +129,32 @@ TEST(TestDispatchBest, CastDecimalArgs) { AssertTypeEqual(*args[2], *utf8()); } +TEST(TestDecimalPromotion, WidenDecimalToMaxPrecision) { + std::shared_ptr arg; + std::shared_ptr expected; + std::shared_ptr unwrapped; + + arg = decimal32(3, 2); + expected = decimal32(9, 2); + ASSERT_OK_AND_ASSIGN(unwrapped, WidenDecimalToMaxPrecision(arg)); + AssertTypeEqual(*unwrapped, *expected); + + arg = decimal64(3, 2); + expected = decimal64(18, 2); + ASSERT_OK_AND_ASSIGN(unwrapped, WidenDecimalToMaxPrecision(arg)); + AssertTypeEqual(*unwrapped, *expected); + + arg = decimal128(3, 2); + expected = decimal128(38, 2); + ASSERT_OK_AND_ASSIGN(unwrapped, WidenDecimalToMaxPrecision(arg)); + AssertTypeEqual(*unwrapped, *expected); + + arg = decimal256(3, 2); + expected = decimal256(76, 2); + ASSERT_OK_AND_ASSIGN(unwrapped, WidenDecimalToMaxPrecision(arg)); + AssertTypeEqual(*unwrapped, *expected); +} 
+ TEST(TestDispatchBest, CommonTemporal) { std::vector args; diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 18a5590b2e3..19f7fc2e5b0 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -26,10 +26,13 @@ #include "arrow/array/concatenate.h" #include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" +#include "arrow/compute/kernel.h" #include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/hash_aggregate_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/compute/row/grouper.h" #include "arrow/compute/row/row_encoder_internal.h" #include "arrow/record_batch.h" diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_numeric.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_numeric.cc index 4a318942af6..acd485f530c 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_numeric.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_numeric.cc @@ -28,11 +28,12 @@ #include "arrow/compute/kernels/aggregate_var_std_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/hash_aggregate_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/compute/row/grouper.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int128_internal.h" #include "arrow/util/span.h" -#include "arrow/util/tdigest.h" +#include "arrow/util/tdigest_internal.h" #include "arrow/visit_type_inline.h" namespace arrow::compute::internal { @@ -54,7 +55,7 @@ struct GroupedReducingAggregator : public GroupedAggregator { reduced_ = TypedBufferBuilder(pool_); counts_ = TypedBufferBuilder(pool_); no_nulls_ = TypedBufferBuilder(pool_); - out_type_ = GetOutType(args.inputs[0].GetSharedPtr()); + 
ARROW_ASSIGN_OR_RAISE(out_type_, GetOutType(args.inputs[0].GetSharedPtr())); return Status::OK(); } @@ -154,17 +155,29 @@ struct GroupedReducingAggregator : public GroupedAggregator { std::shared_ptr out_type() const override { return out_type_; } template - static enable_if_t::value, std::shared_ptr> GetOutType( + enable_if_t::value, Result>> GetOutType( const std::shared_ptr& in_type) { return TypeTraits::type_singleton(); } template - static enable_if_decimal> GetOutType( + enable_if_decimal>> GetOutType( const std::shared_ptr& in_type) { - return in_type; + if (PromoteDecimal()) { + return WidenDecimalToMaxPrecision(in_type); + } else { + return in_type; + } } + // If this returns true, then the aggregator will promote a decimal to the maximum + // precision for that type. For instance, a decimal128(3, 2) will be promoted to a + // decimal128(38, 2) + // + // TODO: Ideally this should be configurable via the function options with an enum + // PrecisionPolicy { PROMOTE_TO_MAX, DEMOTE_TO_DOUBLE, NO_PROMOTION } + virtual bool PromoteDecimal() const { return true; } + int64_t num_groups_ = 0; ScalarAggregateOptions options_; TypedBufferBuilder reduced_; @@ -317,6 +330,8 @@ struct GroupedProductImpl final return MultiplyTraits::Multiply(out_type, u, v); } + bool PromoteDecimal() const override { return false; } + using Base::Finish; }; @@ -415,6 +430,8 @@ struct GroupedMeanImpl return values; } + bool PromoteDecimal() const override { return false; } + std::shared_ptr out_type() const override { if (is_decimal_type::value) return this->out_type_; return float64(); diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_pivot.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_pivot.cc index e9ae1dd25d9..f60aa367ca2 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_pivot.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_pivot.cc @@ -28,6 +28,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/hash_aggregate_internal.h" 
#include "arrow/compute/kernels/pivot_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/compute/row/grouper.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/compute/kernels/meson.build b/cpp/src/arrow/compute/kernels/meson.build new file mode 100644 index 00000000000..fb682443783 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/meson.build @@ -0,0 +1,167 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# ---------------------------------------------------------------------- +# Tests that don't require the full kernel library + +# Define arrow_compute_kernels_testing object library for common test files +kernel_testing_srcs = [] +if needs_testing + kernel_testing_srcs += files('test_util_internal.cc') +endif + +exc = executable( + 'arrow-scalar-cast-test', + sources: ['scalar_cast_test.cc'] + kernel_testing_srcs, + dependencies: [arrow_compute_test_dep], +) +test('arrow-scalar-cast-test', exc) + +# ---------------------------------------------------------------------- +# Scalar kernels + +scalar_kernel_tests = { + 'arrow-compute-scalar-type-test': { + 'sources': [ + 'scalar_boolean_test.cc', + 'scalar_nested_test.cc', + 'scalar_string_test.cc', + ], + }, + 'arrow-compute-scalar-if-else-test': {'sources': ['scalar_if_else_test.cc']}, + 'arrow-compute-scalar-temporal-test': { + 'sources': ['scalar_temporal_test.cc'], + }, + 'arrow-compute-scalar-math-test': { + 'sources': [ + 'scalar_arithmetic_test.cc', + 'scalar_compare_test.cc', + 'scalar_round_arithmetic_test.cc', + ], + }, + 'arrow-compute-scalar-utility-test': { + 'sources': [ + 'scalar_random_test.cc', + 'scalar_set_lookup_test.cc', + 'scalar_validity_test.cc', + ], + }, +} + +foreach key, val : scalar_kernel_tests + exc = executable( + key, + sources: val['sources'] + kernel_testing_srcs, + dependencies: [arrow_compute_test_dep], + ) + test(key, exc) +endforeach + +scalar_kernel_benchmarks = [ + 'scalar_arithmetic_benchmark', + 'scalar_boolean_benchmark', + 'scalar_cast_benchmark', + 'scalar_compare_benchmark', + 'scalar_if_else_benchmark', + 'scalar_list_benchmark', + 'scalar_random_benchmark', + 'scalar_set_lookup_benchmark', + 'scalar_string_benchmark', + 'scalar_temporal_benchmark', +] + +foreach benchmark : scalar_kernel_benchmarks + benchmark_name = 'arrow-compute-@0@'.format(benchmark.replace('_', '-')) + exc = executable( + benchmark_name, + sources: '@0@.cc'.format(benchmark), + dependencies: 
[arrow_benchmark_dep, gmock_dep], + ) + benchmark(benchmark_name, exc) +endforeach + +# ---------------------------------------------------------------------- +# Vector kernels + +vector_kernel_tests = { + 'arrow-compute-vector-test': { + 'sources': [ + 'vector_cumulative_ops_test.cc', + 'vector_pairwise_test.cc', + 'vector_hash_test.cc', + 'vector_nested_test.cc', + 'vector_replace_test.cc', + 'vector_run_end_encode_test.cc', + 'vector_statistics_test.cc', + 'select_k_test.cc', + ], + }, + 'arrow-compute-vector-sort-test': {'sources': ['vector_sort_test.cc']}, + 'arrow-compute-vector-selection-test': { + 'sources': ['vector_selection_test.cc'], + }, + 'arrow-compute-vector-swizzle-test': {'sources': ['vector_swizzle_test.cc']}, +} + +foreach key, val : vector_kernel_tests + exc = executable( + key, + sources: val['sources'] + kernel_testing_srcs, + dependencies: [arrow_compute_test_dep], + ) + test(key, exc) +endforeach + +vector_kernel_benchmarks = [ + 'vector_hash_benchmark', + 'vector_sort_benchmark', + 'vector_partition_benchmark', + 'vector_topk_benchmark', + 'vector_replace_benchmark', + 'vector_selection_benchmark', +] + +foreach benchmark : vector_kernel_benchmarks + benchmark_name = 'arrow-compute-@0@'.format(benchmark.replace('_', '-')) + exc = executable( + benchmark_name, + sources: '@0@.cc'.format(benchmark), + dependencies: [arrow_benchmark_dep], + ) + benchmark(benchmark_name, exc) +endforeach + +# ---------------------------------------------------------------------- +# Aggregate kernels + +# Aggregates +exc = executable( + 'arrow-compute-aggregate-test', + sources: ['aggregate_test.cc'] + kernel_testing_srcs, + dependencies: [arrow_compute_test_dep, filesystem_dep], +) +test('arrow-compute-aggregate-test', exc) + +# ---------------------------------------------------------------------- +# Utilities + +exc = executable( + 'arrow-compute-kernel-utility-test', + sources: ['codegen_internal_test.cc'], + dependencies: [arrow_compute_test_dep], +) 
+test('arrow-compute-kernel-utility-test', exc) diff --git a/cpp/src/arrow/compute/kernels/ree_util_internal.h b/cpp/src/arrow/compute/kernels/ree_util_internal.h index 52c4e9b6678..3e2bf8af87e 100644 --- a/cpp/src/arrow/compute/kernels/ree_util_internal.h +++ b/cpp/src/arrow/compute/kernels/ree_util_internal.h @@ -27,6 +27,7 @@ #include "arrow/array/data.h" #include "arrow/compute/exec.h" #include "arrow/compute/kernel.h" +#include "arrow/compute/visibility.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_traits.h" @@ -144,7 +145,7 @@ class ReadWriteValue> { public: // Every value is represented as a pointer to byte_width_ bytes - using ValueRepr = uint8_t const*; + using ValueRepr = const uint8_t*; private: const uint8_t* input_validity_; @@ -346,7 +347,7 @@ Result> PreallocateRunEndsArray( /// \param has_validity_buffer a validity buffer must be allocated /// \param length the length of the values array /// \param data_buffer_size the size of the data buffer for string and binary types -Result> PreallocateValuesArray( +ARROW_COMPUTE_EXPORT Result> PreallocateValuesArray( const std::shared_ptr& value_type, bool has_validity_buffer, int64_t length, MemoryPool* pool, int64_t data_buffer_size); @@ -362,7 +363,7 @@ Result> PreallocateValuesArray( /// data.child_data[1].buffer[0] != NULLPTR /// /// \param data_buffer_size the size of the data buffer for string and binary types -Result> PreallocateREEArray( +ARROW_COMPUTE_EXPORT Result> PreallocateREEArray( std::shared_ptr ree_type, bool has_validity_buffer, int64_t logical_length, int64_t physical_length, MemoryPool* pool, int64_t data_buffer_size); @@ -377,7 +378,7 @@ Result> PreallocateREEArray( /// - run_ends fits in the run-end type without overflow void WriteSingleRunEnd(ArrayData* run_ends_data, int64_t run_end); -Result> MakeNullREEArray( +ARROW_COMPUTE_EXPORT Result> MakeNullREEArray( const std::shared_ptr& run_end_type, int64_t logical_length, MemoryPool* pool); diff --git 
a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index c20cfc5688e..03c9422809b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -29,6 +29,7 @@ #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" @@ -597,10 +598,6 @@ Result ResolveDecimalAdditionOrSubtractionOutput( types, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) -> Result> { - if (s1 != s2) { - return Status::Invalid("Addition or subtraction of two decimal ", - "types scale1 != scale2. (", s1, s2, ")."); - } DCHECK_EQ(s1, s2); const int32_t scale = s1; const int32_t precision = std::max(p1 - s1, p2 - s2) + scale + 1; @@ -626,10 +623,6 @@ Result ResolveDecimalDivisionOutput(KernelContext*, types, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) -> Result> { - if (s1 < s2) { - return Status::Invalid("Division of two decimal types scale1 < scale2. 
", "(", - s1, s2, ")."); - } DCHECK_GE(s1, s2); const int32_t scale = s1 - s2; const int32_t precision = p1; @@ -668,9 +661,11 @@ void AddDecimalUnaryKernels(ScalarFunction* func) { template void AddDecimalBinaryKernels(const std::string& name, ScalarFunction* func) { OutputType out_type(null()); + std::shared_ptr constraint = nullptr; const std::string op = name.substr(0, name.find("_")); if (op == "add" || op == "subtract") { out_type = OutputType(ResolveDecimalAdditionOrSubtractionOutput); + constraint = DecimalsHaveSameScale(); } else if (op == "multiply") { out_type = OutputType(ResolveDecimalMultiplicationOutput); } else if (op == "divide") { @@ -683,8 +678,18 @@ void AddDecimalBinaryKernels(const std::string& name, ScalarFunction* func) { auto in_type256 = InputType(Type::DECIMAL256); auto exec128 = ScalarBinaryNotNullEqualTypes::Exec; auto exec256 = ScalarBinaryNotNullEqualTypes::Exec; - DCHECK_OK(func->AddKernel({in_type128, in_type128}, out_type, exec128)); - DCHECK_OK(func->AddKernel({in_type256, in_type256}, out_type, exec256)); + DCHECK_OK(func->AddKernel({in_type128, in_type128}, out_type, exec128, /*init=*/nullptr, + constraint)); + DCHECK_OK(func->AddKernel({in_type256, in_type256}, out_type, exec256, /*init=*/nullptr, + constraint)); +} + +template +void AddHalfFloatUnaryKernel(ScalarFunction* func) { + OutputType out_type(FirstType); + auto in_type = InputType(Type::HALF_FLOAT); + auto exec = ScalarUnaryNotNull::Exec; + DCHECK_OK(func->AddKernel({in_type}, out_type, exec)); } // Generate a kernel given an arithmetic functor @@ -721,6 +726,17 @@ ArrayKernelExec GenerateArithmeticWithFixedIntOutType(detail::GetTypeId get_id) struct ArithmeticFunction : ScalarFunction { using ScalarFunction::ScalarFunction; + Result DispatchExact( + const std::vector& types) const override { + if ((name_ == "divide" || name_ == "divide_checked") && HasDecimal(types)) { + // Decimal division ALWAYS scales up the dividend, so there will NEVER be an exact + // match. 
+ return arrow::compute::detail::NoMatchingKernel(this, types); + } + + return ScalarFunction::DispatchExact(types); + } + Result DispatchBest(std::vector* types) const override { RETURN_NOT_OK(CheckArity(types->size())); @@ -1695,6 +1711,7 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { // ---------------------------------------------------------------------- auto negate = MakeUnaryArithmeticFunction("negate", negate_doc); AddDecimalUnaryKernels(negate.get()); + AddHalfFloatUnaryKernel(negate.get()); // Add neg(duration) -> duration for (auto unit : TimeUnit::values()) { @@ -1708,6 +1725,7 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { auto negate_checked = MakeUnarySignedArithmeticFunctionNotNull( "negate_checked", negate_checked_doc); AddDecimalUnaryKernels(negate_checked.get()); + AddHalfFloatUnaryKernel(negate_checked.get()); // Add neg_checked(duration) -> duration for (auto unit : TimeUnit::values()) { @@ -1755,6 +1773,10 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { auto exec = ScalarUnary::Exec; DCHECK_OK(sign->AddKernel({duration(unit)}, int8(), std::move(exec))); } + { + auto exec = ScalarUnary::Exec; + DCHECK_OK(sign->AddKernel({InputType(Type::HALF_FLOAT)}, float16(), std::move(exec))); + } DCHECK_OK(registry->AddFunction(std::move(sign))); // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc index 908c642bae8..20a824cd7c3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc @@ -80,7 +80,7 @@ static void ArrayScalarKernel(benchmark::State& state) { rand.Numeric(array_size, min, max, args.null_proportion)); for (auto _ : state) { - ABORT_NOT_OK(Op(lhs, rhs, ArithmeticOptions(), nullptr).status()); + ABORT_NOT_OK(Op(lhs, rhs, ArithmeticOptions(), nullptr)); } 
state.SetItemsProcessed(state.iterations() * array_size); } @@ -103,7 +103,7 @@ static void ArrayArrayKernel(benchmark::State& state) { rand.Numeric(array_size, rmin, rmax, args.null_proportion)); for (auto _ : state) { - ABORT_NOT_OK(Op(lhs, rhs, ArithmeticOptions(), nullptr).status()); + ABORT_NOT_OK(Op(lhs, rhs, ArithmeticOptions(), nullptr)); } state.SetItemsProcessed(state.iterations() * array_size); } diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index 1162dad855d..6e9e0620f8f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -35,6 +35,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" #include "arrow/util/math_constants.h" #include "arrow/util/string.h" @@ -43,6 +44,9 @@ #include "arrow/testing/random.h" namespace arrow { + +using util::Float16; + namespace compute { namespace { @@ -57,8 +61,11 @@ using SignedIntegerTypes = testing::Types; -// TODO(kszucs): add half-float -using FloatingTypes = testing::Types; +using ArithmeticFloatingTypes = testing::Types; + +float ToCFloat(float v) { return v; } +double ToCFloat(double v) { return v; } +float ToCFloat(Float16 v) { return v.ToFloat(); } // Assert that all-null-type inputs results in a null-type output. 
void AssertNullToNull(const std::string& func_name) { @@ -80,33 +87,57 @@ void AssertNullToNull(const std::string& func_name) { } } -template -class TestBaseUnaryArithmetic : public ::testing::Test { +template +class TestBaseArithmetic : public ::testing::Test { protected: using ArrowType = T; - using CType = typename ArrowType::c_type; + using CType = std::conditional_t, Float16, + typename ArrowType::c_type>; static std::shared_ptr type_singleton() { return TypeTraits::type_singleton(); } - using UnaryFunction = - std::function(const Datum&, OptionsType, ExecContext*)>; - std::shared_ptr MakeNullScalar() { return arrow::MakeNullScalar(type_singleton()); } - std::shared_ptr MakeScalar(CType value) { - return *arrow::MakeScalar(type_singleton(), value); + template + std::shared_ptr MakeScalar(V value) { + if constexpr (std::is_same_v) { + return std::make_shared(Float16(value).bits()); + } else { + return *arrow::MakeScalar(type_singleton(), value); + } + } + + static constexpr bool is_half_float() { return std::is_same_v; } +}; + +// This has to be a macro, the test wouldn't be skipped from a helper function +#define SKIP_IF_HALF_FLOAT() \ + if (this->is_half_float()) { \ + GTEST_SKIP() << "Unsupported on half-float"; \ } - void SetUp() override {} +template +using enable_if_numeric_value = + std::enable_if_t || std::is_same_v, R>; + +template +class TestBaseUnaryArithmetic : public TestBaseArithmetic { + protected: + using Base = TestBaseArithmetic; + using CType = typename Base::CType; + + using UnaryFunction = + std::function(const Datum&, OptionsType, ExecContext*)>; // (CScalar, CScalar) - void AssertUnaryOp(UnaryFunction func, CType argument, CType expected) { - auto arg = MakeScalar(argument); - auto exp = MakeScalar(expected); + template + enable_if_numeric_value AssertUnaryOp(UnaryFunction func, V argument, V expected) { + auto arg = this->MakeScalar(argument); + auto exp = this->MakeScalar(expected); ASSERT_OK_AND_ASSIGN(auto actual, func(arg, options_, 
nullptr)); AssertScalarsApproxEqual(*exp, *actual.scalar(), /*verbose=*/true); } @@ -121,22 +152,22 @@ class TestBaseUnaryArithmetic : public ::testing::Test { // (JSON, JSON) void AssertUnaryOp(UnaryFunction func, const std::string& arg_json, const std::string& expected_json) { - auto arg = ArrayFromJSON(type_singleton(), arg_json); - auto expected = ArrayFromJSON(type_singleton(), expected_json); + auto arg = ArrayFromJSON(this->type_singleton(), arg_json); + auto expected = ArrayFromJSON(this->type_singleton(), expected_json); AssertUnaryOp(func, arg, expected); } // (Array, JSON) void AssertUnaryOp(UnaryFunction func, const std::shared_ptr& arg, const std::string& expected_json) { - const auto expected = ArrayFromJSON(type_singleton(), expected_json); + const auto expected = ArrayFromJSON(this->type_singleton(), expected_json); AssertUnaryOp(func, arg, expected); } // (JSON, Array) void AssertUnaryOp(UnaryFunction func, const std::string& arg_json, const std::shared_ptr& expected) { - auto arg = ArrayFromJSON(type_singleton(), arg_json); + auto arg = ArrayFromJSON(this->type_singleton(), arg_json); AssertUnaryOp(func, arg, expected); } @@ -157,16 +188,17 @@ class TestBaseUnaryArithmetic : public ::testing::Test { } // (CScalar, CScalar) - void AssertUnaryOpRaises(UnaryFunction func, CType argument, - const std::string& expected_msg) { - auto arg = MakeScalar(argument); + template + enable_if_numeric_value AssertUnaryOpRaises(UnaryFunction func, V argument, + const std::string& expected_msg) { + auto arg = this->MakeScalar(argument); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr(expected_msg), func(arg, options_, nullptr)); } void AssertUnaryOpRaises(UnaryFunction func, const std::string& argument, const std::string& expected_msg) { - auto arg = ArrayFromJSON(type_singleton(), argument); + auto arg = ArrayFromJSON(this->type_singleton(), argument); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr(expected_msg), func(arg, options_, 
nullptr)); for (int64_t i = 0; i < arg->length(); i++) { @@ -177,7 +209,7 @@ class TestBaseUnaryArithmetic : public ::testing::Test { } void AssertUnaryOpNotImplemented(UnaryFunction func, const std::string& argument) { - auto arg = ArrayFromJSON(type_singleton(), argument); + auto arg = ArrayFromJSON(this->type_singleton(), argument); const char* expected_msg = "has no kernel matching input types"; EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, ::testing::HasSubstr(expected_msg), func(arg, options_, nullptr)); @@ -185,7 +217,7 @@ class TestBaseUnaryArithmetic : public ::testing::Test { void ValidateAndAssertApproxEqual(const std::shared_ptr& actual, const std::string& expected) { - const auto exp = ArrayFromJSON(type_singleton(), expected); + const auto exp = ArrayFromJSON(this->type_singleton(), expected); ValidateAndAssertApproxEqual(actual, exp); } @@ -258,59 +290,51 @@ class TestArithmeticDecimal : public ::testing::Test { }; template -class TestBinaryArithmetic : public ::testing::Test { +class TestBinaryArithmetic : public TestBaseArithmetic { protected: - using ArrowType = T; - using CType = typename ArrowType::c_type; - - static std::shared_ptr type_singleton() { - return TypeTraits::type_singleton(); - } + using Base = TestBaseArithmetic; + using CType = typename Base::CType; using BinaryFunction = std::function(const Datum&, const Datum&, ArithmeticOptions, ExecContext*)>; void SetUp() override { options_.check_overflow = false; } - std::shared_ptr MakeNullScalar() { - return arrow::MakeNullScalar(type_singleton()); - } - - std::shared_ptr MakeScalar(CType value) { - return *arrow::MakeScalar(type_singleton(), value); - } - // (Scalar, Scalar) - void AssertBinop(BinaryFunction func, CType lhs, CType rhs, CType expected) { - auto left = MakeScalar(lhs); - auto right = MakeScalar(rhs); - auto exp = MakeScalar(expected); + template + enable_if_numeric_value AssertBinop(BinaryFunction func, V lhs, V rhs, V expected) { + auto left = this->MakeScalar(lhs); + 
auto right = this->MakeScalar(rhs); + auto exp = this->MakeScalar(expected); ASSERT_OK_AND_ASSIGN(auto actual, func(left, right, options_, nullptr)); AssertScalarsApproxEqual(*exp, *actual.scalar(), /*verbose=*/true); } // (Scalar, Array) - void AssertBinop(BinaryFunction func, CType lhs, const std::string& rhs, - const std::string& expected) { - auto left = MakeScalar(lhs); + template + enable_if_numeric_value AssertBinop(BinaryFunction func, V lhs, + const std::string& rhs, + const std::string& expected) { + auto left = this->MakeScalar(lhs); AssertBinop(func, left, rhs, expected); } // (Scalar, Array) void AssertBinop(BinaryFunction func, const std::shared_ptr& left, const std::string& rhs, const std::string& expected) { - auto right = ArrayFromJSON(type_singleton(), rhs); - auto exp = ArrayFromJSON(type_singleton(), expected); + auto right = ArrayFromJSON(this->type_singleton(), rhs); + auto exp = ArrayFromJSON(this->type_singleton(), expected); ASSERT_OK_AND_ASSIGN(auto actual, func(left, right, options_, nullptr)); ValidateAndAssertApproxEqual(actual.make_array(), expected); } // (Array, Scalar) - void AssertBinop(BinaryFunction func, const std::string& lhs, CType rhs, - const std::string& expected) { - auto right = MakeScalar(rhs); + template + enable_if_numeric_value AssertBinop(BinaryFunction func, const std::string& lhs, + V rhs, const std::string& expected) { + auto right = this->MakeScalar(rhs); AssertBinop(func, lhs, right, expected); } @@ -318,7 +342,7 @@ class TestBinaryArithmetic : public ::testing::Test { void AssertBinop(BinaryFunction func, const std::string& lhs, const std::shared_ptr& right, const std::shared_ptr& expected) { - auto left = ArrayFromJSON(type_singleton(), lhs); + auto left = ArrayFromJSON(this->type_singleton(), lhs); ASSERT_OK_AND_ASSIGN(auto actual, func(left, right, options_, nullptr)); ValidateAndAssertApproxEqual(actual.make_array(), expected); @@ -327,8 +351,8 @@ class TestBinaryArithmetic : public ::testing::Test { // 
(Array, Scalar) void AssertBinop(BinaryFunction func, const std::string& lhs, const std::shared_ptr& right, const std::string& expected) { - auto left = ArrayFromJSON(type_singleton(), lhs); - auto exp = ArrayFromJSON(type_singleton(), expected); + auto left = ArrayFromJSON(this->type_singleton(), lhs); + auto exp = ArrayFromJSON(this->type_singleton(), expected); ASSERT_OK_AND_ASSIGN(auto actual, func(left, right, options_, nullptr)); ValidateAndAssertApproxEqual(actual.make_array(), expected); @@ -337,8 +361,8 @@ class TestBinaryArithmetic : public ::testing::Test { // (Array, Array) void AssertBinop(BinaryFunction func, const std::string& lhs, const std::string& rhs, const std::string& expected) { - auto left = ArrayFromJSON(type_singleton(), lhs); - auto right = ArrayFromJSON(type_singleton(), rhs); + auto left = ArrayFromJSON(this->type_singleton(), lhs); + auto right = ArrayFromJSON(this->type_singleton(), rhs); AssertBinop(func, left, right, expected); } @@ -346,8 +370,8 @@ class TestBinaryArithmetic : public ::testing::Test { // (Array, Array) => Array void AssertBinop(BinaryFunction func, const std::string& lhs, const std::string& rhs, const std::shared_ptr& expected) { - auto left = ArrayFromJSON(type_singleton(), lhs); - auto right = ArrayFromJSON(type_singleton(), rhs); + auto left = ArrayFromJSON(this->type_singleton(), lhs); + auto right = ArrayFromJSON(this->type_singleton(), rhs); AssertBinop(func, left, right, expected); } @@ -356,7 +380,7 @@ class TestBinaryArithmetic : public ::testing::Test { void AssertBinop(BinaryFunction func, const std::shared_ptr& left, const std::shared_ptr& right, const std::string& expected_json) { - const auto expected = ArrayFromJSON(type_singleton(), expected_json); + const auto expected = ArrayFromJSON(this->type_singleton(), expected_json); AssertBinop(func, left, right, expected); } @@ -379,8 +403,8 @@ class TestBinaryArithmetic : public ::testing::Test { void AssertBinopRaises(BinaryFunction func, const 
std::string& lhs, const std::string& rhs, const std::string& expected_msg) { - auto left = ArrayFromJSON(type_singleton(), lhs); - auto right = ArrayFromJSON(type_singleton(), rhs); + auto left = ArrayFromJSON(this->type_singleton(), lhs); + auto right = ArrayFromJSON(this->type_singleton(), rhs); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr(expected_msg), func(left, right, options_, nullptr)); @@ -388,7 +412,7 @@ class TestBinaryArithmetic : public ::testing::Test { void ValidateAndAssertApproxEqual(const std::shared_ptr& actual, const std::string& expected) { - ValidateAndAssertApproxEqual(actual, ArrayFromJSON(type_singleton(), expected)); + ValidateAndAssertApproxEqual(actual, ArrayFromJSON(this->type_singleton(), expected)); } void ValidateAndAssertApproxEqual(const std::shared_ptr& actual, @@ -497,12 +521,12 @@ class TestBitWiseArithmetic : public ::testing::Test { TYPED_TEST_SUITE(TestUnaryArithmeticIntegral, IntegralTypes); TYPED_TEST_SUITE(TestUnaryArithmeticSigned, SignedIntegerTypes); TYPED_TEST_SUITE(TestUnaryArithmeticUnsigned, UnsignedIntegerTypes); -TYPED_TEST_SUITE(TestUnaryArithmeticFloating, FloatingTypes); +TYPED_TEST_SUITE(TestUnaryArithmeticFloating, ArithmeticFloatingTypes); TYPED_TEST_SUITE(TestBinaryArithmeticIntegral, IntegralTypes); TYPED_TEST_SUITE(TestBinaryArithmeticSigned, SignedIntegerTypes); TYPED_TEST_SUITE(TestBinaryArithmeticUnsigned, UnsignedIntegerTypes); -TYPED_TEST_SUITE(TestBinaryArithmeticFloating, FloatingTypes); +TYPED_TEST_SUITE(TestBinaryArithmeticFloating, ArithmeticFloatingTypes); TYPED_TEST_SUITE(TestBitWiseArithmetic, IntegralTypes); @@ -786,6 +810,8 @@ TYPED_TEST(TestBinaryArithmeticSigned, Mul) { // NOTE: cannot test Inf / -Inf (ARROW-9495) TYPED_TEST(TestBinaryArithmeticFloating, Add) { + SKIP_IF_HALF_FLOAT(); + this->AssertBinop(Add, "[]", "[]", "[]"); this->AssertBinop(Add, "[1.5, 0.5]", "[2.0, -3]", "[3.5, -2.5]"); @@ -797,18 +823,22 @@ TYPED_TEST(TestBinaryArithmeticFloating, Add) { 
this->AssertBinop(Add, "[null, 1.5, 0.5]", "[2.0, -3, null]", "[null, -1.5, null]"); // Scalar on the left - this->AssertBinop(Add, -1.5f, "[0.0, 2.0]", "[-1.5, 0.5]"); - this->AssertBinop(Add, -1.5f, "[null, 2.0]", "[null, 0.5]"); + this->AssertBinop(Add, this->MakeScalar(-1.5f), "[0.0, 2.0]", "[-1.5, 0.5]"); + this->AssertBinop(Add, this->MakeScalar(-1.5f), "[null, 2.0]", "[null, 0.5]"); this->AssertBinop(Add, this->MakeNullScalar(), "[0.0, 2.0]", "[null, null]"); this->AssertBinop(Add, this->MakeNullScalar(), "[null, 2.0]", "[null, null]"); // Scalar on the right - this->AssertBinop(Add, "[0.0, 2.0]", -1.5f, "[-1.5, 0.5]"); - this->AssertBinop(Add, "[null, 2.0]", -1.5f, "[null, 0.5]"); + this->AssertBinop(Add, "[0.0, 2.0]", this->MakeScalar(-1.5f), "[-1.5, 0.5]"); + this->AssertBinop(Add, "[null, 2.0]", this->MakeScalar(-1.5f), "[null, 0.5]"); this->AssertBinop(Add, "[0.0, 2.0]", this->MakeNullScalar(), "[null, null]"); this->AssertBinop(Add, "[null, 2.0]", this->MakeNullScalar(), "[null, null]"); } TYPED_TEST(TestBinaryArithmeticFloating, Div) { + SKIP_IF_HALF_FLOAT(); + + using CType = typename TestFixture::CType; + for (auto check_overflow : {false, true}) { this->SetOverflowCheck(check_overflow); // Empty arrays @@ -819,10 +849,10 @@ TYPED_TEST(TestBinaryArithmeticFloating, Div) { this->AssertBinop(Divide, "[null, 1, 3.3, null, 2]", "[1, 4, 2, 5, 0.1]", "[null, 0.25, 1.65, null, 20]"); // Scalar divides by array - this->AssertBinop(Divide, 10.0F, "[null, 1, 2.5, null, 2, 5]", + this->AssertBinop(Divide, this->MakeScalar(10.0f), "[null, 1, 2.5, null, 2, 5]", "[null, 10, 4, null, 5, 2]"); // Array divides by scalar - this->AssertBinop(Divide, "[null, 1, 2.5, null, 2, 5]", 10.0F, + this->AssertBinop(Divide, "[null, 1, 2.5, null, 2, 5]", this->MakeScalar(10.0f), "[null, 0.1, 0.25, null, 0.2, 0.5]"); // Array with infinity this->AssertBinop(Divide, "[3.4, Inf, -Inf]", "[1, 2, 3]", "[3.4, Inf, -Inf]"); @@ -830,7 +860,7 @@ TYPED_TEST(TestBinaryArithmeticFloating, 
Div) { this->SetNansEqual(true); this->AssertBinop(Divide, "[3.4, NaN, 2.0]", "[1, 2, 2.0]", "[3.4, NaN, 1.0]"); // Scalar divides by scalar - this->AssertBinop(Divide, 21.0F, 3.0F, 7.0F); + this->AssertBinop(Divide, CType{21.0f}, CType{3.0f}, CType{7.0f}); } } @@ -876,6 +906,8 @@ TYPED_TEST(TestBinaryArithmeticIntegral, DivideByZero) { } TYPED_TEST(TestBinaryArithmeticFloating, DivideByZero) { + SKIP_IF_HALF_FLOAT(); + this->SetOverflowCheck(true); this->AssertBinopRaises(Divide, "[3.0, 2.0, 6.0]", "[1.0, 1.0, 0.0]", "divide by zero"); this->AssertBinopRaises(Divide, "[3.0, 2.0, 0.0]", "[1.0, 1.0, 0.0]", "divide by zero"); @@ -901,6 +933,8 @@ TYPED_TEST(TestBinaryArithmeticSigned, DivideOverflowRaises) { } TYPED_TEST(TestBinaryArithmeticFloating, Power) { + SKIP_IF_HALF_FLOAT(); + using CType = typename TestFixture::CType; auto max = std::numeric_limits::max(); this->SetNansEqual(true); @@ -917,10 +951,10 @@ TYPED_TEST(TestBinaryArithmeticFloating, Power) { this->AssertBinop(Power, "[null, 1, 3.3, null, 2]", "[1, 4, 2, 5, 0.1]", "[null, 1, 10.89, null, 1.07177346]"); // Scalar exponentiated by array - this->AssertBinop(Power, 4.0F, "[null, 1, 0.5, null, 2, 5]", + this->AssertBinop(Power, this->MakeScalar(4.0f), "[null, 1, 0.5, null, 2, 5]", "[null, 4, 2.0, null, 16, 1024]"); // Array exponentiated by scalar - this->AssertBinop(Power, "[null, 1, 0.5, null, 2, 5]", 4.0F, + this->AssertBinop(Power, "[null, 1, 0.5, null, 2, 5]", this->MakeScalar(4.0f), "[null, 1, 0.0625, null, 16, 625]"); // Array with infinity this->AssertBinop(Power, "[3.4, Inf, -Inf, 1.1, 100000]", "[1, 2, 3, Inf, 100000]", @@ -928,11 +962,11 @@ TYPED_TEST(TestBinaryArithmeticFloating, Power) { // Array with NaN this->AssertBinop(Power, "[3.4, NaN, 2.0]", "[1, 2, 2.0]", "[3.4, NaN, 4.0]"); // Scalar exponentiated by scalar - this->AssertBinop(Power, 21.0F, 3.0F, 9261.0F); + this->AssertBinop(Power, CType{21.0f}, CType{3.0f}, CType{9261.0f}); // Divide by zero this->AssertBinop(Power, "[0.0, 0.0]", 
"[-1.0, -3.0]", "[Inf, Inf]"); // Check overflow behavior - this->AssertBinop(Power, max, 10, INFINITY); + this->AssertBinop(Power, CType{max}, CType{10}, CType{INFINITY}); } // Edge cases - removing NaNs @@ -970,7 +1004,7 @@ TYPED_TEST(TestBinaryArithmeticIntegral, Power) { this->AssertBinopRaises(Power, MakeArray(max), MakeArray(10), "overflow"); // Disable overflow check this->SetOverflowCheck(false); - this->AssertBinop(Power, max, 10, 1); + this->AssertBinop(Power, max, CType{10}, CType{1}); } TYPED_TEST(TestBinaryArithmeticSigned, Power) { @@ -1005,10 +1039,12 @@ TYPED_TEST(TestBinaryArithmeticSigned, Power) { this->AssertBinopRaises(Power, MakeArray(max), MakeArray(10), "overflow"); // Disable overflow check this->SetOverflowCheck(false); - this->AssertBinop(Power, max, 10, 1); + this->AssertBinop(Power, max, CType{10}, CType{1}); } TYPED_TEST(TestBinaryArithmeticFloating, Sub) { + SKIP_IF_HALF_FLOAT(); + this->AssertBinop(Subtract, "[]", "[]", "[]"); this->AssertBinop(Subtract, "[1.5, 0.5]", "[2.0, -3]", "[-0.5, 3.5]"); @@ -1020,18 +1056,20 @@ TYPED_TEST(TestBinaryArithmeticFloating, Sub) { this->AssertBinop(Subtract, "[null, 1.5, 0.5]", "[2.0, -3, null]", "[null, 4.5, null]"); // Scalar on the left - this->AssertBinop(Subtract, -1.5f, "[0.0, 2.0]", "[-1.5, -3.5]"); - this->AssertBinop(Subtract, -1.5f, "[null, 2.0]", "[null, -3.5]"); + this->AssertBinop(Subtract, this->MakeScalar(-1.5f), "[0.0, 2.0]", "[-1.5, -3.5]"); + this->AssertBinop(Subtract, this->MakeScalar(-1.5f), "[null, 2.0]", "[null, -3.5]"); this->AssertBinop(Subtract, this->MakeNullScalar(), "[0.0, 2.0]", "[null, null]"); this->AssertBinop(Subtract, this->MakeNullScalar(), "[null, 2.0]", "[null, null]"); // Scalar on the right - this->AssertBinop(Subtract, "[0.0, 2.0]", -1.5f, "[1.5, 3.5]"); - this->AssertBinop(Subtract, "[null, 2.0]", -1.5f, "[null, 3.5]"); + this->AssertBinop(Subtract, "[0.0, 2.0]", this->MakeScalar(-1.5f), "[1.5, 3.5]"); + this->AssertBinop(Subtract, "[null, 2.0]", 
this->MakeScalar(-1.5f), "[null, 3.5]"); this->AssertBinop(Subtract, "[0.0, 2.0]", this->MakeNullScalar(), "[null, null]"); this->AssertBinop(Subtract, "[null, 2.0]", this->MakeNullScalar(), "[null, null]"); } TYPED_TEST(TestBinaryArithmeticFloating, Mul) { + SKIP_IF_HALF_FLOAT(); + this->AssertBinop(Multiply, "[]", "[]", "[]"); this->AssertBinop(Multiply, "[1.5, 0.5]", "[2.0, -3]", "[3.0, -1.5]"); @@ -1050,17 +1088,17 @@ TYPED_TEST(TestBinaryArithmeticFloating, Mul) { "[null, -0.0, -0.0, null]"); // Scalar on the left - this->AssertBinop(Multiply, -1.5f, "[0.0, 2.0]", "[-0.0, -3.0]"); - this->AssertBinop(Multiply, -1.5f, "[null, 2.0]", "[null, -3.0]"); - this->AssertBinop(Multiply, -0.0f, "[3.0, -2.0]", "[-0.0, 0.0]"); - this->AssertBinop(Multiply, -0.0f, "[null, 2.0]", "[null, -0.0]"); + this->AssertBinop(Multiply, this->MakeScalar(-1.5f), "[0.0, 2.0]", "[-0.0, -3.0]"); + this->AssertBinop(Multiply, this->MakeScalar(-1.5f), "[null, 2.0]", "[null, -3.0]"); + this->AssertBinop(Multiply, this->MakeScalar(-0.0f), "[3.0, -2.0]", "[-0.0, 0.0]"); + this->AssertBinop(Multiply, this->MakeScalar(-0.0f), "[null, 2.0]", "[null, -0.0]"); this->AssertBinop(Multiply, this->MakeNullScalar(), "[0.0, 2.0]", "[null, null]"); this->AssertBinop(Multiply, this->MakeNullScalar(), "[null, 2.0]", "[null, null]"); // Scalar on the right - this->AssertBinop(Multiply, "[0.0, 2.0]", -1.5f, "[-0.0, -3.0]"); - this->AssertBinop(Multiply, "[null, 2.0]", -1.5f, "[null, -3.0]"); - this->AssertBinop(Multiply, "[3.0, -2.0]", -0.0f, "[-0.0, 0.0]"); - this->AssertBinop(Multiply, "[null, 2.0]", -0.0f, "[null, -0.0]"); + this->AssertBinop(Multiply, "[0.0, 2.0]", this->MakeScalar(-1.5f), "[-0.0, -3.0]"); + this->AssertBinop(Multiply, "[null, 2.0]", this->MakeScalar(-1.5f), "[null, -3.0]"); + this->AssertBinop(Multiply, "[3.0, -2.0]", this->MakeScalar(-0.0f), "[-0.0, 0.0]"); + this->AssertBinop(Multiply, "[null, 2.0]", this->MakeScalar(-0.0f), "[null, -0.0]"); this->AssertBinop(Multiply, "[0.0, 2.0]", 
this->MakeNullScalar(), "[null, null]"); this->AssertBinop(Multiply, "[null, 2.0]", this->MakeNullScalar(), "[null, null]"); } @@ -1174,7 +1212,7 @@ TEST(TestUnaryArithmetic, DispatchBest) { // All types for (std::string name : {"negate", "sign"}) { for (const auto& ty : {int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), - uint64(), float32(), float64()}) { + uint64(), float16(), float32(), float64()}) { CheckDispatchBest(name, {ty}, {ty}); CheckDispatchBest(name, {dictionary(int8(), ty)}, {ty}); } @@ -1182,7 +1220,8 @@ TEST(TestUnaryArithmetic, DispatchBest) { // Signed types for (std::string name : {"negate_checked"}) { - for (const auto& ty : {int8(), int16(), int32(), int64(), float32(), float64()}) { + for (const auto& ty : + {int8(), int16(), int32(), int64(), float16(), float32(), float64()}) { CheckDispatchBest(name, {ty}, {ty}); CheckDispatchBest(name, {dictionary(int8(), ty)}, {ty}); } @@ -1274,7 +1313,7 @@ TYPED_TEST(TestUnaryArithmeticSigned, Negate) { this->AssertUnaryOp(Negate, -1, 1); this->AssertUnaryOp(Negate, MakeArray(-1), "[1]"); // Min/max (wrap arounds and overflow) - this->AssertUnaryOp(Negate, max, min + 1); + this->AssertUnaryOp(Negate, max, static_cast(min + 1)); if (check_overflow) { this->AssertUnaryOpRaises(Negate, MakeArray(min), "overflow"); } else { @@ -1303,8 +1342,8 @@ TYPED_TEST(TestUnaryArithmeticUnsigned, Negate) { this->AssertUnaryOp(Negate, this->MakeNullScalar(), this->MakeNullScalar()); // Min/max (wrap around) this->AssertUnaryOp(Negate, min, min); - this->AssertUnaryOp(Negate, max, 1); - this->AssertUnaryOp(Negate, 1, max); + this->AssertUnaryOp(Negate, max, CType{1}); + this->AssertUnaryOp(Negate, CType{1}, max); // Not implemented kernels this->SetOverflowCheck(true); this->AssertUnaryOpNotImplemented(Negate, "[0]"); @@ -1414,6 +1453,8 @@ TYPED_TEST(TestUnaryArithmeticUnsigned, AbsoluteValue) { } TYPED_TEST(TestUnaryArithmeticFloating, AbsoluteValue) { + SKIP_IF_HALF_FLOAT(); + using CType = typename 
TestFixture::CType; auto min = std::numeric_limits::lowest(); @@ -1512,10 +1553,11 @@ TYPED_TEST(TestUnaryArithmeticSigned, Exp) { } TYPED_TEST(TestUnaryArithmeticFloating, Exp) { - using CType = typename TestFixture::CType; + SKIP_IF_HALF_FLOAT(); - auto min = std::numeric_limits::lowest(); - auto max = std::numeric_limits::max(); + using CType = typename TestFixture::CType; + constexpr auto min = std::numeric_limits::lowest(); + constexpr auto max = std::numeric_limits::max(); auto exp = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Exp(arg, ctx); @@ -1539,7 +1581,7 @@ TYPED_TEST(TestUnaryArithmeticFloating, Exp) { this->SetNansEqual(true); this->AssertUnaryOp(exp, "[NaN]", "[NaN]"); // Min/max - this->AssertUnaryOp(exp, min, 0.0); + this->AssertUnaryOp(exp, min, CType{0.0}); this->AssertUnaryOp(exp, max, std::numeric_limits::infinity()); } @@ -1600,10 +1642,11 @@ TYPED_TEST(TestUnaryArithmeticSigned, Expm1) { } TYPED_TEST(TestUnaryArithmeticFloating, Expm1) { - using CType = typename TestFixture::CType; + SKIP_IF_HALF_FLOAT(); - auto min = std::numeric_limits::lowest(); - auto max = std::numeric_limits::max(); + using CType = typename TestFixture::CType; + constexpr auto min = std::numeric_limits::lowest(); + constexpr auto max = std::numeric_limits::max(); auto expm1 = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Expm1(arg, ctx); @@ -1631,7 +1674,7 @@ TYPED_TEST(TestUnaryArithmeticFloating, Expm1) { this->SetNansEqual(true); this->AssertUnaryOp(expm1, "[NaN]", "[NaN]"); // Min/max - this->AssertUnaryOp(expm1, min, -1.0); + this->AssertUnaryOp(expm1, min, CType{-1.0}); this->AssertUnaryOp(expm1, max, std::numeric_limits::infinity()); } @@ -1857,6 +1900,61 @@ TEST_F(TestUnaryArithmeticDecimal, TrigTan) { class TestBinaryArithmeticDecimal : public TestArithmeticDecimal {}; +TEST_F(TestBinaryArithmeticDecimal, DispatchExact) { + for (std::string name : {"add", "subtract"}) { + for (std::string suffix : {"", "_checked"}) { 
+ name += suffix; + ARROW_SCOPED_TRACE(name); + + CheckDispatchExact(name, {decimal128(2, 1), decimal128(2, 1)}); + CheckDispatchExact(name, {decimal128(3, 1), decimal128(2, 1)}); + CheckDispatchExactFails(name, {decimal128(2, 0), decimal128(2, 1)}); + CheckDispatchExactFails(name, {decimal128(2, 1), decimal128(2, 0)}); + + CheckDispatchExact(name, {decimal256(2, 1), decimal256(2, 1)}); + CheckDispatchExact(name, {decimal256(3, 1), decimal256(2, 1)}); + CheckDispatchExactFails(name, {decimal256(2, 0), decimal256(2, 1)}); + CheckDispatchExactFails(name, {decimal256(2, 1), decimal256(2, 0)}); + } + } + + { + std::string name = "multiply"; + for (std::string suffix : {"", "_checked"}) { + name += suffix; + ARROW_SCOPED_TRACE(name); + + CheckDispatchExact(name, {decimal128(2, 1), decimal128(2, 1)}); + CheckDispatchExact(name, {decimal128(3, 1), decimal128(2, 1)}); + CheckDispatchExact(name, {decimal128(2, 0), decimal128(2, 1)}); + CheckDispatchExact(name, {decimal128(2, 1), decimal128(2, 0)}); + + CheckDispatchExact(name, {decimal256(2, 1), decimal256(2, 1)}); + CheckDispatchExact(name, {decimal256(3, 1), decimal256(2, 1)}); + CheckDispatchExact(name, {decimal256(2, 0), decimal256(2, 1)}); + CheckDispatchExact(name, {decimal256(2, 1), decimal256(2, 0)}); + } + } + + { + std::string name = "divide"; + for (std::string suffix : {"", "_checked"}) { + name += suffix; + ARROW_SCOPED_TRACE(name); + + CheckDispatchExactFails(name, {decimal128(2, 1), decimal128(2, 1)}); + CheckDispatchExactFails(name, {decimal128(3, 1), decimal128(2, 1)}); + CheckDispatchExactFails(name, {decimal128(2, 1), decimal128(2, 0)}); + CheckDispatchExactFails(name, {decimal128(2, 0), decimal128(2, 1)}); + + CheckDispatchExactFails(name, {decimal256(2, 1), decimal256(2, 1)}); + CheckDispatchExactFails(name, {decimal256(3, 1), decimal256(2, 1)}); + CheckDispatchExactFails(name, {decimal256(2, 1), decimal256(2, 0)}); + CheckDispatchExactFails(name, {decimal256(2, 0), decimal256(2, 1)}); + } + } +} + 
TEST_F(TestBinaryArithmeticDecimal, DispatchBest) { // decimal, floating point for (std::string name : {"add", "subtract", "multiply", "divide"}) { @@ -1927,24 +2025,36 @@ TEST_F(TestBinaryArithmeticDecimal, DispatchBest) { name += suffix; SCOPED_TRACE(name); - CheckDispatchBest(name, {int64(), decimal128(1, 0)}, - {decimal128(23, 4), decimal128(1, 0)}); - CheckDispatchBest(name, {decimal128(1, 0), int64()}, - {decimal128(21, 20), decimal128(19, 0)}); - - CheckDispatchBest(name, {decimal128(2, 1), decimal128(2, 1)}, - {decimal128(6, 5), decimal128(2, 1)}); - CheckDispatchBest(name, {decimal256(2, 1), decimal256(2, 1)}, - {decimal256(6, 5), decimal256(2, 1)}); - CheckDispatchBest(name, {decimal128(2, 1), decimal256(2, 1)}, - {decimal256(6, 5), decimal256(2, 1)}); - CheckDispatchBest(name, {decimal256(2, 1), decimal128(2, 1)}, - {decimal256(6, 5), decimal256(2, 1)}); - - CheckDispatchBest(name, {decimal128(2, 0), decimal128(2, 1)}, - {decimal128(7, 5), decimal128(2, 1)}); - CheckDispatchBest(name, {decimal128(2, 1), decimal128(2, 0)}, - {decimal128(5, 4), decimal128(2, 0)}); + CheckDispatchBestWithCastedTypes(name, {int64(), decimal128(1, 0)}, + {decimal128(23, 4), decimal128(1, 0)}); + CheckDispatchBestWithCastedTypes(name, {decimal128(1, 0), int64()}, + {decimal128(21, 20), decimal128(19, 0)}); + + CheckDispatchBestWithCastedTypes(name, {decimal128(2, 1), decimal128(2, 1)}, + {decimal128(6, 5), decimal128(2, 1)}); + CheckDispatchBestWithCastedTypes(name, {decimal256(2, 1), decimal256(2, 1)}, + {decimal256(6, 5), decimal256(2, 1)}); + CheckDispatchBestWithCastedTypes(name, {decimal128(2, 1), decimal256(2, 1)}, + {decimal256(6, 5), decimal256(2, 1)}); + CheckDispatchBestWithCastedTypes(name, {decimal256(2, 1), decimal128(2, 1)}, + {decimal256(6, 5), decimal256(2, 1)}); + + CheckDispatchBestWithCastedTypes(name, {decimal128(2, 0), decimal128(2, 1)}, + {decimal128(7, 5), decimal128(2, 1)}); + CheckDispatchBestWithCastedTypes(name, {decimal128(2, 1), decimal128(2, 0)}, 
+ {decimal128(5, 4), decimal128(2, 0)}); + + // GH-39875: Expression call to decimal(3 ,2) / decimal(15, 2) wrong result type. + // decimal128(3, 2) / decimal128(15, 2) + // -> decimal128(19, 18) / decimal128(15, 2) = decimal128(19, 16) + CheckDispatchBestWithCastedTypes(name, {decimal128(3, 2), decimal128(15, 2)}, + {decimal128(19, 18), decimal128(15, 2)}); + + // GH-40911: Expression call to decimal(7 ,2) / decimal(6, 1) wrong result type. + // decimal128(7, 2) / decimal128(6, 1) + // -> decimal128(14, 9) / decimal128(6, 1) = decimal128(14, 8) + CheckDispatchBestWithCastedTypes(name, {decimal128(7, 2), decimal128(6, 1)}, + {decimal128(14, 9), decimal128(6, 1)}); } } for (std::string name : {"atan2", "logb", "logb_checked", "power", "power_checked"}) { @@ -2234,6 +2344,14 @@ TEST_F(TestBinaryArithmeticDecimal, Divide) { CheckScalarBinary("divide", left, right, expected); } + // decimal(p1, s1) decimal(p2, s2) where s1 < s2 + { + auto left = ScalarFromJSON(decimal128(6, 5), R"("2.71828")"); + auto right = ScalarFromJSON(decimal128(7, 6), R"("3.141592")"); + auto expected = ScalarFromJSON(decimal128(14, 7), R"("0.8652555")"); + CheckScalarBinary("divide", left, right, expected); + } + // decimal128 decimal256 { auto left = ScalarFromJSON(decimal256(6, 5), R"("2.71828")"); @@ -2490,6 +2608,8 @@ TYPED_TEST(TestBinaryArithmeticUnsigned, ShiftRightOverflowRaises) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigSin) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); this->AssertUnaryOp(Sin, "[Inf, -Inf]", "[NaN, NaN]"); for (auto check_overflow : {false, true}) { @@ -2502,6 +2622,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigSin) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigSinh) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); auto sinh = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Sinh(arg, ctx); @@ -2514,6 +2636,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigSinh) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigCos) { + 
SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); this->AssertUnaryOp(Cos, "[Inf, -Inf]", "[NaN, NaN]"); for (auto check_overflow : {false, true}) { @@ -2526,6 +2650,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigCos) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigCosh) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); auto cosh = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Cosh(arg, ctx); @@ -2538,6 +2664,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigCosh) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigTan) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); this->AssertUnaryOp(Tan, "[Inf, -Inf]", "[NaN, NaN]"); for (auto check_overflow : {false, true}) { @@ -2552,6 +2680,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigTan) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigTanh) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); auto tanh = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Tanh(arg, ctx); @@ -2564,6 +2694,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigTanh) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigAsin) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); this->AssertUnaryOp(Asin, "[Inf, -Inf, -2, 2]", "[NaN, NaN, NaN, NaN]"); for (auto check_overflow : {false, true}) { @@ -2576,6 +2708,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigAsin) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigAsinh) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); auto asinh = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Asinh(arg, ctx); @@ -2588,6 +2722,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigAsinh) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigAcos) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); this->AssertUnaryOp(Asin, "[Inf, -Inf, -2, 2]", "[NaN, NaN, NaN, NaN]"); for (auto check_overflow : {false, true}) { @@ -2600,6 +2736,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigAcos) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigAcosh) { + 
SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); this->AssertUnaryOp(Acosh, "[0, -1, -Inf]", "[NaN, NaN, NaN]"); for (auto check_overflow : {false, true}) { @@ -2612,6 +2750,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigAcosh) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigAtan) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); auto atan = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Atan(arg, ctx); @@ -2623,6 +2763,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, TrigAtan) { } TYPED_TEST(TestBinaryArithmeticFloating, TrigAtan2) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); auto atan2 = [](const Datum& y, const Datum& x, ArithmeticOptions, ExecContext* ctx) { return Atan2(y, x, ctx); @@ -2637,6 +2779,8 @@ TYPED_TEST(TestBinaryArithmeticFloating, TrigAtan2) { } TYPED_TEST(TestUnaryArithmeticFloating, TrigAtanh) { + SKIP_IF_HALF_FLOAT(); + this->SetNansEqual(true); this->AssertUnaryOp(Atanh, "[-Inf, Inf, -2, 2]", "[NaN, NaN, NaN, NaN]"); this->AssertUnaryOp(Atanh, "[-1, 1]", "[-Inf, Inf]"); @@ -2679,27 +2823,31 @@ TYPED_TEST(TestBinaryArithmeticIntegral, Trig) { } TYPED_TEST(TestUnaryArithmeticFloating, Log) { + SKIP_IF_HALF_FLOAT(); + using CType = typename TestFixture::CType; + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + const auto lowest_val = this->MakeScalar(std::numeric_limits::lowest()); + this->SetNansEqual(true); - auto min_val = std::numeric_limits::min(); - auto max_val = std::numeric_limits::max(); for (auto check_overflow : {false, true}) { this->SetOverflowCheck(check_overflow); this->AssertUnaryOp(Ln, "[1, 2.718281828459045, null, NaN, Inf]", "[0, 1, null, NaN, Inf]"); // N.B. 
min() for float types is smallest normal number > 0 - this->AssertUnaryOp(Ln, min_val, std::log(min_val)); - this->AssertUnaryOp(Ln, max_val, std::log(max_val)); + this->AssertUnaryOp(Ln, min_val, CType{std::log(ToCFloat(min_val))}); + this->AssertUnaryOp(Ln, max_val, CType{std::log(ToCFloat(max_val))}); this->AssertUnaryOp(Log10, "[1, 10, null, NaN, Inf]", "[0, 1, null, NaN, Inf]"); - this->AssertUnaryOp(Log10, min_val, std::log10(min_val)); - this->AssertUnaryOp(Log10, max_val, std::log10(max_val)); + this->AssertUnaryOp(Log10, min_val, CType{std::log10(ToCFloat(min_val))}); + this->AssertUnaryOp(Log10, max_val, CType{std::log10(ToCFloat(max_val))}); this->AssertUnaryOp(Log2, "[1, 2, null, NaN, Inf]", "[0, 1, null, NaN, Inf]"); - this->AssertUnaryOp(Log2, min_val, std::log2(min_val)); - this->AssertUnaryOp(Log2, max_val, std::log2(max_val)); + this->AssertUnaryOp(Log2, min_val, CType{std::log2(ToCFloat(min_val))}); + this->AssertUnaryOp(Log2, max_val, CType{std::log2(ToCFloat(max_val))}); this->AssertUnaryOp(Log1p, "[0, 1.718281828459045, null, NaN, Inf]", "[0, 1, null, NaN, Inf]"); - this->AssertUnaryOp(Log1p, min_val, std::log1p(min_val)); - this->AssertUnaryOp(Log1p, max_val, std::log1p(max_val)); + this->AssertUnaryOp(Log1p, min_val, CType{std::log1p(ToCFloat(min_val))}); + this->AssertUnaryOp(Log1p, max_val, CType{std::log1p(ToCFloat(max_val))}); } this->SetOverflowCheck(false); this->AssertUnaryOp(Ln, "[-Inf, -1, 0, Inf]", "[NaN, NaN, -Inf, Inf]"); @@ -2711,7 +2859,6 @@ TYPED_TEST(TestUnaryArithmeticFloating, Log) { this->AssertUnaryOpRaises(Ln, "[-1]", "logarithm of negative number"); this->AssertUnaryOpRaises(Ln, "[-Inf]", "logarithm of negative number"); - auto lowest_val = MakeScalar(std::numeric_limits::lowest()); // N.B. 
RapidJSON on some platforms raises "Number too big to be stored in double" so // don't bounce through JSON EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, @@ -2759,6 +2906,8 @@ TYPED_TEST(TestBinaryArithmeticIntegral, Log) { } TYPED_TEST(TestBinaryArithmeticFloating, Log) { + SKIP_IF_HALF_FLOAT(); + using CType = typename TestFixture::CType; this->SetNansEqual(true); auto min_val = std::numeric_limits::min(); @@ -2768,26 +2917,26 @@ TYPED_TEST(TestBinaryArithmeticFloating, Log) { // N.B. min() for float types is smallest normal number > 0 this->AssertBinop(Logb, "[1, 10, null, NaN, Inf]", "[100, 10, null, 2, 10]", "[0, 1, null, NaN, Inf]"); - this->AssertBinop(Logb, min_val, 10, - static_cast(std::log(min_val) / std::log(10))); - this->AssertBinop(Logb, max_val, 10, - static_cast(std::log(max_val) / std::log(10))); + this->AssertBinop(Logb, min_val, CType{10.}, + CType{std::log(ToCFloat(min_val)) / std::log(10.f)}); + this->AssertBinop(Logb, max_val, CType{10.}, + CType{std::log(ToCFloat(max_val)) / std::log(10.f)}); } this->AssertBinop(Logb, "[1.0, 10.0, null]", "[10.0, 10.0, null]", "[0.0, 1.0, null]"); this->AssertBinop(Logb, "[1.0, 2.0, null]", "[2.0, 2.0, null]", "[0.0, 1.0, null]"); - this->AssertBinop(Logb, "[10.0, 100.0, 1000.0, null]", this->MakeScalar(10), + this->AssertBinop(Logb, "[10.0, 100.0, 1000.0, null]", this->MakeScalar(10.), "[1.0, 2.0, 3.0, null]"); this->AssertBinop(Logb, "[1, 2, 4, 8]", this->MakeScalar(0.25), "[-0.0, -0.5, -1.0, -1.5]"); this->SetOverflowCheck(false); - this->AssertBinop(Logb, "[-Inf, -1, 0, Inf]", this->MakeScalar(10), + this->AssertBinop(Logb, "[-Inf, -1, 0, Inf]", this->MakeScalar(10.), "[NaN, NaN, -Inf, Inf]"); - this->AssertBinop(Logb, "[-Inf, -1, 0, Inf]", this->MakeScalar(2), + this->AssertBinop(Logb, "[-Inf, -1, 0, Inf]", this->MakeScalar(2.), "[NaN, NaN, -Inf, Inf]"); this->AssertBinop(Logb, "[-Inf, -1, 0, Inf]", "[2, 10, 0, 0]", "[NaN, NaN, NaN, NaN]"); - this->AssertBinop(Logb, "[-Inf, -1, 0, Inf]", this->MakeScalar(0), 
+ this->AssertBinop(Logb, "[-Inf, -1, 0, Inf]", this->MakeScalar(0.), "[NaN, NaN, NaN, NaN]"); - this->AssertBinop(Logb, "[-Inf, -2, -1, Inf]", this->MakeScalar(2), + this->AssertBinop(Logb, "[-Inf, -2, -1, Inf]", this->MakeScalar(2.), "[NaN, NaN, NaN, Inf]"); this->SetOverflowCheck(true); this->AssertBinopRaises(Logb, "[0]", "[2]", "logarithm of zero"); @@ -2848,18 +2997,21 @@ TYPED_TEST(TestUnaryArithmeticIntegral, Sqrt) { } TYPED_TEST(TestUnaryArithmeticFloating, Sqrt) { + SKIP_IF_HALF_FLOAT(); + using CType = typename TestFixture::CType; + constexpr auto min = std::numeric_limits::min(); + constexpr auto max = std::numeric_limits::max(); + this->SetNansEqual(true); for (auto check_overflow : {false, true}) { - const auto min_val = std::numeric_limits::min(); this->SetOverflowCheck(check_overflow); this->AssertUnaryOp(Sqrt, "[1, 2, null, NaN, Inf]", "[1, 1.414213562, null, NaN, Inf]"); - this->AssertUnaryOp(Sqrt, min_val, static_cast(std::sqrt(min_val))); + this->AssertUnaryOp(Sqrt, min, CType(std::sqrt(ToCFloat(min)))); #ifndef __MINGW32__ // this is problematic and produces a slight difference on MINGW - const auto max_val = std::numeric_limits::max(); - this->AssertUnaryOp(Sqrt, max_val, static_cast(std::sqrt(max_val))); + this->AssertUnaryOp(Sqrt, max, CType(std::sqrt(ToCFloat(max)))); #endif } this->AssertUnaryOpRaises(Sqrt, "[-1]", "square root of negative number"); @@ -2868,8 +3020,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, Sqrt) { TYPED_TEST(TestUnaryArithmeticSigned, Sign) { using CType = typename TestFixture::CType; - auto min = std::numeric_limits::min(); - auto max = std::numeric_limits::max(); + constexpr auto min = std::numeric_limits::min(); + constexpr auto max = std::numeric_limits::max(); auto sign = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Sign(arg, ctx); @@ -2887,8 +3039,8 @@ TYPED_TEST(TestUnaryArithmeticSigned, Sign) { TYPED_TEST(TestUnaryArithmeticUnsigned, Sign) { using CType = typename TestFixture::CType; - 
auto min = std::numeric_limits::min(); - auto max = std::numeric_limits::max(); + constexpr auto min = std::numeric_limits::min(); + constexpr auto max = std::numeric_limits::max(); auto sign = [](const Datum& arg, ArithmeticOptions, ExecContext* ctx) { return Sign(arg, ctx); @@ -2905,8 +3057,8 @@ TYPED_TEST(TestUnaryArithmeticUnsigned, Sign) { TYPED_TEST(TestUnaryArithmeticFloating, Sign) { using CType = typename TestFixture::CType; - auto min = std::numeric_limits::lowest(); - auto max = std::numeric_limits::max(); + constexpr auto min = std::numeric_limits::lowest(); + constexpr auto max = std::numeric_limits::max(); this->SetNansEqual(true); @@ -2926,6 +3078,8 @@ TYPED_TEST(TestUnaryArithmeticFloating, Sign) { this->AssertUnaryOp(sign, this->MakeScalar(max), this->MakeScalar(1)); } +#undef SKIP_IF_HALF_FLOAT + } // namespace } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc index a5e2893d77b..ce864d13403 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc @@ -18,6 +18,7 @@ #include #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap.h" #include "arrow/util/bitmap_ops.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_boolean_benchmark.cc index 89091186ae2..73fc151b74b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean_benchmark.cc @@ -42,7 +42,7 @@ static void ArrayArrayKernel(benchmark::State& state) { auto rhs = rand.Boolean(array_size, /*true_probability=*/0.5, args.null_proportion); for (auto _ : state) { - ABORT_NOT_OK(Op(lhs, rhs, nullptr).status()); + ABORT_NOT_OK(Op(lhs, rhs, nullptr)); } state.SetItemsProcessed(state.iterations() * array_size); } diff 
--git a/cpp/src/arrow/compute/kernels/scalar_cast_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_cast_benchmark.cc index 04749a5bea2..3899254c9b9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_benchmark.cc @@ -37,7 +37,7 @@ static void BenchmarkNumericCast(benchmark::State& state, random::RandomArrayGenerator rand(kSeed); auto array = rand.Numeric(args.size, min, max, args.null_proportion); for (auto _ : state) { - ABORT_NOT_OK(Cast(array, to_type, options).status()); + ABORT_NOT_OK(Cast(array, to_type, options)); } } @@ -54,7 +54,7 @@ static void BenchmarkFloatingToIntegerCast(benchmark::State& state, std::shared_ptr values_as_float = *Cast(*array, from_type); for (auto _ : state) { - ABORT_NOT_OK(Cast(values_as_float, to_type, options).status()); + ABORT_NOT_OK(Cast(values_as_float, to_type, options)); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc index 9ea167f5552..53eb5c9791e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc @@ -33,6 +33,8 @@ using internal::CopyBitmap; namespace compute { namespace internal { +namespace { + Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const CastOptions& options = CastState::Get(ctx); const auto& out_type = checked_cast(*out->type()); @@ -95,6 +97,8 @@ void AddDictionaryCast(CastFunction* func) { DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel))); } +} // namespace + std::vector> GetDictionaryCasts() { auto cast_dict = std::make_shared("cast_dictionary", Type::DICTIONARY); AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dict.get()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 8e12e823574..3ab42d89b6e 100644 --- 
a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -18,7 +18,7 @@ // Implementation of casting to (or between) list types #include -#include +#include #include #include @@ -338,41 +338,25 @@ struct CastStruct { std::vector fields_to_select(out_field_count, -1); - std::set all_in_field_names; + std::multimap in_fields; for (int in_field_index = 0; in_field_index < in_field_count; ++in_field_index) { - all_in_field_names.insert(in_type.field(in_field_index)->name()); + in_fields.insert({in_type.field(in_field_index)->name(), in_field_index}); } - for (int in_field_index = 0, out_field_index = 0; - out_field_index < out_field_count;) { + for (int out_field_index = 0; out_field_index < out_field_count; ++out_field_index) { const auto& out_field = out_type.field(out_field_index); - if (in_field_index < in_field_count) { - const auto& in_field = in_type.field(in_field_index); - // If there are more in_fields check if they match the out_field. - if (in_field->name() == out_field->name()) { - // Found matching in_field and out_field. - fields_to_select[out_field_index++] = in_field_index; - // Using the same in_field for multiple out_fields is not allowed. - in_field_index++; - continue; - } - } - if (all_in_field_names.count(out_field->name()) == 0 && out_field->nullable()) { - // Didn't match current in_field, but we can fill with null. - // Filling with null is only acceptable on nullable fields when there - // is definitely no in_field with matching name. - - fields_to_select[out_field_index++] = kFillNullSentinel; - } else if (in_field_index < in_field_count) { - // Didn't match current in_field, and the we cannot fill with null, so - // try next in_field. - in_field_index++; + + // Take the first field with matching name, if any. Extract it from the map so it + // can't be reused. 
+ auto maybe_in_field_index = in_fields.extract(out_field->name()); + if (!maybe_in_field_index.empty()) { + fields_to_select[out_field_index] = maybe_in_field_index.mapped(); + } else if (out_field->nullable()) { + fields_to_select[out_field_index] = kFillNullSentinel; } else { - // Didn't match current in_field, we cannot fill with null, and there - // are no more in_fields to try, so fail. - return Status::TypeError( - "struct fields don't match or are in the wrong order: Input fields: ", - in_type.ToString(), " output fields: ", out_type.ToString()); + return Status::TypeError("struct fields don't match: non-nullable out field `", + out_field->name(), "` not found in in fields ", + in_type.ToString()); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index 9cb2d9a1b2a..ba2a8c4b563 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -41,6 +41,8 @@ using util::Float16; namespace compute { namespace internal { +namespace { + Status CastIntegerToInteger(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const auto& options = checked_cast(ctx->state())->options; if (!options.allow_int_overflow) { @@ -276,6 +278,8 @@ Status CastIntegerToFloating(KernelContext* ctx, const ExecSpan& batch, ExecResu return Status::OK(); } +} // namespace + // ---------------------------------------------------------------------- // Boolean to number diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index b0e0c09d234..3442d46f16e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -196,8 +196,8 @@ struct TemporalToStringCastFunctor { static const std::string kFormatString = "%Y-%m-%d %H:%M:%S%z"; static const std::string kUtcFormatString = "%Y-%m-%d %H:%M:%SZ"; DCHECK(!timezone.empty()); 
- ARROW_ASSIGN_OR_RAISE(const time_zone* tz, LocateZone(timezone)); - ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale("C")); + ARROW_ASSIGN_OR_RAISE(auto tz, LocateZone(timezone)); + ARROW_ASSIGN_OR_RAISE(auto locale, GetLocale("C")); TimestampFormatter formatter{ timezone == "UTC" ? kUtcFormatString : kFormatString, tz, locale}; return VisitArraySpanInline( diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc index 4c62da5a39b..d076186e563 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc @@ -36,6 +36,8 @@ using internal::ParseYYYY_MM_DD; namespace compute { namespace internal { +namespace { + constexpr int64_t kMillisecondsInDay = 86400000; // ---------------------------------------------------------------------- @@ -142,6 +144,8 @@ Status ExtractTemporal(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou return Status::Invalid("Unknown timestamp unit: ", ty); } +} // namespace + // and template struct CastFunctor< @@ -491,6 +495,8 @@ struct CastFunctor void AddCrossUnitCast(CastFunction* func) { ScalarKernel kernel; @@ -652,6 +658,8 @@ std::shared_ptr GetTimestampCast() { return func; } +} // namespace + std::vector> GetTemporalCasts() { std::vector> functions; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 528e6acded6..44b50b31f75 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -3825,37 +3825,34 @@ static void CheckStructToStructSubset( // field does not exist ASSERT_OK_AND_ASSIGN(auto dest6, - StructArray::Make({a1, d1, nulls}, {"a", "d", "f"})); + StructArray::Make({a2, d2, nulls}, {"a", "d", "f"})); CheckCast(src, dest6); const auto dest7 = arrow::struct_( {std::make_shared("a", int8()), std::make_shared("d", int16()), std::make_shared("f", int64(), 
/*nullable=*/false)}); const auto options7 = CastOptions::Safe(dest7); - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src, options7)); + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, + ::testing::HasSubstr("struct fields don't match"), + Cast(src, options7)); // fields in wrong order - const auto dest8 = arrow::struct_({std::make_shared("a", int8()), - std::make_shared("c", int16()), - std::make_shared("b", int64())}); - const auto options8 = CastOptions::Safe(dest8); - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src, options8)); + ASSERT_OK_AND_ASSIGN(auto dest8, StructArray::Make({a2, c2, b2}, {"a", "c", "b"})); + CheckCast(src, dest8); // duplicate missing field names - const auto dest9 = arrow::struct_( + ASSERT_OK_AND_ASSIGN(auto dest9, + StructArray::Make({a2, c2, d2, nulls}, {"a", "c", "d", "a"})); + CheckCast(src, dest9); + + const auto dest10 = arrow::struct_( {std::make_shared("a", int8()), std::make_shared("c", int16()), - std::make_shared("d", int32()), std::make_shared("a", int64())}); - const auto options9 = CastOptions::Safe(dest9); - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src, options9)); + std::make_shared("d", int32()), + std::make_shared("a", int64(), /*nullable=*/false)}); + const auto options10 = CastOptions::Safe(dest10); + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, + ::testing::HasSubstr("struct fields don't match"), + Cast(src, options10)); // duplicate present field names ASSERT_OK_AND_ASSIGN( @@ -3875,6 +3872,21 @@ static void CheckStructToStructSubset( auto dest3_duplicate_field_names, StructArray::Make({a2, b2, c2}, std::vector{"a", "a", "a"})); CheckCast(src_duplicate_field_names, dest3_duplicate_field_names); + + // more duplicate outputs than duplicate inputs + 
ASSERT_OK_AND_ASSIGN(auto dest4_duplicate_field_names, + StructArray::Make({a2, b2, c2, nulls}, {"a", "a", "a", "a"})); + CheckCast(src_duplicate_field_names, dest4_duplicate_field_names); + + const auto dest5_duplicate_field_names = arrow::struct_( + {std::make_shared("a", int8()), std::make_shared("a", int8()), + std::make_shared("a", int8()), + std::make_shared("a", int8(), /*nullable=*/false)}); + const auto options5_duplicate_field_names = + CastOptions::Safe(dest5_duplicate_field_names); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, ::testing::HasSubstr("struct fields don't match"), + Cast(src_duplicate_field_names, options5_duplicate_field_names)); } } } @@ -3941,37 +3953,36 @@ static void CheckStructToStructSubsetWithNulls( // field does not exist ASSERT_OK_AND_ASSIGN( auto dest6_null, - StructArray::Make({a1, d1, nulls}, {"a", "d", "f"}, null_bitmap)); + StructArray::Make({a2, d2, nulls}, {"a", "d", "f"}, null_bitmap)); CheckCast(src_null, dest6_null); const auto dest7_null = arrow::struct_( {std::make_shared("a", int8()), std::make_shared("d", int16()), std::make_shared("f", int64(), /*nullable=*/false)}); const auto options7_null = CastOptions::Safe(dest7_null); - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src_null, options7_null)); + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, + ::testing::HasSubstr("struct fields don't match"), + Cast(src_null, options7_null)); // fields in wrong order - const auto dest8_null = arrow::struct_({std::make_shared("a", int8()), - std::make_shared("c", int16()), - std::make_shared("b", int64())}); - const auto options8_null = CastOptions::Safe(dest8_null); - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src_null, options8_null)); + ASSERT_OK_AND_ASSIGN(auto dest8_null, + StructArray::Make({a2, c2, b2}, {"a", "c", "b"}, null_bitmap)); + 
CheckCast(src_null, dest8_null); // duplicate missing field names - const auto dest9_null = arrow::struct_( + ASSERT_OK_AND_ASSIGN( + auto dest9_null, + StructArray::Make({a2, c2, d2, nulls}, {"a", "c", "d", "a"}, null_bitmap)); + CheckCast(src_null, dest9_null); + + const auto dest10_null = arrow::struct_( {std::make_shared("a", int8()), std::make_shared("c", int16()), - std::make_shared("d", int32()), std::make_shared("a", int64())}); - const auto options9_null = CastOptions::Safe(dest9_null); - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src_null, options9_null)); + std::make_shared("d", int64()), + std::make_shared("a", int8(), /*nullable=*/false)}); + const auto options10_null = CastOptions::Safe(dest10_null); + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, + ::testing::HasSubstr("struct fields don't match"), + Cast(src_null, options10_null)); // duplicate present field values ASSERT_OK_AND_ASSIGN( @@ -3994,6 +4005,22 @@ static void CheckStructToStructSubsetWithNulls( StructArray::Make({a2, b2, c2}, std::vector{"a", "a", "a"}, null_bitmap)); CheckCast(src_duplicate_field_names_null, dest3_duplicate_field_names_null); + + // more duplicate outputs than duplicate inputs + ASSERT_OK_AND_ASSIGN( + auto dest4_duplicate_field_names_null, + StructArray::Make({a2, b2, c2, nulls}, {"a", "a", "a", "a"}, null_bitmap)); + CheckCast(src_duplicate_field_names_null, dest4_duplicate_field_names_null); + + const auto dest5_duplicate_field_names_null = arrow::struct_( + {std::make_shared("a", int8()), std::make_shared("a", int8()), + std::make_shared("a", int8()), + std::make_shared("a", int8(), /*nullable=*/false)}); + const auto options5_duplicate_field_names_null = + CastOptions::Safe(dest5_duplicate_field_names_null); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, ::testing::HasSubstr("struct fields don't match"), + Cast(src_duplicate_field_names_null, 
options5_duplicate_field_names_null)); } } } @@ -4024,9 +4051,7 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { const auto options2 = CastOptions::Safe(dest2); EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src, options2)); + TypeError, ::testing::HasSubstr("struct fields don't match"), Cast(src, options2)); } TEST(Cast, StructToBiggerStruct) { @@ -4042,9 +4067,7 @@ TEST(Cast, StructToBiggerStruct) { const auto options1 = CastOptions::Safe(dest1); EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src, options1)); + TypeError, ::testing::HasSubstr("struct fields don't match"), Cast(src, options1)); const auto dest2 = arrow::struct_({std::make_shared("a", int8()), @@ -4053,9 +4076,7 @@ TEST(Cast, StructToBiggerStruct) { const auto options2 = CastOptions::Safe(dest2); EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct fields don't match or are in the wrong order"), - Cast(src, options2)); + TypeError, ::testing::HasSubstr("struct fields don't match"), Cast(src, options2)); } TEST(Cast, StructToBiggerNullableStruct) { diff --git a/cpp/src/arrow/compute/kernels/scalar_compare.cc b/cpp/src/arrow/compute/kernels/scalar_compare.cc index f40a6d6b282..773a3f684bd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare.cc +++ b/cpp/src/arrow/compute/kernels/scalar_compare.cc @@ -22,6 +22,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/type.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" @@ -435,8 +436,8 @@ std::shared_ptr MakeCompareFunction(std::string name, FunctionDo for (const auto id : {Type::DECIMAL128, Type::DECIMAL256}) { auto exec = GenerateDecimal(id); - DCHECK_OK( - func->AddKernel({InputType(id), InputType(id)}, boolean(), 
std::move(exec))); + DCHECK_OK(func->AddKernel({InputType(id), InputType(id)}, boolean(), std::move(exec), + /*init=*/nullptr, DecimalsHaveSameScale())); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_compare_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_compare_benchmark.cc index fdfd63498f5..6e2bfc4434d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_compare_benchmark.cc @@ -38,7 +38,7 @@ static void BenchArrayScalar(benchmark::State& state, const std::string& op) { auto array = rand.ArrayOf(ty, args.size, args.null_proportion); auto scalar = *rand.ArrayOf(ty, 1, 0)->GetScalar(0); for (auto _ : state) { - ABORT_NOT_OK(CallFunction(op, {array, Datum(scalar)}).status()); + ABORT_NOT_OK(CallFunction(op, {array, Datum(scalar)})); } } @@ -50,7 +50,7 @@ static void BenchArrayArray(benchmark::State& state, const std::string& op) { auto lhs = rand.ArrayOf(ty, args.size, args.null_proportion); auto rhs = rand.ArrayOf(ty, args.size, args.null_proportion); for (auto _ : state) { - ABORT_NOT_OK(CallFunction(op, {lhs, rhs}).status()); + ABORT_NOT_OK(CallFunction(op, {lhs, rhs})); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index cabf520b727..d885db4cd93 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -23,6 +23,7 @@ #include "arrow/compute/api.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/copy_data_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/bit_block_counter.h" @@ -1450,6 +1451,20 @@ struct CaseWhenFunction : ScalarFunction { if (auto kernel = DispatchExactImpl(this, *types)) return kernel; return arrow::compute::detail::NoMatchingKernel(this, *types); } + + static std::shared_ptr DecimalMatchConstraint() { + static auto constraint = 
+ MatchConstraint::Make([](const std::vector& types) -> bool { + DCHECK_GE(types.size(), 2); + DCHECK(std::all_of(types.begin() + 1, types.end(), [](const TypeHolder& type) { + return is_decimal(type.id()); + })); + return std::all_of( + types.begin() + 2, types.end(), + [&types](const TypeHolder& type) { return type == types[1]; }); + }); + return constraint; + } }; // Implement a 'case when' (SQL)/'select' (NumPy) function for any scalar conditions @@ -2711,10 +2726,11 @@ struct ChooseFunction : ScalarFunction { }; void AddCaseWhenKernel(const std::shared_ptr& scalar_function, - detail::GetTypeId get_id, ArrayKernelExec exec) { + detail::GetTypeId get_id, ArrayKernelExec exec, + std::shared_ptr constraint = nullptr) { ScalarKernel kernel( KernelSignature::Make({InputType(Type::STRUCT), InputType(get_id.id)}, LastType, - /*is_varargs=*/true), + /*is_varargs=*/true, std::move(constraint)), exec); if (is_fixed_width(get_id.id)) { kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; @@ -2870,7 +2886,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddPrimitiveIfElseKernels(func, TemporalTypes()); AddPrimitiveIfElseKernels(func, IntervalTypes()); AddPrimitiveIfElseKernels(func, DurationTypes()); - AddPrimitiveIfElseKernels(func, {boolean()}); + AddPrimitiveIfElseKernels(func, {boolean(), float16()}); AddNullIfElseKernel(func); AddBinaryIfElseKernels(func, BaseBinaryTypes()); AddFixedWidthIfElseKernel(func); @@ -2886,11 +2902,13 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddPrimitiveCaseWhenKernels(func, TemporalTypes()); AddPrimitiveCaseWhenKernels(func, IntervalTypes()); AddPrimitiveCaseWhenKernels(func, DurationTypes()); - AddPrimitiveCaseWhenKernels(func, {boolean(), null()}); + AddPrimitiveCaseWhenKernels(func, {boolean(), null(), float16()}); AddCaseWhenKernel(func, Type::FIXED_SIZE_BINARY, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DECIMAL256, 
CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec, + CaseWhenFunction::DecimalMatchConstraint()); + AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec, + CaseWhenFunction::DecimalMatchConstraint()); AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); AddNestedCaseWhenKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); @@ -2902,7 +2920,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddPrimitiveCoalesceKernels(func, TemporalTypes()); AddPrimitiveCoalesceKernels(func, IntervalTypes()); AddPrimitiveCoalesceKernels(func, DurationTypes()); - AddPrimitiveCoalesceKernels(func, {boolean(), null()}); + AddPrimitiveCoalesceKernels(func, {boolean(), null(), float16()}); AddCoalesceKernel(func, Type::FIXED_SIZE_BINARY, CoalesceFunctor::Exec); AddCoalesceKernel(func, Type::DECIMAL128, CoalesceFunctor::Exec); @@ -2920,7 +2938,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddPrimitiveChooseKernels(func, TemporalTypes()); AddPrimitiveChooseKernels(func, IntervalTypes()); AddPrimitiveChooseKernels(func, DurationTypes()); - AddPrimitiveChooseKernels(func, {boolean(), null()}); + AddPrimitiveChooseKernels(func, {boolean(), null(), float16()}); AddChooseKernel(func, Type::FIXED_SIZE_BINARY, ChooseFunctor::Exec); AddChooseKernel(func, Type::DECIMAL128, ChooseFunctor::Exec); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 76ad19f3c48..196912679ba 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -80,9 +80,9 @@ using IntegralArrowTypes = ::testing::Types; #else using IfElseNumericBasedTypes = ::testing::Types; + Int32Type, Int64Type, HalfFloatType, FloatType, DoubleType, + Date32Type, Date64Type, Time32Type, Time64Type, TimestampType, + MonthIntervalType, DurationType>; #endif TYPED_TEST_SUITE(TestIfElsePrimitive, IfElseNumericBasedTypes); 
@@ -1807,6 +1807,74 @@ TEST(TestCaseWhen, Decimal) { } } +TEST(TestCaseWhen, DecimalPromotion) { + auto check_case_when_decimal_promotion = + [](std::shared_ptr body_true, std::shared_ptr body_false, + std::shared_ptr promoted_true, std::shared_ptr promoted_false) { + auto cond_true = ScalarFromJSON(boolean(), "true"); + auto cond_false = ScalarFromJSON(boolean(), "false"); + CheckScalar("case_when", {MakeStruct({cond_true}), body_true, body_false}, + promoted_true); + CheckScalar("case_when", {MakeStruct({cond_false}), body_true, body_false}, + promoted_false); + }; + + const std::vector> precisions = {{10, 20}, {15, 15}, {20, 10}}; + const std::vector> scales = {{3, 9}, {6, 6}, {9, 3}}; + for (auto p : precisions) { + for (auto s : scales) { + auto p1 = p.first; + auto s1 = s.first; + auto p2 = p.second; + auto s2 = s.second; + + auto max_scale = std::max({s1, s2}); + auto scale_up_1 = max_scale - s1; + auto scale_up_2 = max_scale - s2; + auto max_precision = std::max({p1 + scale_up_1, p2 + scale_up_2}); + + // Operand string: 444.777... + std::string str_d1 = + R"(")" + std::string(p1 - s1, '4') + "." + std::string(s1, '7') + R"(")"; + std::string str_d2 = + R"(")" + std::string(p2 - s2, '4') + "." + std::string(s2, '7') + R"(")"; + + // Promoted string: 444.777...000 + std::string str_d1_promoted = R"(")" + std::string(p1 - s1, '4') + "." + + std::string(s1, '7') + + std::string(max_scale - s1, '0') + R"(")"; + std::string str_d2_promoted = R"(")" + std::string(p2 - s2, '4') + "." 
+ + std::string(s2, '7') + + std::string(max_scale - s2, '0') + R"(")"; + + auto d128_1 = decimal128(p1, s1); + auto d128_2 = decimal128(p2, s2); + auto d256_1 = decimal256(p1, s1); + auto d256_2 = decimal256(p2, s2); + auto d128_promoted = decimal128(max_precision, max_scale); + auto d256_promoted = decimal256(max_precision, max_scale); + + auto scalar128_1 = ScalarFromJSON(d128_1, str_d1); + auto scalar128_2 = ScalarFromJSON(d128_2, str_d2); + auto scalar256_1 = ScalarFromJSON(d256_1, str_d1); + auto scalar256_2 = ScalarFromJSON(d256_2, str_d2); + auto scalar128_d1_promoted = ScalarFromJSON(d128_promoted, str_d1_promoted); + auto scalar128_d2_promoted = ScalarFromJSON(d128_promoted, str_d2_promoted); + auto scalar256_d1_promoted = ScalarFromJSON(d256_promoted, str_d1_promoted); + auto scalar256_d2_promoted = ScalarFromJSON(d256_promoted, str_d2_promoted); + + check_case_when_decimal_promotion(scalar128_1, scalar128_2, scalar128_d1_promoted, + scalar128_d2_promoted); + check_case_when_decimal_promotion(scalar128_1, scalar256_2, scalar256_d1_promoted, + scalar256_d2_promoted); + check_case_when_decimal_promotion(scalar256_1, scalar128_2, scalar256_d1_promoted, + scalar256_d2_promoted); + check_case_when_decimal_promotion(scalar256_1, scalar256_2, scalar256_d1_promoted, + scalar256_d2_promoted); + } + } +} + TEST(TestCaseWhen, FixedSizeBinary) { auto type = fixed_size_binary(3); auto cond_true = ScalarFromJSON(boolean(), "true"); @@ -2509,6 +2577,28 @@ TEST(TestCaseWhen, UnionBoolStringRandom) { } } +TEST(TestCaseWhen, DispatchExact) { + // Decimal types with same (p, s) + CheckDispatchExact("case_when", {struct_({field("", boolean())}), decimal128(20, 3), + decimal128(20, 3)}); + CheckDispatchExact("case_when", {struct_({field("", boolean())}), decimal256(20, 3), + decimal256(20, 3)}); + + // Decimal types with different (p, s) + CheckDispatchExactFails("case_when", {struct_({field("", boolean())}), + decimal128(20, 3), decimal128(21, 3)}); + 
CheckDispatchExactFails("case_when", {struct_({field("", boolean())}), + decimal128(20, 1), decimal128(20, 3)}); + CheckDispatchExactFails("case_when", {struct_({field("", boolean())}), + decimal128(20, 3), decimal256(20, 3)}); + CheckDispatchExactFails("case_when", {struct_({field("", boolean())}), + decimal256(20, 3), decimal128(21, 3)}); + CheckDispatchExactFails("case_when", {struct_({field("", boolean())}), + decimal256(20, 3), decimal256(21, 3)}); + CheckDispatchExactFails("case_when", {struct_({field("", boolean())}), + decimal256(20, 1), decimal256(20, 3)}); +} + TEST(TestCaseWhen, DispatchBest) { CheckDispatchBest("case_when", {struct_({field("", boolean())}), int64(), int32()}, {struct_({field("", boolean())}), int64(), int64()}); @@ -2559,6 +2649,32 @@ TEST(TestCaseWhen, DispatchBest) { CheckDispatchBest( "case_when", {struct_({field("", boolean())}), dictionary(int64(), utf8()), utf8()}, {struct_({field("", boolean())}), utf8(), utf8()}); + + // Decimal promotion + CheckDispatchBest( + "case_when", + {struct_({field("", boolean())}), decimal128(20, 3), decimal128(21, 3)}, + {struct_({field("", boolean())}), decimal128(21, 3), decimal128(21, 3)}); + CheckDispatchBest( + "case_when", + {struct_({field("", boolean())}), decimal128(20, 1), decimal128(21, 3)}, + {struct_({field("", boolean())}), decimal128(22, 3), decimal128(22, 3)}); + CheckDispatchBest( + "case_when", + {struct_({field("", boolean())}), decimal128(20, 3), decimal128(21, 1)}, + {struct_({field("", boolean())}), decimal128(23, 3), decimal128(23, 3)}); + CheckDispatchBest( + "case_when", + {struct_({field("", boolean())}), decimal128(20, 3), decimal256(21, 3)}, + {struct_({field("", boolean())}), decimal256(21, 3), decimal256(21, 3)}); + CheckDispatchBest( + "case_when", + {struct_({field("", boolean())}), decimal256(20, 1), decimal128(21, 3)}, + {struct_({field("", boolean())}), decimal256(22, 3), decimal256(22, 3)}); + CheckDispatchBest( + "case_when", + {struct_({field("", boolean())}), 
decimal256(20, 3), decimal256(21, 1)}, + {struct_({field("", boolean())}), decimal256(23, 3), decimal256(23, 3)}); } template @@ -3028,6 +3144,14 @@ TEST(TestCoalesce, Boolean) { ArrayFromJSON(type, "[true, true, false, true]")); CheckScalar("coalesce", {scalar1, values1}, ArrayFromJSON(type, "[false, false, false, false]")); + + // Regression test for GH-47234, which was failing due to a MSVC compiler bug + // (possibly https://developercommunity.visualstudio.com/t/10912292 + // or https://developercommunity.visualstudio.com/t/10945478). + auto values_with_null = ArrayFromJSON(type, "[true, false, false, false, false, null]"); + auto expected = ArrayFromJSON(type, "[true, false, false, false, false, true]"); + auto scalar2 = ScalarFromJSON(type, "true"); + CheckScalar("coalesce", {values_with_null, scalar2}, expected); } TEST(TestCoalesce, DayTimeInterval) { diff --git a/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc index cf8503d6868..4877df46a7a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_list_benchmark.cc @@ -41,7 +41,7 @@ static void BenchmarkListSlice(benchmark::State& state, const ListSliceOptions& auto ctx = default_exec_context(); std::vector input_args = {std::move(array)}; for (auto _ : state) { - ABORT_NOT_OK(CallFunction("list_slice", input_args, &opts, ctx).status()); + ABORT_NOT_OK(CallFunction("list_slice", input_args, &opts, ctx)); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 674ae6050ef..1fb0df56bb9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -22,6 +22,7 @@ #include "arrow/array/builder_nested.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/result.h" #include 
"arrow/type_fwd.h" #include "arrow/util/bit_block_counter.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_random.cc b/cpp/src/arrow/compute/kernels/scalar_random.cc index d6c96f94e96..5bcb7c36da6 100644 --- a/cpp/src/arrow/compute/kernels/scalar_random.cc +++ b/cpp/src/arrow/compute/kernels/scalar_random.cc @@ -23,6 +23,7 @@ #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/logging_internal.h" #include "arrow/util/pcg_random.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_random_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_random_benchmark.cc index bf4a339a9bd..baf698bede5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_random_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_random_benchmark.cc @@ -30,7 +30,7 @@ static void RandomKernel(benchmark::State& state, bool is_seed) { const auto options = is_seed ? RandomOptions::FromSeed(42) : RandomOptions::FromSystemRandom(); for (auto _ : state) { - ABORT_NOT_OK(CallFunction("random", ExecBatch({}, length), &options).status()); + ABORT_NOT_OK(CallFunction("random", ExecBatch({}, length), &options)); } state.SetItemsProcessed(state.iterations() * length); } diff --git a/cpp/src/arrow/compute/kernels/scalar_round.cc b/cpp/src/arrow/compute/kernels/scalar_round.cc index 00c448f470e..208b9875a1c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_round.cc +++ b/cpp/src/arrow/compute/kernels/scalar_round.cc @@ -30,6 +30,7 @@ #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/decimal.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index 9a6cc98ca46..c6a3562233c 100644 --- 
a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -20,6 +20,7 @@ #include "arrow/compute/cast.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/type.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_writer.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc index 233facf61ce..06e6f4bb506 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_ascii.cc @@ -24,6 +24,7 @@ #include "arrow/array/builder_nested.h" #include "arrow/array/builder_primitive.h" #include "arrow/compute/kernels/scalar_string_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/result.h" #include "arrow/util/config.h" #include "arrow/util/logging_internal.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index bce788ca38d..b279f991f6e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1384,10 +1384,15 @@ TYPED_TEST(TestStringKernels, IsDecimalUnicode) { } TYPED_TEST(TestStringKernels, IsDigitUnicode) { - // These are digits according to Python, but we don't have the information in - // utf8proc for this - // this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(), "[true, - // true]"); + // Tests for digits across various Unicode scripts. + // ٤: Arabic 4, ³: Superscript 3, ५: Devanagari 5, Ⅷ: Roman 8 (not digit), + // 123: Fullwidth 123. 
+ // '¾' (vulgar fraction) is treated as a digit by utf8proc + this->CheckUnary( + "utf8_is_digit", + R"(["0", "٤", "۵", "३", "१२३", "٣٣", "²", "123", "٣٢", "٩", "①", "Ⅷ", "abc" , "⻁", ""])", + boolean(), + R"([true, true, true, true, true, true, true, true, true, true, true, false, false, false, false])"); } TYPED_TEST(TestStringKernels, IsNumericUnicode) { @@ -2241,6 +2246,51 @@ TYPED_TEST(TestStringKernels, PadUTF8) { CallFunction("utf8_lpad", {input}, &options_bad)); } +TYPED_TEST(TestStringKernels, UTF8ZeroFill) { + ZeroFillOptions options{/*width=*/3, "0"}; + this->CheckUnary("utf8_zero_fill", R"(["A", "AB", "ABC", null])", this->type(), + R"(["00A", "0AB", "ABC", null])", &options); + + options.width = 4; + this->CheckUnary("utf8_zero_fill", R"(["-1", "+1", "1"])", this->type(), + R"(["-001", "+001", "0001"])", &options); + + // width smaller than string → no padding + options.width = 2; + this->CheckUnary("utf8_zero_fill", R"(["AB", "-12", "+12", "XYZ"])", this->type(), + R"(["AB", "-12", "+12", "XYZ"])", &options); + + // Non-ASCII input strings + options.width = 4; + this->CheckUnary("utf8_zero_fill", R"(["ñ", "-ö", "+ß"])", this->type(), + R"(["000ñ", "-00ö", "+00ß"])", &options); + + // custom padding character + options = ZeroFillOptions{/*width=*/4, "x"}; + this->CheckUnary("utf8_zero_fill", R"(["1", "-2", "+3"])", this->type(), + R"(["xxx1", "-xx2", "+xx3"])", &options); + + // Non-ASCII padding character + options = ZeroFillOptions{/*width=*/5, "💠"}; + this->CheckUnary("utf8_zero_fill", R"(["1", "-2", "+3"])", this->type(), + R"(["💠💠💠💠1", "-💠💠💠2", "+💠💠💠3"])", &options); + + ZeroFillOptions default_options{/*width=*/4}; + this->CheckUnary("utf8_zero_fill", R"(["1", "-2", "+3"])", this->type(), + R"(["0001", "-002", "+003"])", &default_options); + + // padding error check + ZeroFillOptions options_bad{/*width=*/3, "spam"}; + auto input = ArrayFromJSON(this->type(), R"(["foo"])"); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + 
::testing::HasSubstr("Padding must be one codepoint"), + CallFunction("utf8_zero_fill", {input}, &options_bad)); + options_bad.padding = ""; + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + ::testing::HasSubstr("Padding must be one codepoint"), + CallFunction("utf8_zero_fill", {input}, &options_bad)); +} + #ifdef ARROW_WITH_UTF8PROC TYPED_TEST(TestStringKernels, TrimWhitespaceUTF8) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc index e7b7952df3a..fd340bba624 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc @@ -20,6 +20,7 @@ #include #include "arrow/compute/kernels/scalar_string_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/config.h" #include "arrow/util/logging_internal.h" #include "arrow/util/utf8_internal.h" @@ -138,9 +139,12 @@ static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { } static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { - // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. - // utf8proc has no support for this, this is the best we can do: - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); + // Approximates Python's str.isnumeric(): + // returns true for Nd and No (e.g., '٣', '³'), but excludes Nl like Roman numerals + // ('Ⅷ') due to utf8proc limits. 
+ // '¾' (vulgar fraction) is treated as a digit by utf8proc 'No' + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NO); } static inline bool IsNumericCharacterUnicode(uint32_t codepoint) { @@ -987,11 +991,73 @@ const FunctionDoc utf8_rpad_doc( "the given UTF8 codeunit.\nNull values emit null."), {"strings"}, "PadOptions", /*options_required=*/true); +struct Utf8ZeroFillTransform : public StringTransformBase { + using State = OptionsWrapper; + + const ZeroFillOptions& options_; + + explicit Utf8ZeroFillTransform(const ZeroFillOptions& options) : options_(options) {} + + Status PreExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) override { + if (!options_.padding.empty()) { + auto str = reinterpret_cast(options_.padding.data()); + auto strlen = options_.padding.size(); + if (util::UTF8Length(str, str + strlen) != 1) { + return Status::Invalid("Padding must be one codepoint, got '", options_.padding, + "'"); + } + } else { + return Status::Invalid("Padding must be one codepoint, got ''"); + } + return Status::OK(); + } + + int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override { + return input_ncodeunits + 4 * ninputs * options_.width; + } + + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits); + if (input_width >= options_.width) { + std::copy(input, input + input_string_ncodeunits, output); + return input_string_ncodeunits; + } + uint8_t* start = output; + // sign-aware padding: + if (input_string_ncodeunits > 0 && (input[0] == '+' || input[0] == '-')) { + *output++ = input[0]; + input++; + input_string_ncodeunits--; + } + int64_t num_zeros = options_.width - input_width; + while (num_zeros > 0) { + output = std::copy(options_.padding.begin(), options_.padding.end(), output); + num_zeros--; + } + output = std::copy(input, input + input_string_ncodeunits, output); 
+ return output - start; + } +}; + +template +using Utf8ZeroFill = StringTransformExecWithState; + +const FunctionDoc utf8_zero_fill_doc( + "Left-pad strings to a given width, preserving leading sign characters", + ("For each string in `strings`, emit a string of length `width` by \n" + "prepending the given padding character (defaults to '0' if not specified). \n" + "If the string starts with '+' or '-', the sign is preserved and padding \n" + "occurs after the sign. Null values emit null."), + {"strings"}, "ZeroFillOptions", /*options_required=*/true); + void AddUtf8StringPad(FunctionRegistry* registry) { MakeUnaryStringBatchKernelWithState("utf8_lpad", registry, utf8_lpad_doc); MakeUnaryStringBatchKernelWithState("utf8_rpad", registry, utf8_rpad_doc); MakeUnaryStringBatchKernelWithState("utf8_center", registry, utf8_center_doc); + MakeUnaryStringBatchKernelWithState("utf8_zero_fill", registry, + utf8_zero_fill_doc); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_benchmark.cc index 780e90c087e..bd90bbb6a58 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_benchmark.cc @@ -59,7 +59,7 @@ static void BenchmarkTemporalRounding(benchmark::State& state) { EXPECT_OK_AND_ASSIGN(auto timestamp_array, array->View(timestamp_type)); for (auto _ : state) { - ABORT_NOT_OK(Op(timestamp_array, options, ctx).status()); + ABORT_NOT_OK(Op(timestamp_array, options, ctx)); } state.SetItemsProcessed(state.iterations() * array_size); @@ -78,7 +78,7 @@ static void BenchmarkTemporal(benchmark::State& state) { EXPECT_OK_AND_ASSIGN(auto timestamp_array, array->View(timestamp_type)); for (auto _ : state) { - ABORT_NOT_OK(Op(timestamp_array, ctx).status()); + ABORT_NOT_OK(Op(timestamp_array, ctx)); } state.SetItemsProcessed(state.iterations() * array_size); @@ -96,7 +96,7 
@@ static void BenchmarkTemporalBinary(benchmark::State& state) { auto rhs = rand.ArrayOf(timestamp_type, args.size, args.null_proportion); for (auto _ : state) { - ABORT_NOT_OK(Op(lhs, rhs, ctx).status()); + ABORT_NOT_OK(Op(lhs, rhs, ctx)); } state.SetItemsProcessed(state.iterations() * array_size); @@ -116,7 +116,7 @@ static void BenchmarkStrftime(benchmark::State& state) { auto options = StrftimeOptions(); for (auto _ : state) { - ABORT_NOT_OK(Strftime(timestamp_array, options, ctx).status()); + ABORT_NOT_OK(Strftime(timestamp_array, options, ctx)); } state.SetItemsProcessed(state.iterations() * array_size); @@ -139,7 +139,7 @@ static void BenchmarkStrptime(benchmark::State& state) { auto strptime_options = StrptimeOptions("%Y-%m-%dT%H:%M:%S", TimeUnit::MICRO, true); for (auto _ : state) { - ABORT_NOT_OK(Strptime(string_array, strptime_options, ctx).status()); + ABORT_NOT_OK(Strptime(string_array, strptime_options, ctx)); } state.SetItemsProcessed(state.iterations() * array_size); @@ -160,7 +160,7 @@ static void BenchmarkAssumeTimezone(benchmark::State& state) { "Pacific/Marquesas", AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_LATEST, AssumeTimezoneOptions::Nonexistent::NONEXISTENT_EARLIEST); for (auto _ : state) { - ABORT_NOT_OK(AssumeTimezone(timestamp_array, options, ctx).status()); + ABORT_NOT_OK(AssumeTimezone(timestamp_array, options, ctx)); } state.SetItemsProcessed(state.iterations() * array_size); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_binary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_binary.cc index 6ef2a369d92..4437b8fe1db 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_binary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_binary.cc @@ -23,6 +23,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/temporal_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/checked_cast.h" #include 
"arrow/util/logging_internal.h" #include "arrow/util/time.h" @@ -43,17 +44,14 @@ using arrow_vendored::date::floor; using arrow_vendored::date::hh_mm_ss; using arrow_vendored::date::local_days; using arrow_vendored::date::local_time; -using arrow_vendored::date::locate_zone; using arrow_vendored::date::sys_days; using arrow_vendored::date::sys_time; -using arrow_vendored::date::time_zone; using arrow_vendored::date::trunc; using arrow_vendored::date::weekday; using arrow_vendored::date::weeks; using arrow_vendored::date::year_month_day; using arrow_vendored::date::year_month_weekday; using arrow_vendored::date::years; -using arrow_vendored::date::zoned_time; using arrow_vendored::date::literals::dec; using arrow_vendored::date::literals::jan; using arrow_vendored::date::literals::last; diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 3e2f20ecc70..236e641b3e8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -93,15 +93,16 @@ class ScalarTemporalTest : public ::testing::Test { "2010-01-01T05:25:25.005321", "2010-01-03T06:30:30.006163", "2010-01-04T07:35:35", "2006-01-01T08:40:40", "2005-12-31T09:45:45", "2008-12-28", "2008-12-29", "2012-01-01 01:02:03", null])"; - const char* times2 = - R"(["1970-01-01T00:00:59.103476799","2000-02-29T23:23:23.999999909", - "1899-01-01T03:30:42.001001001","2033-05-18T01:59:42.000000000", - "2020-01-01T19:05:05.005", "2019-12-31T02:03:10.000", - "2019-12-30T00:15:15.003", "2009-12-31T04:40:20.004432", - "2010-01-01T05:25:25.005021", "2010-01-03T01:30:30.006163", - "2010-01-04T23:35:59", "2006-01-01T08:40:07", - "2005-12-31T09:45:45", "2008-12-28", "2008-12-29", - "2012-01-01 01:02:03", null])"; + const char* times_with_offsets = + R"(["1970-01-01T00:00:59+0430", "2000-02-29T23:23:23+0430", + "1899-01-01T03:30:42+0430", "2033-05-18T01:59:42+0430", + "2020-01-01T19:05:05+0430", 
"2019-12-31T02:03:10+0430", + "2019-12-30T00:15:15+0430", "2009-12-31T04:40:20+0430", + "2010-01-01T05:25:25+0430", "2010-01-03T01:30:30+0430", + "2010-01-04T23:35:59+0430", "2006-01-01T08:40:07+0430", + "2005-12-31T09:45:45+0430", "2008-12-28T19:12:01+0430", + "2008-12-29T21:12:21+0430", "2012-01-01T23:02:03+0430", + null])"; const char* times_seconds_precision = R"(["1970-01-01T00:00:59","2000-02-29T23:23:23", "1899-01-01T00:59:20","2033-05-18T03:33:20", @@ -710,20 +711,25 @@ TEST_F(ScalarTemporalTest, TestIsLeapYear) { times, boolean(), is_leap_year_broken_hill); CheckScalarUnary("is_leap_year", timestamp(TimeUnit::NANO, "Pacific/Pago_Pago"), times, boolean(), is_leap_year_pago_pago); + CheckScalarUnary("is_leap_year", timestamp(TimeUnit::NANO, "-13:00"), times, boolean(), + is_leap_year_pago_pago); } TEST_F(ScalarTemporalTest, TestZoned1) { - auto unit = timestamp(TimeUnit::NANO, "Pacific/Marquesas"); - auto year = - "[1969, 2000, 1898, 2033, 2019, 2019, 2019, 2009, 2009, 2010, 2010, 2005, 2005, " - "2008, 2008, 2011, null]"; - auto is_leap_year = - "[false, true, false, false, false, false, false, false, false, false, false, " - "false, false, true, true, false, null]"; - auto month = "[12, 2, 12, 5, 12, 12, 12, 12, 12, 1, 1, 12, 12, 12, 12, 12, null]"; - auto day = "[31, 29, 31, 17, 31, 30, 29, 30, 31, 2, 3, 31, 31, 27, 28, 31, null]"; - auto year_month_day = ArrayFromJSON(year_month_day_type, - R"([{"year": 1969, "month": 12, "day": 31}, + std::vector timezones = {"Pacific/Marquesas", "-09:30"}; + for (const auto& timezone : timezones) { + auto unit = timestamp(TimeUnit::NANO, timezone); + + auto year = + "[1969, 2000, 1898, 2033, 2019, 2019, 2019, 2009, 2009, 2010, 2010, 2005, 2005, " + "2008, 2008, 2011, null]"; + auto is_leap_year = + "[false, true, false, false, false, false, false, false, false, false, false, " + "false, false, true, true, false, null]"; + auto month = "[12, 2, 12, 5, 12, 12, 12, 12, 12, 1, 1, 12, 12, 12, 12, 12, null]"; + auto day = 
"[31, 29, 31, 17, 31, 30, 29, 30, 31, 2, 3, 31, 31, 27, 28, 31, null]"; + auto year_month_day = ArrayFromJSON(year_month_day_type, + R"([{"year": 1969, "month": 12, "day": 31}, {"year": 2000, "month": 2, "day": 29}, {"year": 1898, "month": 12, "day": 31}, {"year": 2033, "month": 5, "day": 17}, @@ -739,24 +745,25 @@ TEST_F(ScalarTemporalTest, TestZoned1) { {"year": 2008, "month": 12, "day": 27}, {"year": 2008, "month": 12, "day": 28}, {"year": 2011, "month": 12, "day": 31}, null])"); - auto day_of_week = "[2, 1, 5, 1, 1, 0, 6, 2, 3, 5, 6, 5, 5, 5, 6, 5, null]"; - auto day_of_year = - "[365, 60, 365, 137, 365, 364, 363, 364, 365, 2, 3, 365, 365, 362, 363, 365, null]"; - std::string is_dst = - "[false, false, false, false, false, false, false, false, false, false, false, " - "false, false, false, false, false, null]"; - auto us_year = - "[1969, 2000, 1898, 2033, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, 2005, " - "2008, 2008, 2011, null]"; - auto iso_year = - "[1970, 2000, 1898, 2033, 2020, 2020, 2019, 2009, 2009, 2009, 2009, 2005, 2005, " - "2008, 2008, 2011, null]"; - auto iso_week = "[1, 9, 52, 20, 1, 1, 52, 53, 53, 53, 53, 52, 52, 52, 52, 52, null]"; - auto us_week = "[53, 9, 52, 20, 1, 1, 1, 52, 52, 52, 1, 52, 52, 52, 53, 52, null]"; - auto week = "[1, 9, 52, 20, 1, 1, 52, 53, 53, 53, 53, 52, 52, 52, 52, 52, null]"; - auto iso_calendar = - ArrayFromJSON(iso_calendar_type, - R"([{"iso_year": 1970, "iso_week": 1, "iso_day_of_week": 3}, + auto day_of_week = "[2, 1, 5, 1, 1, 0, 6, 2, 3, 5, 6, 5, 5, 5, 6, 5, null]"; + auto day_of_year = + "[365, 60, 365, 137, 365, 364, 363, 364, 365, 2, 3, 365, 365, 362, 363, 365, " + "null]"; + std::string is_dst = + "[false, false, false, false, false, false, false, false, false, false, false, " + "false, false, false, false, false, null]"; + auto us_year = + "[1969, 2000, 1898, 2033, 2020, 2020, 2020, 2009, 2009, 2009, 2010, 2005, 2005, " + "2008, 2008, 2011, null]"; + auto iso_year = + "[1970, 2000, 1898, 2033, 2020, 2020, 
2019, 2009, 2009, 2009, 2009, 2005, 2005, " + "2008, 2008, 2011, null]"; + auto iso_week = "[1, 9, 52, 20, 1, 1, 52, 53, 53, 53, 53, 52, 52, 52, 52, 52, null]"; + auto us_week = "[53, 9, 52, 20, 1, 1, 1, 52, 52, 52, 1, 52, 52, 52, 53, 52, null]"; + auto week = "[1, 9, 52, 20, 1, 1, 52, 53, 53, 53, 53, 52, 52, 52, 52, 52, null]"; + auto iso_calendar = + ArrayFromJSON(iso_calendar_type, + R"([{"iso_year": 1970, "iso_week": 1, "iso_day_of_week": 3}, {"iso_year": 2000, "iso_week": 9, "iso_day_of_week": 2}, {"iso_year": 1898, "iso_week": 52, "iso_day_of_week": 6}, {"iso_year": 2033, "iso_week": 20, "iso_day_of_week": 2}, @@ -772,32 +779,38 @@ TEST_F(ScalarTemporalTest, TestZoned1) { {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 6}, {"iso_year": 2008, "iso_week": 52, "iso_day_of_week": 7}, {"iso_year": 2011, "iso_week": 52, "iso_day_of_week": 6}, null])"); - auto quarter = "[4, 1, 4, 2, 4, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 4, null]"; - auto hour = "[14, 13, 15, 18, 15, 16, 17, 18, 19, 21, 22, 23, 0, 14, 14, 15, null]"; - auto minute = "[30, 53, 41, 3, 35, 40, 45, 50, 55, 0, 5, 10, 15, 30, 30, 32, null]"; + auto quarter = "[4, 1, 4, 2, 4, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 4, null]"; + auto hour = "[14, 13, 15, 18, 15, 16, 17, 18, 19, 21, 22, 23, 0, 14, 14, 15, null]"; + auto minute = "[30, 53, 41, 3, 35, 40, 45, 50, 55, 0, 5, 10, 15, 30, 30, 32, null]"; + if (timezone == "-09:30") { + // Prior to October 1st 1912 Pacific/Marquesas was on solar time (probably) + // and is on +09:30 since. 
+ minute = "[30, 53, 29, 3, 35, 40, 45, 50, 55, 0, 5, 10, 15, 30, 30, 32, null]"; + } - CheckScalarUnary("year", unit, times, int64(), year); - CheckScalarUnary("month", unit, times, int64(), month); - CheckScalarUnary("day", unit, times, int64(), day); - CheckScalarUnary("year_month_day", ArrayFromJSON(unit, times), year_month_day); - CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); - CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); - CheckScalarUnary("is_dst", unit, times, boolean(), is_dst); - CheckScalarUnary("us_year", unit, times, int64(), us_year); - CheckScalarUnary("iso_year", unit, times, int64(), iso_year); - CheckScalarUnary("iso_week", unit, times, int64(), iso_week); - CheckScalarUnary("is_leap_year", unit, times, boolean(), is_leap_year); - CheckScalarUnary("us_week", unit, times, int64(), us_week); - CheckScalarUnary("week", unit, times, int64(), week); - CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); - CheckScalarUnary("quarter", unit, times, int64(), quarter); - CheckScalarUnary("hour", unit, times, int64(), hour); - CheckScalarUnary("minute", unit, times, int64(), minute); - CheckScalarUnary("second", unit, times, int64(), second); - CheckScalarUnary("millisecond", unit, times, int64(), millisecond); - CheckScalarUnary("microsecond", unit, times, int64(), microsecond); - CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); - CheckScalarUnary("subsecond", unit, times, float64(), subsecond); + CheckScalarUnary("year", unit, times, int64(), year); + CheckScalarUnary("month", unit, times, int64(), month); + CheckScalarUnary("day", unit, times, int64(), day); + CheckScalarUnary("year_month_day", ArrayFromJSON(unit, times), year_month_day); + CheckScalarUnary("day_of_week", unit, times, int64(), day_of_week); + CheckScalarUnary("day_of_year", unit, times, int64(), day_of_year); + CheckScalarUnary("is_dst", unit, times, boolean(), is_dst); + CheckScalarUnary("us_year", unit, 
times, int64(), us_year); + CheckScalarUnary("iso_year", unit, times, int64(), iso_year); + CheckScalarUnary("iso_week", unit, times, int64(), iso_week); + CheckScalarUnary("is_leap_year", unit, times, boolean(), is_leap_year); + CheckScalarUnary("us_week", unit, times, int64(), us_week); + CheckScalarUnary("week", unit, times, int64(), week); + CheckScalarUnary("iso_calendar", ArrayFromJSON(unit, times), iso_calendar); + CheckScalarUnary("quarter", unit, times, int64(), quarter); + CheckScalarUnary("hour", unit, times, int64(), hour); + CheckScalarUnary("minute", unit, times, int64(), minute); + CheckScalarUnary("second", unit, times, int64(), second); + CheckScalarUnary("millisecond", unit, times, int64(), millisecond); + CheckScalarUnary("microsecond", unit, times, int64(), microsecond); + CheckScalarUnary("nanosecond", unit, times, int64(), nanosecond); + CheckScalarUnary("subsecond", unit, times, float64(), subsecond); + } } TEST_F(ScalarTemporalTest, TestZoned2) { @@ -890,31 +903,35 @@ TEST_F(ScalarTemporalTest, TestZoned2) { TEST_F(ScalarTemporalTest, TestNonexistentTimezone) { auto data_buffer = Buffer::Wrap(std::vector{1, 2, 3}); auto null_buffer = Buffer::FromString("\xff"); - - for (auto u : TimeUnit::values()) { - auto ts_type = timestamp(u, "Mars/Mariner_Valley"); - auto timestamp_array = std::make_shared>( - ts_type, 2, data_buffer, null_buffer, 0); - ASSERT_RAISES(Invalid, Year(timestamp_array)); - ASSERT_RAISES(Invalid, IsLeapYear(timestamp_array)); - ASSERT_RAISES(Invalid, Month(timestamp_array)); - ASSERT_RAISES(Invalid, Day(timestamp_array)); - ASSERT_RAISES(Invalid, YearMonthDay(timestamp_array)); - ASSERT_RAISES(Invalid, DayOfWeek(timestamp_array)); - ASSERT_RAISES(Invalid, DayOfYear(timestamp_array)); - ASSERT_RAISES(Invalid, IsDaylightSavings(timestamp_array)); - ASSERT_RAISES(Invalid, USYear(timestamp_array)); - ASSERT_RAISES(Invalid, ISOYear(timestamp_array)); - ASSERT_RAISES(Invalid, Week(timestamp_array)); - ASSERT_RAISES(Invalid, 
ISOCalendar(timestamp_array)); - ASSERT_RAISES(Invalid, Quarter(timestamp_array)); - ASSERT_RAISES(Invalid, Hour(timestamp_array)); - ASSERT_RAISES(Invalid, Minute(timestamp_array)); - ASSERT_RAISES(Invalid, Second(timestamp_array)); - ASSERT_RAISES(Invalid, Millisecond(timestamp_array)); - ASSERT_RAISES(Invalid, Microsecond(timestamp_array)); - ASSERT_RAISES(Invalid, Nanosecond(timestamp_array)); - ASSERT_RAISES(Invalid, Subsecond(timestamp_array)); + auto nonexistent_timezones = { + "Mars/Mariner_Valley", "+25:00", "-25:00", "15:00", "5:00", "500", + "+05:00:00", "+050000"}; + for (auto timezone : nonexistent_timezones) { + for (auto u : TimeUnit::values()) { + auto ts_type = timestamp(u, timezone); + auto timestamp_array = std::make_shared>( + ts_type, 2, data_buffer, null_buffer, 0); + ASSERT_RAISES(Invalid, Year(timestamp_array)); + ASSERT_RAISES(Invalid, IsLeapYear(timestamp_array)); + ASSERT_RAISES(Invalid, Month(timestamp_array)); + ASSERT_RAISES(Invalid, Day(timestamp_array)); + ASSERT_RAISES(Invalid, YearMonthDay(timestamp_array)); + ASSERT_RAISES(Invalid, DayOfWeek(timestamp_array)); + ASSERT_RAISES(Invalid, DayOfYear(timestamp_array)); + ASSERT_RAISES(Invalid, IsDaylightSavings(timestamp_array)); + ASSERT_RAISES(Invalid, USYear(timestamp_array)); + ASSERT_RAISES(Invalid, ISOYear(timestamp_array)); + ASSERT_RAISES(Invalid, Week(timestamp_array)); + ASSERT_RAISES(Invalid, ISOCalendar(timestamp_array)); + ASSERT_RAISES(Invalid, Quarter(timestamp_array)); + ASSERT_RAISES(Invalid, Hour(timestamp_array)); + ASSERT_RAISES(Invalid, Minute(timestamp_array)); + ASSERT_RAISES(Invalid, Second(timestamp_array)); + ASSERT_RAISES(Invalid, Millisecond(timestamp_array)); + ASSERT_RAISES(Invalid, Microsecond(timestamp_array)); + ASSERT_RAISES(Invalid, Nanosecond(timestamp_array)); + ASSERT_RAISES(Invalid, Subsecond(timestamp_array)); + } } } @@ -1872,19 +1889,28 @@ TEST_F(ScalarTemporalTest, TestLocalTimestamp) { times_seconds_precision, timestamp(u), 
expected_local_kolkata); CheckScalarUnary("local_timestamp", timestamp(u, "Pacific/Marquesas"), times_seconds_precision, timestamp(u), expected_local_marquesas); + CheckScalarUnary("local_timestamp", timestamp(u, "-09:30"), times_seconds_precision, + timestamp(u), expected_local_marquesas); + CheckScalarUnary("local_timestamp", timestamp(u, "-0930"), times_seconds_precision, + timestamp(u), expected_local_marquesas); } } TEST_F(ScalarTemporalTest, TestAssumeTimezone) { std::string timezone_utc = "UTC"; std::string timezone_kolkata = "Asia/Kolkata"; - std::string timezone_us_central = "US/Central"; + std::string timezone_us_central = "America/Chicago"; + std::string timezone_tbilisi = "Asia/Tbilisi"; + std::string timezone_tbilisi_offset = "+04:00"; const char* times_utc = R"(["1970-01-01T00:00:00", null])"; const char* times_kolkata = R"(["1970-01-01T05:30:00", null])"; + const char* times_tbilisi = R"(["1970-01-01T04:00:00", null])"; const char* times_us_central = R"(["1969-12-31T18:00:00", null])"; auto options_utc = AssumeTimezoneOptions(timezone_utc); auto options_kolkata = AssumeTimezoneOptions(timezone_kolkata); auto options_us_central = AssumeTimezoneOptions(timezone_us_central); + auto options_tbilisi = AssumeTimezoneOptions(timezone_tbilisi); + auto options_tbilisi_offset = AssumeTimezoneOptions(timezone_tbilisi_offset); auto options_invalid = AssumeTimezoneOptions("Europe/Brusselsss"); for (auto u : TimeUnit::values()) { @@ -1892,6 +1918,8 @@ TEST_F(ScalarTemporalTest, TestAssumeTimezone) { auto unit_utc = timestamp(u, timezone_utc); auto unit_kolkata = timestamp(u, timezone_kolkata); auto unit_us_central = timestamp(u, timezone_us_central); + auto unit_tbilisi = timestamp(u, timezone_tbilisi); + auto unit_tbilisi_offset = timestamp(u, timezone_tbilisi_offset); CheckScalarUnary("assume_timezone", unit, times_utc, unit_utc, times_utc, &options_utc); @@ -1899,6 +1927,10 @@ TEST_F(ScalarTemporalTest, TestAssumeTimezone) { &options_kolkata); 
CheckScalarUnary("assume_timezone", unit, times_us_central, unit_us_central, times_utc, &options_us_central); + CheckScalarUnary("assume_timezone", unit, times_tbilisi, unit_tbilisi, times_utc, + &options_tbilisi); + CheckScalarUnary("assume_timezone", unit, times_tbilisi, unit_tbilisi_offset, + times_utc, &options_tbilisi_offset); ASSERT_RAISES(Invalid, AssumeTimezone(ArrayFromJSON(unit_kolkata, times_utc), options_utc)); ASSERT_RAISES(Invalid, @@ -1979,6 +2011,16 @@ TEST_F(ScalarTemporalTest, TestAssumeTimezoneNonexistent) { &options_earliest); } +TEST_F(ScalarTemporalTest, StrftimeOffsetTimezone) { + auto options_ymdhms = StrftimeOptions("%Y-%m-%dT%H:%M:%S"); + + const char* seconds = R"(["1970-01-01T01:59:00", "2021-08-18T16:12:00", null])"; + const char* seconds_offset = R"(["1970-01-01T03:00:00", "2021-08-18T17:13:00", null])"; + + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND, "+01:01"), seconds, utf8(), + seconds_offset, &options_ymdhms); +} + TEST_F(ScalarTemporalTest, Strftime) { auto options_default = StrftimeOptions(); auto options = StrftimeOptions("%Y-%m-%dT%H:%M:%S%z"); @@ -2004,7 +2046,7 @@ TEST_F(ScalarTemporalTest, Strftime) { string_milliseconds, &options); CheckScalarUnary("strftime", timestamp(TimeUnit::MICRO, "Asia/Kolkata"), microseconds, utf8(), string_microseconds, &options); - CheckScalarUnary("strftime", timestamp(TimeUnit::NANO, "US/Hawaii"), nanoseconds, + CheckScalarUnary("strftime", timestamp(TimeUnit::NANO, "Pacific/Honolulu"), nanoseconds, utf8(), string_nanoseconds, &options); auto options_hms = StrftimeOptions("%H:%M:%S"); @@ -2103,7 +2145,7 @@ TEST_F(ScalarTemporalTest, StrftimeInvalidTimezone) { const char* seconds = R"(["1970-01-01T00:00:59", null])"; auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "nonexistent"), seconds); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Cannot locate timezone 'nonexistent'"), + Invalid, testing::HasSubstr("Cannot locate or parse timezone 'nonexistent'"), 
Strftime(arr, StrftimeOptions())); } @@ -2140,6 +2182,25 @@ TEST_F(ScalarTemporalTest, StrftimeCLocale) { utf8(), string_locale_specific, &options_locale_specific); } +TEST_F(ScalarTemporalTest, StrftimeRoundtrip) { + auto options = StrftimeOptions("%Y-%m-%dT%H:%M:%S%z", "C"); + + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND, "+04:30"), times_with_offsets, + utf8(), times_with_offsets, &options); + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND, "+00:30"), + R"(["1970-01-01T00:00:00+0020"])", utf8(), + R"(["1970-01-01T00:10:00+0030"])", &options); + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND, "-00:10"), + R"(["1970-01-01T00:20:00+0010"])", utf8(), + R"(["1970-01-01T00:00:00-0010"])", &options); + + auto invalid_arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "-00:10:00"), + R"(["1970-01-01T00:20:00"])"); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Cannot locate or parse timezone '-00:10:00'"), + Strftime(invalid_arr, options)); +} + TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { #ifdef _WIN32 GTEST_SKIP() << "There is a known bug in strftime for locales on Windows (ARROW-15922)"; @@ -3414,6 +3475,27 @@ TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalBrussels) { CheckScalarUnary("round_temporal", unit, times, unit, round_2_hours, &round_to_2_hours); } +TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalOffset) { + RoundTemporalOptions round_to_1_hours = RoundTemporalOptions(1, CalendarUnit::HOUR); + RoundTemporalOptions round_to_2_hours = RoundTemporalOptions(2, CalendarUnit::HOUR); + auto unit = timestamp(TimeUnit::NANO, "+03:00"); + + const char* times = R"(["2021-12-23 12:17:00", null])"; + const char* ceil_1_hours = R"(["2021-12-23 13:00", null])"; + const char* ceil_2_hours = R"(["2021-12-23 13:00", null])"; + const char* floor_1_hours = R"(["2021-12-23 12:00", null])"; + const char* floor_2_hours = R"(["2021-12-23 11:00", null])"; + const char* round_1_hours = R"(["2021-12-23 12:00", null])"; + 
const char* round_2_hours = R"(["2021-12-23 13:00", null])"; + + CheckScalarUnary("ceil_temporal", unit, times, unit, ceil_1_hours, &round_to_1_hours); + CheckScalarUnary("ceil_temporal", unit, times, unit, ceil_2_hours, &round_to_2_hours); + CheckScalarUnary("floor_temporal", unit, times, unit, floor_1_hours, &round_to_1_hours); + CheckScalarUnary("floor_temporal", unit, times, unit, floor_2_hours, &round_to_2_hours); + CheckScalarUnary("round_temporal", unit, times, unit, round_1_hours, &round_to_1_hours); + CheckScalarUnary("round_temporal", unit, times, unit, round_2_hours, &round_to_2_hours); +} + TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, RoundUTC) { std::string op = "round_temporal"; diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index ed4f0c3c8ea..8c7bdceb228 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -21,8 +21,10 @@ #include "arrow/builder.h" #include "arrow/compute/api_scalar.h" +#include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/temporal_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging_internal.h" #include "arrow/util/time.h" @@ -34,8 +36,7 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -namespace compute { -namespace internal { +namespace compute::internal { namespace { @@ -58,7 +59,6 @@ using arrow_vendored::date::year; using arrow_vendored::date::year_month_day; using arrow_vendored::date::year_month_weekday; using arrow_vendored::date::years; -using arrow_vendored::date::zoned_time; using arrow_vendored::date::literals::dec; using arrow_vendored::date::literals::jan; using arrow_vendored::date::literals::last; @@ -662,15 +662,19 @@ struct Nanosecond { template struct 
IsDaylightSavings { - explicit IsDaylightSavings(const FunctionOptions* options, const time_zone* tz) + explicit IsDaylightSavings(const FunctionOptions* options, const ArrowTimeZone tz) : tz_(tz) {} template T Call(KernelContext*, Arg0 arg, Status*) const { - return tz_->get_info(sys_time{Duration{arg}}).save.count() != 0; + return std::visit( + [&arg](const auto& tz) -> bool { + return tz->get_info(sys_time{Duration{arg}}).save.count() != 0; + }, + tz_); } - const time_zone* tz_; + const ArrowTimeZone tz_; }; // ---------------------------------------------------------------------- @@ -1164,7 +1168,7 @@ Result GetLocale(const std::string& locale) { template struct Strftime { const StrftimeOptions& options; - const time_zone* tz; + const ArrowTimeZone tz; const std::locale locale; static Result Make(KernelContext* ctx, const DataType& type) { @@ -1185,9 +1189,7 @@ struct Strftime { options.format); } } - - ARROW_ASSIGN_OR_RAISE(const time_zone* tz, - LocateZone(timezone.empty() ? "UTC" : timezone)); + ARROW_ASSIGN_OR_RAISE(auto tz, LocateZone(timezone.empty() ? 
"UTC" : timezone)); ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale(options.locale)); @@ -1352,31 +1354,31 @@ Result ResolveLocalTimestampOutput(KernelContext* ctx, template struct AssumeTimezone { - explicit AssumeTimezone(const AssumeTimezoneOptions* options, const time_zone* tz) + explicit AssumeTimezone(const AssumeTimezoneOptions* options, const ArrowTimeZone tz) : options(*options), tz_(tz) {} template - T get_local_time(Arg0 arg, const time_zone* tz) const { - return static_cast(zoned_time(tz, local_time(Duration{arg})) - .get_sys_time() - .time_since_epoch() - .count()); + T get_local_time(Arg0 arg, const ArrowTimeZone* tz) const { + const auto lt = local_time(Duration{arg}); + auto local_to_sys_time = [&](auto&& t) { + return t.get_sys_time().time_since_epoch().count(); + }; + return ApplyTimeZone(tz_, lt, std::nullopt, local_to_sys_time); } template - T get_local_time(Arg0 arg, const arrow_vendored::date::choose choose, - const time_zone* tz) const { - return static_cast( - zoned_time(tz, local_time(Duration{arg}), choose) - .get_sys_time() - .time_since_epoch() - .count()); + T get_local_time(Arg0 arg, const choose c, const ArrowTimeZone* tz) const { + const auto lt = local_time(Duration{arg}); + auto local_to_sys_time = [&](auto&& t) { + return t.get_sys_time().time_since_epoch().count(); + }; + return ApplyTimeZone(tz_, lt, c, local_to_sys_time); } template T Call(KernelContext*, Arg0 arg, Status* st) const { try { - return get_local_time(arg, tz_); + return get_local_time(arg, &tz_); } catch (const arrow_vendored::date::nonexistent_local_time& e) { switch (options.nonexistent) { case AssumeTimezoneOptions::Nonexistent::NONEXISTENT_RAISE: { @@ -1385,11 +1387,12 @@ struct AssumeTimezone { return arg; } case AssumeTimezoneOptions::Nonexistent::NONEXISTENT_EARLIEST: { - return get_local_time(arg, arrow_vendored::date::choose::latest, tz_) - + return get_local_time(arg, arrow_vendored::date::choose::latest, + &tz_) - 1; } case 
AssumeTimezoneOptions::Nonexistent::NONEXISTENT_LATEST: { - return get_local_time(arg, arrow_vendored::date::choose::latest, tz_); + return get_local_time(arg, arrow_vendored::date::choose::latest, &tz_); } } } catch (const arrow_vendored::date::ambiguous_local_time& e) { @@ -1401,17 +1404,17 @@ struct AssumeTimezone { } case AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_EARLIEST: { return get_local_time(arg, arrow_vendored::date::choose::earliest, - tz_); + &tz_); } case AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_LATEST: { - return get_local_time(arg, arrow_vendored::date::choose::latest, tz_); + return get_local_time(arg, arrow_vendored::date::choose::latest, &tz_); } } } return 0; } AssumeTimezoneOptions options; - const time_zone* tz_; + const ArrowTimeZone tz_; }; // ---------------------------------------------------------------------- @@ -2033,6 +2036,5 @@ void RegisterScalarTemporalUnary(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(round_temporal))); } -} // namespace internal -} // namespace compute +} // namespace compute::internal } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_validity.cc b/cpp/src/arrow/compute/kernels/scalar_validity.cc index d9b4e77e94a..5913b756f1c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity.cc @@ -19,15 +19,18 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/float16.h" #include "arrow/util/logging_internal.h" namespace arrow { using internal::CopyBitmap; using internal::InvertBitmap; +using util::Float16; namespace compute { namespace internal { @@ -60,14 +63,22 @@ Status IsValidExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { struct IsFiniteOperator { template static constexpr OutType Call(KernelContext*, const InType& 
value, Status*) { - return std::isfinite(value); + if constexpr (std::is_same_v) { + return value.is_finite(); + } else { + return std::isfinite(value); + } } }; struct IsInfOperator { template static constexpr OutType Call(KernelContext*, const InType& value, Status*) { - return std::isinf(value); + if constexpr (std::is_same_v) { + return value.is_infinity(); + } else { + return std::isinf(value); + } } }; @@ -77,7 +88,14 @@ template static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_offset) { const T* data = arr.GetValues(1); for (int64_t i = 0; i < arr.length; ++i) { - if (std::isnan(data[i])) { + bool is_nan(false); + if constexpr (std::is_same_v) { + is_nan = Float16::FromBits(data[i]).is_nan(); + } else { + is_nan = std::isnan(data[i]); + } + + if (is_nan) { bit_util::SetBit(out_bitmap, i + out_offset); } } @@ -111,6 +129,9 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { case Type::DOUBLE: SetNanBits(arr, out_bitmap, out_span->offset); break; + case Type::HALF_FLOAT: + SetNanBits(arr, out_bitmap, out_span->offset); + break; default: return Status::NotImplemented("NaN detection not implemented for type ", arr.type->ToString()); @@ -122,7 +143,11 @@ Status IsNullExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { struct IsNanOperator { template static constexpr OutType Call(KernelContext*, const InType& value, Status*) { - return std::isnan(value); + if constexpr (std::is_same_v) { + return value.is_nan(); + } else { + return std::isnan(value); + } } }; @@ -161,6 +186,7 @@ std::shared_ptr MakeIsFiniteFunction(std::string name, FunctionD AddFloatValidityKernel(float32(), func.get()); AddFloatValidityKernel(float64(), func.get()); + AddFloatValidityKernel(float16(), func.get()); for (const auto& ty : IntTypes()) { DCHECK_OK(func->AddKernel({InputType(ty->id())}, boolean(), ConstBoolExec)); @@ -180,6 +206,7 @@ std::shared_ptr MakeIsInfFunction(std::string name, FunctionDoc 
AddFloatValidityKernel(float32(), func.get()); AddFloatValidityKernel(float64(), func.get()); + AddFloatValidityKernel(float16(), func.get()); for (const auto& ty : IntTypes()) { DCHECK_OK(func->AddKernel({InputType(ty->id())}, boolean(), ConstBoolExec)); @@ -199,6 +226,7 @@ std::shared_ptr MakeIsNanFunction(std::string name, FunctionDoc AddFloatValidityKernel(float32(), func.get()); AddFloatValidityKernel(float64(), func.get()); + AddFloatValidityKernel(float16(), func.get()); for (const auto& ty : IntTypes()) { DCHECK_OK(func->AddKernel({InputType(ty->id())}, boolean(), ConstBoolExec)); diff --git a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc index 2d1167a1813..4613176b48c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_validity_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_validity_test.cc @@ -26,6 +26,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/float16.h" namespace arrow { namespace compute { @@ -250,7 +251,8 @@ class TestFloatingPointValidityKernels : public TestValidityKernels { } }; -TYPED_TEST_SUITE(TestFloatingPointValidityKernels, RealArrowTypes); +using RealTypesWithHalfFloat = testing::Types; +TYPED_TEST_SUITE(TestFloatingPointValidityKernels, RealTypesWithHalfFloat); TYPED_TEST(TestFloatingPointValidityKernels, IsNull) { this->TestIsNull(); } diff --git a/cpp/src/arrow/compute/kernels/temporal_internal.cc b/cpp/src/arrow/compute/kernels/temporal_internal.cc new file mode 100644 index 00000000000..056055fddb9 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/temporal_internal.cc @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/kernels/temporal_internal.h" + +namespace arrow::compute::internal { + +Result LocateZone(const std::string_view timezone) { + if (timezone[0] == '+' || timezone[0] == '-') { + // Valid offset strings have to have 4 digits and a sign prefix. + // Valid examples: +01:23 and -0123. + // Invalid examples: 1:23, 123, 0123, 01:23, +25:00, -12:34:45, +090000. + auto offset = std::string(timezone.substr(1)); + std::chrono::minutes zone_offset; + switch (timezone.length()) { + case 6: + if (arrow::internal::detail::ParseHH_MM(offset.c_str(), &zone_offset)) { + break; + } + [[fallthrough]]; + case 5: + if (arrow::internal::detail::ParseHHMM(offset.c_str(), &zone_offset)) { + break; + } + [[fallthrough]]; + default: + return Status::Invalid("Cannot locate or parse timezone '", timezone, "'"); + } + zone_offset = timezone[0] == '-' ? 
-zone_offset : zone_offset; + return OffsetZone(zone_offset); + } + + try { + return locate_zone(timezone); + } catch (const std::runtime_error& ex) { + return Status::Invalid("Cannot locate or parse timezone '", timezone, + "': ", ex.what()); + } +} + +} // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/temporal_internal.h b/cpp/src/arrow/compute/kernels/temporal_internal.h index 6e6931951f8..3674c233dc9 100644 --- a/cpp/src/arrow/compute/kernels/temporal_internal.h +++ b/cpp/src/arrow/compute/kernels/temporal_internal.h @@ -21,13 +21,14 @@ #include #include "arrow/compute/api_scalar.h" -#include "arrow/vendored/datetime.h" - -namespace arrow { - -namespace compute { -namespace internal { - +#include "arrow/compute/kernels/codegen_internal.h" +#include "arrow/util/date_internal.h" +#include "arrow/util/value_parsing.h" + +namespace arrow::compute::internal { +using arrow::internal::checked_cast; +using arrow::internal::OffsetZone; +using arrow_vendored::date::choose; using arrow_vendored::date::days; using arrow_vendored::date::floor; using arrow_vendored::date::local_days; @@ -40,17 +41,46 @@ using arrow_vendored::date::year_month_day; using arrow_vendored::date::zoned_time; using std::chrono::duration_cast; +// https://howardhinnant.github.io/date/tz.html#Examples +using ArrowTimeZone = std::variant; + +template +auto ApplyTimeZone(const ArrowTimeZone& tz, sys_time st, Func&& func) + -> decltype(func(zoned_time{})) { + return std::visit( + [&](auto&& zone) { + if constexpr (std::is_pointer_v >) { + return func(zoned_time{zone, st}); + } else { + return func(zoned_time{&zone, st}); + } + }, + tz); +} + +template +auto ApplyTimeZone(const ArrowTimeZone& tz, local_time lt, + std::optional c, Func&& func) + -> decltype(func(zoned_time{})) { + return std::visit( + [&](auto&& zone) { + if constexpr (std::is_pointer_v >) { + return c.has_value() ? 
func(zoned_time{zone, lt, c.value()}) + : func(zoned_time{zone, lt}); + } else { + // Offset zone conversion to/from UTC is always unambiguous + // therefore `c` can be ignored. + return func(zoned_time{&zone, lt}); + } + }, + tz); +} + inline int64_t GetQuarter(const year_month_day& ymd) { return static_cast((static_cast(ymd.month()) - 1) / 3); } -static inline Result LocateZone(const std::string& timezone) { - try { - return locate_zone(timezone); - } catch (const std::runtime_error& ex) { - return Status::Invalid("Cannot locate timezone '", timezone, "': ", ex.what()); - } -} +ARROW_EXPORT Result LocateZone(const std::string_view timezone); static inline const std::string& GetInputTimezone(const DataType& type) { static const std::string no_timezone = ""; @@ -100,19 +130,24 @@ struct ZonedLocalizer { using days_t = local_days; // Timezone-localizing conversions: UTC -> local time - const time_zone* tz; + const ArrowTimeZone tz_; template local_time ConvertTimePoint(int64_t t) const { - return tz->to_local(sys_time(Duration{t})); + const auto st = sys_time(Duration{t}); + return std::visit( + [st](const auto& tz) -> local_time { return tz->to_local(st); }, tz_); } template Duration ConvertLocalToSys(Duration t, Status* st) const { + const auto lt = local_time(t); + auto local_to_sys_time = [&](auto&& t) { + return t.get_sys_time().time_since_epoch(); + }; + try { - return zoned_time{tz, local_time(t)} - .get_sys_time() - .time_since_epoch(); + return ApplyTimeZone(tz_, lt, std::nullopt, local_to_sys_time); } catch (const arrow_vendored::date::nonexistent_local_time& e) { *st = Status::Invalid("Local time does not exist: ", e.what()); return Duration{0}; @@ -128,12 +163,12 @@ struct ZonedLocalizer { template struct TimestampFormatter { const char* format; - const time_zone* tz; + const ArrowTimeZone tz; std::ostringstream bufstream; - explicit TimestampFormatter(const std::string& format, const time_zone* tz, + explicit TimestampFormatter(const std::string& 
format, const ArrowTimeZone time_zone, const std::locale& locale) - : format(format.c_str()), tz(tz) { + : format(format.c_str()), tz(time_zone) { bufstream.imbue(locale); // Propagate errors as C++ exceptions (to get an actual error message) bufstream.exceptions(std::ios::failbit | std::ios::badbit); @@ -141,14 +176,17 @@ struct TimestampFormatter { Result operator()(int64_t arg) { bufstream.str(""); - const auto zt = zoned_time{tz, sys_time(Duration{arg})}; - try { - arrow_vendored::date::to_stream(bufstream, format, zt); - } catch (const std::runtime_error& ex) { - bufstream.clear(); - return Status::Invalid("Failed formatting timestamp: ", ex.what()); - } - // XXX could return a view with std::ostringstream::view() (C++20) + const auto timepoint = sys_time(Duration{arg}); + auto format_zoned_time = [&](auto&& zt) { + try { + arrow_vendored::date::to_stream(bufstream, format, zt); + return Status::OK(); + } catch (const std::runtime_error& ex) { + bufstream.clear(); + return Status::Invalid("Failed formatting timestamp: ", ex.what()); + } + }; + RETURN_NOT_OK(ApplyTimeZone(tz, timepoint, format_zoned_time)); return std::move(bufstream).str(); } }; @@ -318,6 +356,4 @@ struct TemporalComponentExtract } }; -} // namespace internal -} // namespace compute -} // namespace arrow +} // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/test_util_internal.cc b/cpp/src/arrow/compute/kernels/test_util_internal.cc index d48b3b0781e..b184fe7d44c 100644 --- a/cpp/src/arrow/compute/kernels/test_util_internal.cc +++ b/cpp/src/arrow/compute/kernels/test_util_internal.cc @@ -172,6 +172,8 @@ void CheckScalar(std::string func_name, const DatumVector& inputs, Datum expecte } } +namespace { + Datum CheckDictionaryNonRecursive(const std::string& func_name, const DatumVector& args, bool result_is_encoded) { EXPECT_OK_AND_ASSIGN(Datum actual, CallFunction(func_name, args)); @@ -204,6 +206,8 @@ Datum CheckDictionaryNonRecursive(const std::string& func_name, 
const DatumVecto return actual; } +} // namespace + void CheckDictionary(const std::string& func_name, const DatumVector& args, bool result_is_encoded) { auto actual = CheckDictionaryNonRecursive(func_name, args, result_is_encoded); @@ -281,6 +285,11 @@ void CheckScalarBinaryCommutative(std::string func_name, Datum left_input, CheckScalar(func_name, {right_input, left_input}, expected, options); } +void CheckDispatchExact(std::string func_name, std::vector types) { + ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction(func_name)); + ASSERT_OK(function->DispatchExact(types)); +} + void CheckDispatchBest(std::string func_name, std::vector original_values, std::vector expected_equivalent_values) { ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction(func_name)); @@ -302,6 +311,23 @@ void CheckDispatchBest(std::string func_name, std::vector original_v } } +void CheckDispatchBestWithCastedTypes(std::string func_name, + std::vector values, + const std::vector& expected_values) { + ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction(func_name)); + ASSERT_OK_AND_ASSIGN(auto kernel, function->DispatchBest(&values)); + ASSERT_NE(kernel, nullptr); + EXPECT_EQ(values.size(), expected_values.size()); + for (size_t i = 0; i < values.size(); i++) { + AssertTypeEqual(*values[i], *expected_values[i]); + } +} + +void CheckDispatchExactFails(std::string func_name, std::vector types) { + ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction(func_name)); + ASSERT_NOT_OK(function->DispatchExact(types)); +} + void CheckDispatchFails(std::string func_name, std::vector types) { ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction(func_name)); ASSERT_NOT_OK(function->DispatchBest(&types)); diff --git a/cpp/src/arrow/compute/kernels/test_util_internal.h b/cpp/src/arrow/compute/kernels/test_util_internal.h index e3a27ab9add..231e5762135 100644 --- a/cpp/src/arrow/compute/kernels/test_util_internal.h +++ 
b/cpp/src/arrow/compute/kernels/test_util_internal.h @@ -155,11 +155,23 @@ void TestRandomPrimitiveCTypes() { DoTestFunctor::Test(duration(TimeUnit::MILLI)); } +// Check that DispatchExact on a given function yields a valid Kernel +void CheckDispatchExact(std::string func_name, std::vector types); + // Check that DispatchBest on a given function yields the same Kernel as // produced by DispatchExact on another set of types void CheckDispatchBest(std::string func_name, std::vector types, std::vector exact_types); +// Check that DispatchBest on a given function yields a valid Kernel and casts the input +// types as expected +void CheckDispatchBestWithCastedTypes(std::string func_name, + std::vector types, + const std::vector& expected_types); + +// Check that function fails to produce a Kernel via DispatchExact for the set of types +void CheckDispatchExactFails(std::string func_name, std::vector types); + // Check that function fails to produce a Kernel for the set of types void CheckDispatchFails(std::string func_name, std::vector types); diff --git a/cpp/src/arrow/compute/kernels/vector_array_sort.cc b/cpp/src/arrow/compute/kernels/vector_array_sort.cc index 4a3a5eef89f..950de47733b 100644 --- a/cpp/src/arrow/compute/kernels/vector_array_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_array_sort.cc @@ -28,6 +28,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/util_internal.h" #include "arrow/compute/kernels/vector_sort_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/type_traits.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bitmap.h" diff --git a/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc b/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc index 8e9134d7f58..3c4f833b752 100644 --- a/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc +++ b/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc @@ -24,6 +24,7 @@ #include 
"arrow/compute/kernels/base_arithmetic_internal.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/result.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" @@ -73,7 +74,7 @@ struct CumulativeBinaryOp { OutValue current_value; - CumulativeBinaryOp() { current_value = Identity::template value; } + CumulativeBinaryOp() { current_value = Identity::template value(); } explicit CumulativeBinaryOp(const std::shared_ptr start) { current_value = UnboxScalar::Unbox(*start); @@ -285,6 +286,11 @@ struct CumulativeStatefulKernelFactory { return arrow::Status::OK(); } + Status Visit(const HalfFloatType& type) { + return Status::NotImplemented("Cumulative kernel not implemented for type ", + type.ToString()); + } + Status Visit(const DataType& type) { return Status::NotImplemented("Cumulative kernel not implemented for type ", type.ToString()); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index bd8cbdb0430..e666f2b9f7a 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -29,7 +29,9 @@ #include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" +#include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" #include "arrow/util/int_util.h" diff --git a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc index 3200f7a469a..3f8683b74d9 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc @@ -48,7 +48,7 @@ static void BuildDictionary(benchmark::State& state) { // NOLINT non-const refe ArrayFromVector(is_valid, values, &arr); 
while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr)); } state.counters["null_percent"] = static_cast(arr->null_count()) / arr->length() * 100; @@ -75,7 +75,7 @@ static void BuildStringDictionary( ArrayFromVector(data, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr)); } state.SetBytesProcessed(state.iterations() * total_bytes); state.SetItemsProcessed(state.iterations() * data.size()); @@ -133,7 +133,7 @@ void BenchUnique(benchmark::State& state, const ParamType& params) { params.GenerateTestData(&arr); while (state.KeepRunning()) { - ABORT_NOT_OK(Unique(arr).status()); + ABORT_NOT_OK(Unique(arr)); } params.SetMetadata(state); } @@ -143,7 +143,7 @@ void BenchDictionaryEncode(benchmark::State& state, const ParamType& params) { std::shared_ptr arr; params.GenerateTestData(&arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr)); } params.SetMetadata(state); } @@ -215,7 +215,7 @@ void BenchValueCountsDictionaryChunks(benchmark::State& state, const ParamType& auto chunked_array = std::make_shared(chunks); while (state.KeepRunning()) { - ABORT_NOT_OK(ValueCounts(chunked_array).status()); + ABORT_NOT_OK(ValueCounts(chunked_array)); } params.SetMetadata(state); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc index 0a966a66f4f..b0fa296e007 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -43,8 +43,6 @@ #include "arrow/compute/api.h" #include "arrow/compute/kernels/test_util_internal.h" -#include "arrow/ipc/json_simple.h" - namespace arrow { using internal::checked_cast; diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 1c5cced3597..d515e60fbee 100644 --- 
a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -20,6 +20,7 @@ #include "arrow/array/array_base.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/result.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index e7c92d78a07..2c61afcc25a 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -28,6 +28,7 @@ #include "arrow/compute/kernels/base_arithmetic_internal.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/compute/util.h" #include "arrow/status.h" #include "arrow/type.h" @@ -147,7 +148,7 @@ void RegisterPairwiseDiffKernels(std::string_view func_name, doc, GetDefaultPairwiseOptions()); auto base_func_result = registry->GetFunction(std::string(base_func_name)); - DCHECK_OK(base_func_result.status()); + DCHECK_OK(base_func_result); const auto& base_func = checked_cast(**base_func_result); DCHECK_EQ(base_func.arity().num_args, 2); diff --git a/cpp/src/arrow/compute/kernels/vector_partition_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_partition_benchmark.cc index 7f199b25b76..7d0418e029d 100644 --- a/cpp/src/arrow/compute/kernels/vector_partition_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_partition_benchmark.cc @@ -31,7 +31,7 @@ constexpr auto kSeed = 0x0ff1ce; static void NthToIndicesBenchmark(benchmark::State& state, const std::shared_ptr& values, int64_t n) { for (auto _ : state) { - ABORT_NOT_OK(NthToIndices(*values, n).status()); + ABORT_NOT_OK(NthToIndices(*values, n)); } state.SetItemsProcessed(state.iterations() * values->length()); } diff --git 
a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 81b7640b0fe..ef7419ea7c5 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -21,6 +21,7 @@ #include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/logging_internal.h" #include "arrow/util/math_internal.h" @@ -381,9 +382,13 @@ class RankMetaFunction : public RankMetaFunctionBase { } RankMetaFunction() - : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, &kDefaultOptions) {} + : RankMetaFunctionBase("rank", Arity::Unary(), rank_doc, GetDefaultOptions()) {} - static inline const auto kDefaultOptions = RankOptions::Defaults(); + private: + static const RankOptions* GetDefaultOptions() { + static const auto kDefaultOptions = RankOptions::Defaults(); + return &kDefaultOptions; + } }; class RankQuantileMetaFunction : public RankMetaFunctionBase { @@ -398,9 +403,13 @@ class RankQuantileMetaFunction : public RankMetaFunctionBase { @@ -415,9 +424,13 @@ class RankNormalMetaFunction : public RankMetaFunctionBase signature, ArrayKernelExec exec, VectorKernel::ChunkedExec exec_chunked, FunctionRegistry* registry, VectorFunction* func) { @@ -842,6 +842,7 @@ void RegisterVectorFunction(FunctionRegistry* registry, } add_primitive_kernel(null()); add_primitive_kernel(boolean()); + add_primitive_kernel(float16()); AddKernel(Type::FIXED_SIZE_BINARY, Functor::GetSignature(Type::FIXED_SIZE_BINARY), Functor::Exec, ChunkedFunctor::Exec, @@ -867,6 +868,8 @@ void RegisterVectorFunction(FunctionRegistry* registry, // TODO(ARROW-9431): "replace_with_indices" } +} // namespace + const FunctionDoc replace_with_mask_doc( "Replace items selected with a mask", ("Given an array and a boolean mask (either scalar or of equal length),\n" diff --git 
a/cpp/src/arrow/compute/kernels/vector_replace_test.cc b/cpp/src/arrow/compute/kernels/vector_replace_test.cc index 31afbff3367..587b9f2a60e 100644 --- a/cpp/src/arrow/compute/kernels/vector_replace_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_replace_test.cc @@ -233,8 +233,9 @@ class TestReplaceBinary : public TestReplaceKernel { using NumericBasedTypes = ::testing::Types; + Int32Type, Int64Type, HalfFloatType, FloatType, DoubleType, + Date32Type, Date64Type, Time32Type, Time64Type, TimestampType, + MonthIntervalType>; TYPED_TEST_SUITE(TestReplaceNumeric, NumericBasedTypes); TYPED_TEST_SUITE(TestReplaceDecimal, DecimalArrowTypes); @@ -1490,6 +1491,7 @@ TYPED_TEST(TestFillNullNumeric, FillNullForwardLargeInput) { ASSERT_OK_AND_ASSIGN(auto array_null, MakeArrayOfNull(array_random->type(), len_null)); auto array_null_filled = ConstantArrayGenerator::Numeric(len_null, x_ptr[len_random - 1]); + ASSERT_NE(array_null_filled, nullptr); { ASSERT_OK_AND_ASSIGN(auto value_array, Concatenate({array_random, array_null, array_random})); diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc index 1eaf8588766..bc8b25de4ec 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc @@ -21,6 +21,7 @@ #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/ree_util_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging_internal.h" @@ -489,6 +490,7 @@ static ArrayKernelExec GenerateREEKernelExec(Type::type type_id) { return Functor::template Exec; case Type::UINT16: case Type::INT16: + case Type::HALF_FLOAT: return Functor::template Exec; case Type::UINT32: case Type::INT32: @@ -496,6 +498,7 @@ static ArrayKernelExec GenerateREEKernelExec(Type::type type_id) { case Type::DATE32: 
case Type::TIME32: case Type::INTERVAL_MONTHS: + case Type::DECIMAL32: return Functor::template Exec; case Type::UINT64: case Type::INT64: @@ -505,6 +508,7 @@ static ArrayKernelExec GenerateREEKernelExec(Type::type type_id) { case Type::TIME64: case Type::DURATION: case Type::INTERVAL_DAY_TIME: + case Type::DECIMAL64: return Functor::template Exec; case Type::INTERVAL_MONTH_DAY_NANO: return Functor::template Exec; @@ -563,6 +567,7 @@ void RegisterVectorRunEndEncode(FunctionRegistry* registry) { for (const auto& ty : NumericTypes()) { add_kernel(ty->id()); } + add_kernel(Type::HALF_FLOAT); add_kernel(Type::DATE32); add_kernel(Type::DATE64); add_kernel(Type::TIME32); @@ -572,8 +577,9 @@ void RegisterVectorRunEndEncode(FunctionRegistry* registry) { for (const auto& ty : IntervalTypes()) { add_kernel(ty->id()); } - add_kernel(Type::DECIMAL128); - add_kernel(Type::DECIMAL256); + for (const auto& type_id : DecimalTypeIds()) { + add_kernel(type_id); + } add_kernel(Type::FIXED_SIZE_BINARY); add_kernel(Type::STRING); add_kernel(Type::BINARY); @@ -604,6 +610,7 @@ void RegisterVectorRunEndDecode(FunctionRegistry* registry) { for (const auto& ty : NumericTypes()) { add_kernel(ty->id()); } + add_kernel(Type::HALF_FLOAT); add_kernel(Type::DATE32); add_kernel(Type::DATE64); add_kernel(Type::TIME32); @@ -613,8 +620,9 @@ void RegisterVectorRunEndDecode(FunctionRegistry* registry) { for (const auto& ty : IntervalTypes()) { add_kernel(ty->id()); } - add_kernel(Type::DECIMAL128); - add_kernel(Type::DECIMAL256); + for (const auto& type_id : DecimalTypeIds()) { + add_kernel(type_id); + } add_kernel(Type::FIXED_SIZE_BINARY); add_kernel(Type::STRING); add_kernel(Type::BINARY); diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc index 7f19321fe7a..a78e9fe957b 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc @@ -356,6 
+356,13 @@ std::vector GenerateTestData() { REETestData::TypeMinMaxNull(), REETestData::TypeMinMaxNull(), REETestData::TypeMinMaxNull(), + // Float types + REETestData::JSON(float16(), "[1, 1, 0, -5, -5, -5, 255, 255]", "[1, 0, -5, 255]", + "[2, 3, 6, 8]"), + REETestData::JSON(float32(), "[1, 1, 0, -5, -5, -5, 255, 255]", "[1, 0, -5, 255]", + "[2, 3, 6, 8]"), + REETestData::JSON(float64(), "[1, 1, 0, -5, -5, -5, 255, 255]", "[1, 0, -5, 255]", + "[2, 3, 6, 8]"), // A few temporal types REETestData::JSON(date32(), "[86400, 86400, 0, 432000, 432000, 432000, 22075200, 22075200]", @@ -369,6 +376,12 @@ std::vector GenerateTestData() { REETestData::JSON(time64(TimeUnit::MICRO), "[1, 1, 0, 5, 5, 5, 255, 255]", "[1, 0, 5, 255]", "[2, 3, 6, 8]"), // Decimal and fixed size binary types + REETestData::JSON(decimal32(4, 1), + R"(["1.0", "1.0", "0.0", "5.2", "5.2", "5.2", "255.0", "255.0"])", + R"(["1.0", "0.0", "5.2", "255.0"])", "[2, 3, 6, 8]"), + REETestData::JSON(decimal64(4, 1), + R"(["1.0", "1.0", "0.0", "5.2", "5.2", "5.2", "255.0", "255.0"])", + R"(["1.0", "0.0", "5.2", "255.0"])", "[2, 3, 6, 8]"), REETestData::JSON(decimal128(4, 1), R"(["1.0", "1.0", "0.0", "5.2", "5.2", "5.2", "255.0", "255.0"])", R"(["1.0", "0.0", "5.2", "255.0"])", "[2, 3, 6, 8]"), diff --git a/cpp/src/arrow/compute/kernels/vector_select_k.cc b/cpp/src/arrow/compute/kernels/vector_select_k.cc index eba7873e510..591a2509673 100644 --- a/cpp/src/arrow/compute/kernels/vector_select_k.cc +++ b/cpp/src/arrow/compute/kernels/vector_select_k.cc @@ -20,6 +20,7 @@ #include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/logging_internal.h" namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 6c6f1b36b84..6f8dd52a4fd 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ 
b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -34,6 +34,7 @@ #include "arrow/compute/kernels/util_internal.h" #include "arrow/compute/kernels/vector_selection_filter_internal.h" #include "arrow/compute/kernels/vector_selection_take_internal.h" +#include "arrow/compute/registry_internal.h" #include "arrow/extension_type.h" #include "arrow/record_batch.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc index 040dfc9656d..b9c8480d988 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc @@ -211,7 +211,7 @@ struct TakeBenchmark { } for (auto _ : state) { - ABORT_NOT_OK(Take(values, indices).status()); + ABORT_NOT_OK(Take(values, indices)); } state.SetItemsProcessed(state.iterations() * num_indices); state.counters["selection_factor"] = selection_factor; @@ -253,11 +253,11 @@ struct TakeBenchmark { if (chunk_indices_too) { for (auto _ : state) { - ABORT_NOT_OK(Take(values, chunked_indices).status()); + ABORT_NOT_OK(Take(values, chunked_indices)); } } else { for (auto _ : state) { - ABORT_NOT_OK(Take(values, indices).status()); + ABORT_NOT_OK(Take(values, indices)); } } state.SetItemsProcessed(state.iterations() * num_indices); @@ -321,7 +321,7 @@ struct FilterBenchmark { auto filter = rand.Boolean(values->length(), args.selected_proportion, args.filter_null_proportion); for (auto _ : state) { - ABORT_NOT_OK(Filter(values, filter).status()); + ABORT_NOT_OK(Filter(values, filter)); } state.SetItemsProcessed(state.iterations() * values->length()); } @@ -356,7 +356,7 @@ struct FilterBenchmark { auto batch = RecordBatch::Make(schema(fields), num_rows, columns); for (auto _ : state) { - ABORT_NOT_OK(Filter(batch, filter).status()); + ABORT_NOT_OK(Filter(batch, filter)); } state.SetItemsProcessed(state.iterations() * num_rows); } diff --git 
a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 194c3591337..1c2eacb9a76 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -1096,6 +1096,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(match::LargeBinaryLike()), plain_filter, BinaryFilterExec}, {InputType(null()), plain_filter, NullFilterExec}, {InputType(Type::FIXED_SIZE_BINARY), plain_filter, PrimitiveFilterExec}, + {InputType(Type::DECIMAL32), plain_filter, PrimitiveFilterExec}, + {InputType(Type::DECIMAL64), plain_filter, PrimitiveFilterExec}, {InputType(Type::DECIMAL128), plain_filter, PrimitiveFilterExec}, {InputType(Type::DECIMAL256), plain_filter, PrimitiveFilterExec}, {InputType(Type::DICTIONARY), plain_filter, DictionaryFilterExec}, @@ -1116,6 +1118,8 @@ void PopulateFilterKernels(std::vector* out) { {InputType(match::LargeBinaryLike()), ree_filter, BinaryFilterExec}, {InputType(null()), ree_filter, NullFilterExec}, {InputType(Type::FIXED_SIZE_BINARY), ree_filter, PrimitiveFilterExec}, + {InputType(Type::DECIMAL32), ree_filter, PrimitiveFilterExec}, + {InputType(Type::DECIMAL64), ree_filter, PrimitiveFilterExec}, {InputType(Type::DECIMAL128), ree_filter, PrimitiveFilterExec}, {InputType(Type::DECIMAL256), ree_filter, PrimitiveFilterExec}, {InputType(Type::DICTIONARY), ree_filter, DictionaryFilterExec}, diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 28868849fc5..41cb0a357a4 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -20,6 +20,7 @@ #include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/logging_internal.h" namespace arrow { diff 
--git a/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc index 5c31d4da38a..787b32b6363 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_benchmark.cc @@ -309,7 +309,7 @@ static void ChunkedArraySortIndicesString(benchmark::State& state) { static void DatumSortIndicesBenchmark(benchmark::State& state, const Datum& datum, const SortOptions& options) { for (auto _ : state) { - ABORT_NOT_OK(SortIndices(datum, options).status()); + ABORT_NOT_OK(SortIndices(datum, options)); } } diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 4ce0b5a616e..0569f1f2abb 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -30,6 +30,7 @@ #include "arrow/array/concatenate.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util_internal.h" +#include "arrow/compute/registry.h" #include "arrow/result.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" @@ -2317,6 +2318,11 @@ class TestRank : public BaseTestRank { } }; +TEST_F(TestRank, DefaultOptions) { + ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction("rank")); + ASSERT_STREQ(function->default_options()->type_name(), "RankOptions"); +} + TEST_F(TestRank, Real) { for (auto real_type : ::arrow::FloatingPointTypes()) { SetInput(ArrayFromJSON(real_type, "[2.1, 3.2, 1.0, 0.0, 5.5]")); @@ -2635,6 +2641,12 @@ class TestRankQuantile : public BaseTestRank { } }; +TEST_F(TestRankQuantile, DefaultOptions) { + ASSERT_OK_AND_ASSIGN(auto function, + GetFunctionRegistry()->GetFunction("rank_quantile")); + ASSERT_STREQ(function->default_options()->type_name(), "RankQuantileOptions"); +} + TEST_F(TestRankQuantile, Real) { for (auto type : ::arrow::FloatingPointTypes()) { AssertRankQuantileNumeric(type); @@ -2675,5 +2687,12 @@ 
TEST_F(TestRankQuantile, FixedSizeBinary) { AssertRankQuantile_N1N2N(); } +class TestRankNormal : public BaseTestRank {}; + +TEST_F(TestRankNormal, DefaultOptions) { + ASSERT_OK_AND_ASSIGN(auto function, GetFunctionRegistry()->GetFunction("rank_normal")); + ASSERT_STREQ(function->default_options()->type_name(), "RankQuantileOptions"); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_statistics.cc b/cpp/src/arrow/compute/kernels/vector_statistics.cc index 3965b7f84b9..074f2ec0a73 100644 --- a/cpp/src/arrow/compute/kernels/vector_statistics.cc +++ b/cpp/src/arrow/compute/kernels/vector_statistics.cc @@ -27,6 +27,7 @@ #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/result.h" #include "arrow/scalar.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/kernels/vector_swizzle.cc b/cpp/src/arrow/compute/kernels/vector_swizzle.cc index 0e6a4e0a2f3..aa82f55c2b8 100644 --- a/cpp/src/arrow/compute/kernels/vector_swizzle.cc +++ b/cpp/src/arrow/compute/kernels/vector_swizzle.cc @@ -19,6 +19,7 @@ #include "arrow/compute/function.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/registry_internal.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging_internal.h" diff --git a/cpp/src/arrow/compute/kernels/vector_topk_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_topk_benchmark.cc index 44452471b25..f0248d0d8d2 100644 --- a/cpp/src/arrow/compute/kernels/vector_topk_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_topk_benchmark.cc @@ -33,7 +33,7 @@ constexpr auto kSeed = 0x0ff1ce; static void SelectKBenchmark(benchmark::State& state, const std::shared_ptr& values, int64_t k) { for (auto _ : state) { - ABORT_NOT_OK(SelectKUnstable(*values, SelectKOptions::TopKDefault(k)).status()); + 
ABORT_NOT_OK(SelectKUnstable(*values, SelectKOptions::TopKDefault(k))); } state.SetItemsProcessed(state.iterations() * values->length()); } diff --git a/cpp/src/arrow/compute/key_hash_internal.h b/cpp/src/arrow/compute/key_hash_internal.h index 769f3b2145e..d141603ce0f 100644 --- a/cpp/src/arrow/compute/key_hash_internal.h +++ b/cpp/src/arrow/compute/key_hash_internal.h @@ -21,6 +21,7 @@ #include "arrow/compute/light_array_internal.h" #include "arrow/compute/util.h" +#include "arrow/compute/visibility.h" #include "arrow/util/simd.h" namespace arrow { @@ -34,7 +35,7 @@ enum class BloomFilterBuildStrategy; // Implementations are based on xxh3 32-bit algorithm description from: // https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md // -class ARROW_EXPORT Hashing32 { +class ARROW_COMPUTE_EXPORT Hashing32 { friend class TestVectorHash; template friend void TestBloomLargeHashHelper(int64_t, int64_t, const std::vector&, @@ -157,7 +158,7 @@ class ARROW_EXPORT Hashing32 { #endif }; -class ARROW_EXPORT Hashing64 { +class ARROW_COMPUTE_EXPORT Hashing64 { friend class TestVectorHash; template friend void TestBloomLargeHashHelper(int64_t, int64_t, const std::vector&, diff --git a/cpp/src/arrow/compute/key_map_internal.h b/cpp/src/arrow/compute/key_map_internal.h index c558ef5c2a6..27583e82ade 100644 --- a/cpp/src/arrow/compute/key_map_internal.h +++ b/cpp/src/arrow/compute/key_map_internal.h @@ -22,6 +22,7 @@ #include "arrow/compute/util.h" #include "arrow/compute/util_internal.h" +#include "arrow/compute/visibility.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" @@ -37,7 +38,7 @@ namespace compute { // slots, stamps) and operations provided by this class is given in the document: // arrow/acero/doc/key_map.md. 
// -class ARROW_EXPORT SwissTable { +class ARROW_COMPUTE_EXPORT SwissTable { friend class SwissTableMerge; public: diff --git a/cpp/src/arrow/compute/light_array_internal.cc b/cpp/src/arrow/compute/light_array_internal.cc index e4b1f1b8cdd..322f4bd7fdb 100644 --- a/cpp/src/arrow/compute/light_array_internal.cc +++ b/cpp/src/arrow/compute/light_array_internal.cc @@ -611,11 +611,9 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr& source }); Visit(source, num_rows_to_append - num_rows_to_process, row_ids + num_rows_to_process, [&](int i, const uint8_t* ptr, int32_t num_bytes) { - uint64_t* dst = reinterpret_cast( - target->mutable_data(2) + - offsets[num_rows_before + num_rows_to_process + i]); - const uint64_t* src = reinterpret_cast(ptr); - memcpy(dst, src, num_bytes); + auto dst = target->mutable_data(2) + + offsets[num_rows_before + num_rows_to_process + i]; + memcpy(dst, ptr, num_bytes); }); } diff --git a/cpp/src/arrow/compute/light_array_internal.h b/cpp/src/arrow/compute/light_array_internal.h index cf7b95cbe74..ecd7e758ecd 100644 --- a/cpp/src/arrow/compute/light_array_internal.h +++ b/cpp/src/arrow/compute/light_array_internal.h @@ -23,6 +23,7 @@ #include "arrow/compute/exec.h" #include "arrow/compute/util.h" #include "arrow/compute/util_internal.h" +#include "arrow/compute/visibility.h" #include "arrow/type.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" @@ -53,7 +54,7 @@ struct LightContext { /// and no children. 
/// /// This metadata object is a zero-allocation analogue of arrow::DataType -struct ARROW_EXPORT KeyColumnMetadata { +struct ARROW_COMPUTE_EXPORT KeyColumnMetadata { KeyColumnMetadata() = default; KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in, bool is_null_type_in = false) @@ -81,7 +82,7 @@ struct ARROW_EXPORT KeyColumnMetadata { /// A "key" column is a non-nested, non-union column \see KeyColumnMetadata /// /// This metadata object is a zero-allocation analogue of arrow::ArrayData -class ARROW_EXPORT KeyColumnArray { +class ARROW_COMPUTE_EXPORT KeyColumnArray { public: /// \brief Create an uninitialized KeyColumnArray KeyColumnArray() = default; @@ -218,7 +219,7 @@ class ARROW_EXPORT KeyColumnArray { /// /// This should only be called on "key" columns. Calling this with /// a non-key column will return Status::TypeError. -ARROW_EXPORT Result ColumnMetadataFromDataType( +ARROW_COMPUTE_EXPORT Result ColumnMetadataFromDataType( const std::shared_ptr& type); /// \brief Create KeyColumnArray from ArrayData @@ -228,7 +229,7 @@ ARROW_EXPORT Result ColumnMetadataFromDataType( /// /// The caller should ensure this is only called on "key" columns. /// \see ColumnMetadataFromDataType for details -ARROW_EXPORT Result ColumnArrayFromArrayData( +ARROW_COMPUTE_EXPORT Result ColumnArrayFromArrayData( const std::shared_ptr& array_data, int64_t start_row, int64_t num_rows); /// \brief Create KeyColumnArray from ArrayData and KeyColumnMetadata @@ -238,7 +239,7 @@ ARROW_EXPORT Result ColumnArrayFromArrayData( /// /// The caller should ensure this is only called on "key" columns. 
/// \see ColumnMetadataFromDataType for details -ARROW_EXPORT KeyColumnArray ColumnArrayFromArrayDataAndMetadata( +ARROW_COMPUTE_EXPORT KeyColumnArray ColumnArrayFromArrayDataAndMetadata( const std::shared_ptr& array_data, const KeyColumnMetadata& metadata, int64_t start_row, int64_t num_rows); @@ -248,7 +249,7 @@ ARROW_EXPORT KeyColumnArray ColumnArrayFromArrayDataAndMetadata( /// /// All columns in `batch` must be eligible "key" columns and have an array shape /// \see ColumnMetadataFromDataType for more details -ARROW_EXPORT Status ColumnMetadatasFromExecBatch( +ARROW_COMPUTE_EXPORT Status ColumnMetadatasFromExecBatch( const ExecBatch& batch, std::vector* column_metadatas); /// \brief Create KeyColumnArray instances from a slice of an ExecBatch @@ -257,9 +258,9 @@ ARROW_EXPORT Status ColumnMetadatasFromExecBatch( /// /// All columns in `batch` must be eligible "key" columns and have an array shape /// \see ColumnArrayFromArrayData for more details -ARROW_EXPORT Status ColumnArraysFromExecBatch(const ExecBatch& batch, int64_t start_row, - int64_t num_rows, - std::vector* column_arrays); +ARROW_COMPUTE_EXPORT Status +ColumnArraysFromExecBatch(const ExecBatch& batch, int64_t start_row, int64_t num_rows, + std::vector* column_arrays); /// \brief Create KeyColumnArray instances from an ExecBatch /// @@ -267,8 +268,8 @@ ARROW_EXPORT Status ColumnArraysFromExecBatch(const ExecBatch& batch, int64_t st /// /// All columns in `batch` must be eligible "key" columns and have an array shape /// \see ColumnArrayFromArrayData for more details -ARROW_EXPORT Status ColumnArraysFromExecBatch(const ExecBatch& batch, - std::vector* column_arrays); +ARROW_COMPUTE_EXPORT Status ColumnArraysFromExecBatch( + const ExecBatch& batch, std::vector* column_arrays); /// A lightweight resizable array for "key" columns /// @@ -276,7 +277,7 @@ ARROW_EXPORT Status ColumnArraysFromExecBatch(const ExecBatch& batch, /// /// Resizing is handled by arrow::ResizableBuffer and a doubling approach is 
/// used so that resizes will always grow up to the next power of 2 -class ARROW_EXPORT ResizableArrayData { +class ARROW_COMPUTE_EXPORT ResizableArrayData { public: /// \brief Create an uninitialized instance /// @@ -372,7 +373,7 @@ class ARROW_EXPORT ResizableArrayData { /// \brief A builder to concatenate batches of data into a larger batch /// /// Will only store num_rows_max() rows -class ARROW_EXPORT ExecBatchBuilder { +class ARROW_COMPUTE_EXPORT ExecBatchBuilder { public: /// \brief Add rows from `source` into `target` column /// diff --git a/cpp/src/arrow/compute/meson.build b/cpp/src/arrow/compute/meson.build new file mode 100644 index 00000000000..fed699e1ca4 --- /dev/null +++ b/cpp/src/arrow/compute/meson.build @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Meson does not allow you to glob for headers to install. 
See also +# https://mesonbuild.com/FAQ.html#why-cant-i-specify-target-files-with-a-wildcard +# install_subdir would be usable if the directory only contained headers + +install_headers( + [ + 'api.h', + 'api_scalar.h', + 'api_vector.h', + 'cast.h', + 'exec.h', + 'expression.h', + 'function.h', + 'function_options.h', + 'kernel.h', + 'ordering.h', + 'registry.h', + 'type_fwd.h', + 'util.h', + ], + subdir: 'arrow/compute', +) + +if needs_compute + pkg.generate( + filebase: 'arrow-compute', + name: 'Apache Arrow Compute Kernels', + description: 'Apache Arrow\'s Compute Kernels', + requires: ['arrow'], + ) +endif + +# Define arrow_compute_core_testing object library for common test files requiring +# only core compute. No extra kernels are required. +if needs_testing + arrow_compute_core_test_lib = library( + 'arrow-compute-core-testing', + sources: files('test_util_internal.cc'), + dependencies: arrow_test_dep, + ) + arrow_compute_core_test_dep = declare_dependency( + link_with: arrow_compute_core_test_lib, + ) +else + arrow_compute_core_test_dep = disabler() +endif + +# Define arrow_compute_testing object library for test files requiring extra kernels. 
+if needs_testing and needs_compute + arrow_compute_testing_lib = library( + 'arrow-compute-testing', + sources: files('test_env.cc'), + dependencies: [ + arrow_compute_dep, + arrow_compute_core_test_dep, + arrow_test_dep_no_main, + ], + ) + arrow_compute_test_dep = declare_dependency( + link_with: arrow_compute_testing_lib, + dependencies: [ + arrow_compute_dep, + arrow_compute_core_test_dep, + arrow_test_dep_no_main, + ], + ) +else + arrow_compute_test_dep = disabler() +endif + +exc = executable( + 'arrow-internals-test', + sources: [ + 'function_test.cc', + 'exec_test.cc', + 'kernel_test.cc', + 'registry_test.cc', + ], + dependencies: [arrow_compute_core_test_dep, arrow_test_dep], +) +test('arrow-internals-test', exc) + +compute_tests = { + 'arrow-compute-expression-test': {'sources': ['expression_test.cc']}, + 'arrow-compute-row-test': { + 'sources': [ + 'key_hash_test.cc', + 'light_array_test.cc', + 'row/compare_test.cc', + 'row/grouper_test.cc', + 'row/row_encoder_internal_test.cc', + 'row/row_test.cc', + 'util_internal_test.cc', + ], + }, +} + +# This will only add the test if ARROW_COMPUTE is enabled, meaning the full kernel registry is available. 
+# +# The following kernels are always present in default builds: +# - array_filter +# - array_take +# - cast +# - dictionary_encode +# - drop_null +# - filter +# - indices_nonzero +# - take +# - unique +# - value_counts +# +# Also see: GH-34388, GH-34615 +foreach key, val : compute_tests + exc = executable( + key, + sources: val['sources'], + dependencies: [arrow_compute_test_dep], + ) + test(key, exc) +endforeach + +exc = executable( + 'arrow-compute-function-benchmark', + sources: ['function_benchmark.cc'], + dependencies: [arrow_benchmark_dep], +) +benchmark('arrow-compute-function-benchmark', exc) + +subdir('kernels') + +subdir('row') diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc index b4f1c0f2f97..be0ebd32016 100644 --- a/cpp/src/arrow/compute/registry.cc +++ b/cpp/src/arrow/compute/registry.cc @@ -287,55 +287,12 @@ static std::unique_ptr CreateBuiltInRegistry() { RegisterDictionaryDecode(registry.get()); RegisterVectorHash(registry.get()); RegisterVectorSelection(registry.get()); + RegisterVectorSwizzle(registry.get()); RegisterScalarOptions(registry.get()); RegisterVectorOptions(registry.get()); RegisterAggregateOptions(registry.get()); -#ifdef ARROW_COMPUTE - // Register additional kernels - - // Scalar functions - RegisterScalarArithmetic(registry.get()); - RegisterScalarBoolean(registry.get()); - RegisterScalarComparison(registry.get()); - RegisterScalarIfElse(registry.get()); - RegisterScalarNested(registry.get()); - RegisterScalarRandom(registry.get()); // Nullary - RegisterScalarRoundArithmetic(registry.get()); - RegisterScalarSetLookup(registry.get()); - RegisterScalarStringAscii(registry.get()); - RegisterScalarStringUtf8(registry.get()); - RegisterScalarTemporalBinary(registry.get()); - RegisterScalarTemporalUnary(registry.get()); - RegisterScalarValidity(registry.get()); - - // Vector functions - RegisterVectorArraySort(registry.get()); - RegisterVectorCumulativeSum(registry.get()); - 
RegisterVectorNested(registry.get()); - RegisterVectorRank(registry.get()); - RegisterVectorReplace(registry.get()); - RegisterVectorSelectK(registry.get()); - RegisterVectorSort(registry.get()); - RegisterVectorRunEndEncode(registry.get()); - RegisterVectorRunEndDecode(registry.get()); - RegisterVectorPairwise(registry.get()); - RegisterVectorStatistics(registry.get()); - RegisterVectorSwizzle(registry.get()); - - // Aggregate functions - RegisterHashAggregateBasic(registry.get()); - RegisterHashAggregateNumeric(registry.get()); - RegisterHashAggregatePivot(registry.get()); - RegisterScalarAggregateBasic(registry.get()); - RegisterScalarAggregateMode(registry.get()); - RegisterScalarAggregatePivot(registry.get()); - RegisterScalarAggregateQuantile(registry.get()); - RegisterScalarAggregateTDigest(registry.get()); - RegisterScalarAggregateVariance(registry.get()); -#endif - return registry; } diff --git a/cpp/src/arrow/compute/row/CMakeLists.txt b/cpp/src/arrow/compute/row/CMakeLists.txt index 747fd0a92d9..542dc314806 100644 --- a/cpp/src/arrow/compute/row/CMakeLists.txt +++ b/cpp/src/arrow/compute/row/CMakeLists.txt @@ -20,6 +20,11 @@ arrow_install_all_headers("arrow/compute/row") -if(ARROW_COMPUTE) +if(ARROW_BUILD_BENCHMARKS AND ARROW_COMPUTE) add_arrow_benchmark(grouper_benchmark PREFIX "arrow-compute") + if(ARROW_BUILD_STATIC) + target_link_libraries(arrow-compute-grouper-benchmark PUBLIC arrow_compute_static) + else() + target_link_libraries(arrow-compute-grouper-benchmark PUBLIC arrow_compute_shared) + endif() endif() diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 72d408ba5f7..da2dfca1d82 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -276,7 +276,11 @@ void KeyCompare::CompareVarBinaryColumnToRowHelper( int32_t tail_length = length - j * 8; uint64_t tail_mask = ~0ULL >> (64 - 8 * tail_length); uint64_t key_left = 0; - 
std::memcpy(&key_left, key_left_ptr + j, tail_length); + // NOTE: UBSAN may falsely report "misaligned load" in `std::memcpy` on some + // platforms when using 64-bit pointers. Cast to an 8-bit pointer to work around + // this. + const uint8_t* src_bytes = reinterpret_cast(key_left_ptr + j); + std::memcpy(&key_left, src_bytes, tail_length); uint64_t key_right = key_right_ptr[j]; result_or |= tail_mask & (key_left ^ key_right); } diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index 29d7f859e59..264ef69b39f 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -23,6 +23,7 @@ #include "arrow/compute/row/encode_internal.h" #include "arrow/compute/row/row_internal.h" #include "arrow/compute/util.h" +#include "arrow/compute/visibility.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" @@ -30,7 +31,7 @@ namespace arrow { namespace compute { -class ARROW_EXPORT KeyCompare { +class ARROW_COMPUTE_EXPORT KeyCompare { public: // Clarify the max temp stack usage for CompareColumnsToRows, which might be necessary // for the caller to be aware of (possibly at compile time) to reserve enough stack size diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 8af84ac6b2f..c127354b0be 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -272,8 +272,8 @@ inline uint64_t CompareSelected8_avx2(const uint8_t* left_base, const uint8_t* r ARROW_DCHECK(false); } - __m128i right_lo = _mm256_i64gather_epi32((int const*)right_base, offset_right_lo, 1); - __m128i right_hi = _mm256_i64gather_epi32((int const*)right_base, offset_right_hi, 1); + __m128i right_lo = _mm256_i64gather_epi32((const int*)right_base, offset_right_lo, 1); + __m128i right_hi = _mm256_i64gather_epi32((const int*)right_base, offset_right_hi, 
1); __m256i right = _mm256_set_m128i(right_hi, right_lo); if (column_width != sizeof(uint32_t)) { constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; @@ -318,8 +318,8 @@ inline uint64_t Compare8_avx2(const uint8_t* left_base, const uint8_t* right_bas ARROW_DCHECK(false); } - __m128i right_lo = _mm256_i64gather_epi32((int const*)right_base, offset_right_lo, 1); - __m128i right_hi = _mm256_i64gather_epi32((int const*)right_base, offset_right_hi, 1); + __m128i right_lo = _mm256_i64gather_epi32((const int*)right_base, offset_right_lo, 1); + __m128i right_hi = _mm256_i64gather_epi32((const int*)right_base, offset_right_hi, 1); __m256i right = _mm256_set_m128i(right_hi, right_lo); if (column_width != sizeof(uint32_t)) { constexpr uint32_t mask = column_width == 0 || column_width == 1 ? 0xff : 0xffff; diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 2b8f4d97561..9bc839cf424 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -386,8 +386,8 @@ TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsOver4GBFixedLength)) { RowTableImpl row_table_right, RepeatRowTableUntil(MakeRowTableFromExecBatch(batch_left).ValueUnsafe(), num_rows_row_table)); - // The row table must not contain a third buffer. - ASSERT_EQ(row_table_right.var_length_rows(), NULLPTR); + // The row table must be fixed length. + ASSERT_TRUE(row_table_right.metadata().is_fixed_length); // The row data must be greater than 4GB. 
ASSERT_GT(row_table_right.buffer_size(1), k4GB); diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 5ad82e0c8e7..6bfb87e6f84 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -26,6 +26,7 @@ #include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_internal.h" #include "arrow/compute/util.h" +#include "arrow/compute/visibility.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" @@ -44,7 +45,7 @@ namespace compute { /// be accessed together, as in the case of hash table key. /// /// Does not support nested types -class ARROW_EXPORT RowTableEncoder { +class ARROW_COMPUTE_EXPORT RowTableEncoder { public: void Init(const std::vector& cols, int row_alignment, int string_alignment); diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h index 7554e5ef159..9424559385b 100644 --- a/cpp/src/arrow/compute/row/grouper.h +++ b/cpp/src/arrow/compute/row/grouper.h @@ -21,6 +21,7 @@ #include #include "arrow/compute/kernel.h" +#include "arrow/compute/visibility.h" #include "arrow/datum.h" #include "arrow/result.h" #include "arrow/util/visibility.h" @@ -36,7 +37,7 @@ namespace compute { /// same segment key within a given batch. When a segment group span cross batches, it /// will have multiple segments. A segment never spans cross batches. The segment data /// structure only makes sense when used along with a exec batch. -struct ARROW_EXPORT Segment { +struct ARROW_COMPUTE_EXPORT Segment { /// \brief the offset into the batch where the segment starts int64_t offset; /// \brief the length of the segment @@ -74,7 +75,7 @@ inline bool operator!=(const Segment& segment1, const Segment& segment2) { /// /// If the next call to the segmenter starts with `A A` then that segment would set the /// "extends" flag, which indicates whether the segment continues the last open batch. 
-class ARROW_EXPORT RowSegmenter { +class ARROW_COMPUTE_EXPORT RowSegmenter { public: virtual ~RowSegmenter() = default; @@ -101,7 +102,7 @@ class ARROW_EXPORT RowSegmenter { }; /// Consumes batches of keys and yields batches of the group ids. -class ARROW_EXPORT Grouper { +class ARROW_COMPUTE_EXPORT Grouper { public: virtual ~Grouper() = default; diff --git a/cpp/src/arrow/compute/row/grouper_internal.h b/cpp/src/arrow/compute/row/grouper_internal.h index eb3dfe8ba16..bce9ea1d3d5 100644 --- a/cpp/src/arrow/compute/row/grouper_internal.h +++ b/cpp/src/arrow/compute/row/grouper_internal.h @@ -20,7 +20,7 @@ namespace arrow { namespace compute { -ARROW_EXPORT Result> MakeAnyKeysSegmenter( +ARROW_COMPUTE_EXPORT Result> MakeAnyKeysSegmenter( const std::vector& key_types, ExecContext* ctx); } // namespace compute diff --git a/cpp/src/arrow/compute/row/meson.build b/cpp/src/arrow/compute/row/meson.build new file mode 100644 index 00000000000..237c12d8b3b --- /dev/null +++ b/cpp/src/arrow/compute/row/meson.build @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Contains utilities for working with Arrow data been stored +# in a row-major order. 
+ +install_headers(['grouper.h'], subdir: 'arrow/compute/row') + +if needs_compute + exc = executable( + 'arrow-compute-grouper-benchmark', + sources: ['grouper_benchmark.cc'], + dependencies: [arrow_compute_dep, arrow_benchmark_dep], + ) +endif diff --git a/cpp/src/arrow/compute/row/row_encoder_internal.h b/cpp/src/arrow/compute/row/row_encoder_internal.h index 2cb47d4a600..9337e78bf8a 100644 --- a/cpp/src/arrow/compute/row/row_encoder_internal.h +++ b/cpp/src/arrow/compute/row/row_encoder_internal.h @@ -20,6 +20,7 @@ #include #include "arrow/compute/kernels/codegen_internal.h" +#include "arrow/compute/visibility.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -29,7 +30,7 @@ using internal::checked_cast; namespace compute { namespace internal { -struct ARROW_EXPORT KeyEncoder { +struct ARROW_COMPUTE_EXPORT KeyEncoder { // the first byte of an encoded key is used to indicate nullity static constexpr bool kExtraByteForNull = true; @@ -85,7 +86,7 @@ struct ARROW_EXPORT KeyEncoder { } }; -struct ARROW_EXPORT BooleanKeyEncoder : KeyEncoder { +struct ARROW_COMPUTE_EXPORT BooleanKeyEncoder : KeyEncoder { static constexpr int kByteWidth = 1; void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override; @@ -101,7 +102,7 @@ struct ARROW_EXPORT BooleanKeyEncoder : KeyEncoder { MemoryPool* pool) override; }; -struct ARROW_EXPORT FixedWidthKeyEncoder : KeyEncoder { +struct ARROW_COMPUTE_EXPORT FixedWidthKeyEncoder : KeyEncoder { explicit FixedWidthKeyEncoder(std::shared_ptr type) : type_(std::move(type)), byte_width_(checked_cast(*type_).bit_width() / 8) {} @@ -122,7 +123,7 @@ struct ARROW_EXPORT FixedWidthKeyEncoder : KeyEncoder { const int byte_width_; }; -struct ARROW_EXPORT DictionaryKeyEncoder : FixedWidthKeyEncoder { +struct ARROW_COMPUTE_EXPORT DictionaryKeyEncoder : FixedWidthKeyEncoder { DictionaryKeyEncoder(std::shared_ptr type, MemoryPool* pool) : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {} @@ -251,7 +252,7 @@ struct 
VarLengthKeyEncoder : KeyEncoder { std::shared_ptr type_; }; -struct ARROW_EXPORT NullKeyEncoder : KeyEncoder { +struct ARROW_COMPUTE_EXPORT NullKeyEncoder : KeyEncoder { void AddLength(const ExecValue&, int64_t batch_length, int32_t* lengths) override {} void AddLengthNull(int32_t* length) override {} @@ -331,7 +332,7 @@ struct ARROW_EXPORT NullKeyEncoder : KeyEncoder { /// # Row Encoding /// /// The row format is the concatenation of the encodings of each column. -class ARROW_EXPORT RowEncoder { +class ARROW_COMPUTE_EXPORT RowEncoder { public: static constexpr int kRowIdForNulls() { return -1; } diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 0919773a228..219fcbc51f4 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -21,6 +21,7 @@ #include "arrow/buffer.h" #include "arrow/compute/light_array_internal.h" +#include "arrow/compute/visibility.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/logging.h" @@ -29,7 +30,7 @@ namespace arrow { namespace compute { /// Description of the data stored in a RowTable -struct ARROW_EXPORT RowTableMetadata { +struct ARROW_COMPUTE_EXPORT RowTableMetadata { using offset_type = int64_t; /// \brief True if there are no variable length columns in the table @@ -170,7 +171,7 @@ struct ARROW_EXPORT RowTableMetadata { /// Can store both fixed-size data types and variable-length data types /// /// The row table is not safe -class ARROW_EXPORT RowTableImpl { +class ARROW_COMPUTE_EXPORT RowTableImpl { public: using offset_type = RowTableMetadata::offset_type; diff --git a/cpp/src/arrow/compute/test_env.cc b/cpp/src/arrow/compute/test_env.cc new file mode 100644 index 00000000000..57e92763c6d --- /dev/null +++ b/cpp/src/arrow/compute/test_env.cc @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/initialize.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow::compute { + +namespace { + +class ComputeKernelEnvironment : public ::testing::Environment { + public: + // This must be done before using the compute kernels in order to + // register them to the FunctionRegistry. 
+ ComputeKernelEnvironment() : ::testing::Environment() {} + + void SetUp() override { ASSERT_OK(arrow::compute::Initialize()); } +}; + +} // namespace +} // namespace arrow::compute + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::testing::AddGlobalTestEnvironment(new arrow::compute::ComputeKernelEnvironment); + return RUN_ALL_TESTS(); +} diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 1aaff43e10e..51a24b50fe6 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -26,6 +26,7 @@ #include "arrow/compute/expression.h" #include "arrow/compute/type_fwd.h" +#include "arrow/compute/visibility.h" #include "arrow/result.h" #include "arrow/util/cpu_info.h" #include "arrow/util/simd.h" @@ -66,49 +67,54 @@ class MiniBatch { namespace bit_util { -ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags, - const int num_bits, const uint8_t* bits, - int* num_indexes, uint16_t* indexes, - int bit_offset = 0); +ARROW_COMPUTE_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + int* num_indexes, uint16_t* indexes, + int bit_offset = 0); -ARROW_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, - const int num_bits, const uint8_t* bits, - const uint16_t* input_indexes, int* num_indexes, - uint16_t* indexes, int bit_offset = 0); +ARROW_COMPUTE_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes, + int bit_offset = 0); // Input and output indexes may be pointing to the same data (in-place filtering). 
-ARROW_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, int* num_indexes_bit0, - uint16_t* indexes_bit0, uint16_t* indexes_bit1, - int bit_offset = 0); +ARROW_COMPUTE_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, int* num_indexes_bit0, + uint16_t* indexes_bit0, + uint16_t* indexes_bit1, int bit_offset = 0); // Bit 1 is replaced with byte 0xFF. -ARROW_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); +ARROW_COMPUTE_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, uint8_t* bytes, + int bit_offset = 0); // Return highest bit of each byte. -ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits, - const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); +ARROW_COMPUTE_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits, + const uint8_t* bytes, uint8_t* bits, + int bit_offset = 0); -ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, - uint32_t num_bytes); +ARROW_COMPUTE_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, + uint32_t num_bytes); #if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2) // The functions below use BMI2 instructions, be careful before calling! 
namespace avx2 { -ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, - const uint8_t* bits, - const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes); -ARROW_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits, - const uint8_t* bits, int* num_indexes, - uint16_t* indexes, uint16_t base_index = 0); -ARROW_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, - uint8_t* bytes); -ARROW_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, - uint8_t* bits); -ARROW_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); +ARROW_COMPUTE_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); +ARROW_COMPUTE_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, int* num_indexes, + uint16_t* indexes, + uint16_t base_index = 0); +ARROW_COMPUTE_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, + uint8_t* bytes); +ARROW_COMPUTE_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, + uint8_t* bits); +ARROW_COMPUTE_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes, + uint32_t num_bytes); } // namespace avx2 #endif @@ -143,7 +149,7 @@ Result ModifyExpression(Expression expr, const PreVisit& pre, ARROW_ASSIGN_OR_RAISE(auto modified_argument, ModifyExpression(call->arguments[i], pre, post_call)); - if (Identical(modified_argument, call->arguments[i])) { + if (Expression::Identical(modified_argument, call->arguments[i])) { continue; } diff --git a/cpp/src/arrow/compute/util_avx2.cc b/cpp/src/arrow/compute/util_avx2.cc index f0ff4575bbe..a554e0463f0 100644 --- a/cpp/src/arrow/compute/util_avx2.cc +++ b/cpp/src/arrow/compute/util_avx2.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/util.h" #include "arrow/util/bit_util.h" #include "arrow/util/logging.h" 
#include "arrow/util/simd.h" diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h index 5e5b15a5ff6..301fd4939b4 100644 --- a/cpp/src/arrow/compute/util_internal.h +++ b/cpp/src/arrow/compute/util_internal.h @@ -17,6 +17,7 @@ #pragma once +#include "arrow/compute/visibility.h" #include "arrow/status.h" #include "arrow/type_fwd.h" #include "arrow/util/logging.h" @@ -34,7 +35,7 @@ void CheckAlignment(const void* ptr) { /// Temporary vectors should resemble allocating temporary variables on the stack /// but in the context of vectorized processing where we need to store a vector of /// temporaries instead of a single value. -class ARROW_EXPORT TempVectorStack { +class ARROW_COMPUTE_EXPORT TempVectorStack { template friend class TempVectorHolder; diff --git a/cpp/src/arrow/compute/visibility.h b/cpp/src/arrow/compute/visibility.h new file mode 100644 index 00000000000..ae994bd2333 --- /dev/null +++ b/cpp/src/arrow/compute/visibility.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#if defined(_WIN32) || defined(__CYGWIN__) +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4251) +# else +# pragma GCC diagnostic ignored "-Wattributes" +# endif + +# ifdef ARROW_COMPUTE_STATIC +# define ARROW_COMPUTE_EXPORT +# elif defined(ARROW_COMPUTE_EXPORTING) +# define ARROW_COMPUTE_EXPORT __declspec(dllexport) +# else +# define ARROW_COMPUTE_EXPORT __declspec(dllimport) +# endif + +# define ARROW_COMPUTE_NO_EXPORT + +# if defined(_MSC_VER) +# pragma warning(pop) +# endif + +#else // Not Windows +# ifndef ARROW_COMPUTE_EXPORT +# define ARROW_COMPUTE_EXPORT __attribute__((visibility("default"))) +# endif +# ifndef ARROW_COMPUTE_NO_EXPORT +# define ARROW_COMPUTE_NO_EXPORT __attribute__((visibility("hidden"))) +# endif +#endif diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 3825364fa94..ec31d4b1ceb 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -36,7 +36,7 @@ #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" -#include "arrow/util/trie.h" +#include "arrow/util/trie_internal.h" #include "arrow/util/utf8_internal.h" #include "arrow/util/value_parsing.h" // IWYU pragma: keep @@ -470,6 +470,32 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder { std::vector parsers_; }; +// +// Value decoder for durations +// +struct DurationValueDecoder : public ValueDecoder { + using value_type = int64_t; + + explicit DurationValueDecoder(const std::shared_ptr& type, + const ConvertOptions& options) + : ValueDecoder(type, options), + concrete_type_(checked_cast(*type)), + string_converter_() {} + + Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { + TrimWhiteSpace(&data, &size); + if (ARROW_PREDICT_FALSE(!string_converter_.Convert( + concrete_type_, reinterpret_cast(data), size, out))) { + return GenericConversionError(type_, data, size); + } + return Status::OK(); + 
} + + protected: + const DurationType& concrete_type_; + arrow::internal::StringConverter string_converter_; +}; + ///////////////////////////////////////////////////////////////////////// // Concrete Converter hierarchy @@ -702,6 +728,7 @@ Result> Converter::Make(const std::shared_ptr)) CONVERTER_CASE(Type::BINARY, (PrimitiveConverter>)) @@ -785,6 +812,7 @@ Result> DictionaryConverter::Make( CONVERTER_CASE(Type::UINT64, UInt64Type, NumericValueDecoder) CONVERTER_CASE(Type::FLOAT, FloatType, NumericValueDecoder) CONVERTER_CASE(Type::DOUBLE, DoubleType, NumericValueDecoder) + CONVERTER_CASE(Type::DURATION, DurationType, DurationValueDecoder) REAL_CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder) CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryType, FixedSizeBinaryValueDecoder) diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index f4491d7441f..5dc078e7fd8 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -660,6 +660,68 @@ TEST(TimestampConversion, UserDefinedParsersWithZone) { AssertConversionError(type, {"01/02/1970,1970-01-03T00:00:00+0000\n"}, {0}, options); } +TEST(DurationConversion, Basics) { + auto type = duration(TimeUnit::SECOND); + AssertConversion( + type, {"1,120\n", "10800,345600\n", "-1,-120\n", "-10800,-345600\n"}, + {{1, 10800, -1, -10800}, {120, 345600, -120, -345600}}); + + type = duration(TimeUnit::MILLI); + AssertConversion( + type, {"1000,120000\n", "10800000,345600000\n", "500,0\n", "-1000,-120000\n"}, + {{1000, 10800000, 500, -1000}, {120000, 345600000, 0, -120000}}); + + type = duration(TimeUnit::MICRO); + AssertConversion( + type, {"1000000,500000\n", "120000000,10800000000\n", "-500000,-1000000\n"}, + {{1000000, 120000000, -500000}, {500000, 10800000000, -1000000}}); + + type = duration(TimeUnit::NANO); + AssertConversion( + type, + {"1000000000,500000000\n", "120000000000,10800000000000\n", "7000,9\n", + "-7000,-9\n"}, + {{1000000000, 
120000000000, 7000, -7000}, {500000000, 10800000000000, 9, -9}}); +} + +TEST(DurationConversion, Nulls) { + auto type = duration(TimeUnit::MILLI); + AssertConversion(type, {"1000,N/A\n", ",10800000\n"}, + {{1000, 0}, {0, 10800000}}, + {{true, false}, {false, true}}); +} + +TEST(DurationConversion, CustomNulls) { + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + + auto type = duration(TimeUnit::SECOND); + AssertConversion(type, {"1,xxx\n"}, {{1}, {0}}, + {{true}, {false}}, options); + + options.quoted_strings_can_be_null = false; + AssertConversionError(type, {"\"1\",\"xxx\"\n"}, {1}, options); + + AssertConversion(type, {"1,xxx\n", "zzz,120\n"}, + {{1, 0}, {0, 120}}, + {{true, false}, {false, true}}, options); +} + +TEST(DurationConversion, Whitespace) { + auto type = duration(TimeUnit::MILLI); + AssertConversion(type, + {" 1000 , 120000 \n", " 500 , 10800000 \n"}, + {{1000, 500}, {120000, 10800000}}); +} + +TEST(DurationConversion, Invalid) { + auto type = duration(TimeUnit::SECOND); + AssertConversionError(type, {"xyz\n"}, {0}); + AssertConversionError(type, {"123abc\n"}, {0}); + AssertConversionError(type, {"1.5\n"}, {0}); // floats not allowed + AssertConversionError(type, {"s1\n"}, {0}); // bad format +} + Decimal128 Dec128(std::string_view value) { Decimal128 dec; int32_t scale = 0; diff --git a/cpp/src/arrow/csv/meson.build b/cpp/src/arrow/csv/meson.build index 50850e12e0f..ed91d00aa0a 100644 --- a/cpp/src/arrow/csv/meson.build +++ b/cpp/src/arrow/csv/meson.build @@ -62,6 +62,11 @@ install_headers( subdir: 'arrow/csv', ) +arrow_csv_dep = declare_dependency( + include_directories: include_directories('.'), + dependencies: arrow_dep, +) +meson.override_dependency('arrow-csv', arrow_csv_dep) pkg.generate( filebase: 'arrow-csv', diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 7723dcedc61..10e55bf838c 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -209,6 +209,12 @@ 
struct ARROW_EXPORT WriteOptions { /// \brief Quoting style QuotingStyle quoting_style = QuotingStyle::Needed; + /// \brief Quoting style of header + /// + /// Note that `QuotingStyle::Needed` and `QuotingStyle::AllValid` have the same + /// effect of quoting all column names. + QuotingStyle quoting_header = QuotingStyle::Needed; + /// Create write options with default values static WriteOptions Defaults(); diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index b861a80a1ab..e83855336d2 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -144,7 +144,11 @@ class ValueDescWriter { protected: ValueDescWriter(MemoryPool* pool, int64_t values_capacity) - : values_size_(0), values_capacity_(values_capacity), status_(Status::OK()) { + : values_size_(0), + values_capacity_(values_capacity), + quoted_(false), + saved_values_size_(0), + status_(Status::OK()) { status_ &= AllocateResizableBuffer(values_capacity_ * sizeof(*values_), pool) .Value(&values_buffer_); if (status_.ok()) { diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc index 372de73b26b..5d14fe4b9b1 100644 --- a/cpp/src/arrow/csv/writer.cc +++ b/cpp/src/arrow/csv/writer.cc @@ -105,7 +105,8 @@ int64_t CountQuotes(std::string_view s) { // Matching quote pair character length. constexpr int64_t kQuoteCount = 2; -constexpr int64_t kQuoteDelimiterCount = kQuoteCount + /*end_char*/ 1; +// Delimiter character length. +constexpr int64_t kDelimiterCount = 1; // Interface for generating CSV data per column. // The intended usage is to iteratively call UpdateRowLengths for a column and @@ -176,6 +177,34 @@ char* Escape(std::string_view s, char* out) { return out; } +// Return the index of the first structural char in the input. A structural char +// is a character that needs quoting and/or escaping. 
+int64_t StopAtStructuralChar(const uint8_t* data, const int64_t buffer_size, + const char delimiter) { + int64_t offset = 0; +#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) + // _mm_cmpistrc gives slightly better performance than the naive approach, + // probably doesn't deserve the effort + using simd_batch = xsimd::make_sized_batch_t; + while ((offset + 16) <= buffer_size) { + const auto v = simd_batch::load_unaligned(data + offset); + if (xsimd::any((v == '\n') | (v == '\r') | (v == '"') | (v == delimiter))) { + break; + } + offset += 16; + } +#endif + while (offset < buffer_size) { + // error happened or remaining bytes to check + const char c = static_cast(data[offset]); + if (c == '\n' || c == '\r' || c == '"' || c == delimiter) { + break; + } + ++offset; + } + return offset; +} + // Populator used for non-string/binary types, or when unquoted strings/binary types are // desired. It assumes the strings in the casted array do not require quoting or escaping. // This is enforced by setting reject_values_with_quotes to true, in which case a check @@ -268,35 +297,18 @@ class UnquotedColumnPopulator : public ColumnPopulator { // scan the underlying string array buffer as a single big string const uint8_t* const data = array.raw_data() + array.value_offset(0); const int64_t buffer_size = array.total_values_length(); - int64_t offset = 0; -#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) - // _mm_cmpistrc gives slightly better performance than the naive approach, - // probably doesn't deserve the effort - using simd_batch = xsimd::make_sized_batch_t; - while ((offset + 16) <= buffer_size) { - const auto v = simd_batch::load_unaligned(data + offset); - if (xsimd::any((v == '\n') | (v == '\r') | (v == '"') | (v == delimiter))) { - break; - } - offset += 16; - } -#endif - while (offset < buffer_size) { - // error happened or remaining bytes to check - const char c = static_cast(data[offset]); - if (c == '\n' || c == '\r' || c == '"' || c == 
delimiter) { - // extract the offending string from array per offset - const auto* offsets = array.raw_value_offsets(); - const auto index = - std::upper_bound(offsets, offsets + array.length(), offset + offsets[0]) - - offsets; - DCHECK_GT(index, 0); - return Status::Invalid( - "CSV values may not contain structural characters if quoting style is " - "\"None\". See RFC4180. Invalid value: ", - array.GetView(index - 1)); - } - ++offset; + if (int64_t offset = StopAtStructuralChar(data, buffer_size, delimiter); + offset != buffer_size) { + // extract the offending string from array per offset + const auto* offsets = array.raw_value_offsets(); + const auto index = + std::upper_bound(offsets, offsets + array.length(), offset + offsets[0]) - + offsets; + DCHECK_GT(index, 0); + return Status::Invalid( + "CSV values may not contain structural characters if quoting style is " + "\"None\". See RFC4180. Invalid value: ", + array.GetView(index - 1)); } return Status::OK(); } @@ -578,26 +590,62 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { return Status::OK(); } - int64_t CalculateHeaderSize() const { + int64_t CalculateHeaderSize(QuotingStyle quoting_style) const { int64_t header_length = 0; for (int col = 0; col < schema_->num_fields(); col++) { const std::string& col_name = schema_->field(col)->name(); header_length += col_name.size(); - header_length += CountQuotes(col_name); + switch (quoting_style) { + case QuotingStyle::None: + break; + case QuotingStyle::Needed: + case QuotingStyle::AllValid: + header_length += CountQuotes(col_name); + break; + } + } + header_length += kDelimiterCount * (schema_->num_fields() - 1) + options_.eol.size(); + switch (quoting_style) { + case QuotingStyle::None: + break; + case QuotingStyle::Needed: + case QuotingStyle::AllValid: + header_length += kQuoteCount * schema_->num_fields(); + break; } - // header_length + ([quotes + ','] * schema_->num_fields()) + (eol - ',') - return header_length + (kQuoteDelimiterCount * 
schema_->num_fields()) + - (options_.eol.size() - 1); + return header_length; } Status WriteHeader() { // Only called once, as part of initialization - RETURN_NOT_OK(data_buffer_->Resize(CalculateHeaderSize(), /*shrink_to_fit=*/false)); + RETURN_NOT_OK(data_buffer_->Resize(CalculateHeaderSize(options_.quoting_header), + /*shrink_to_fit=*/false)); char* next = reinterpret_cast(data_buffer_->mutable_data()); for (int col = 0; col < schema_->num_fields(); ++col) { - *next++ = '"'; - next = Escape(schema_->field(col)->name(), next); - *next++ = '"'; + const std::string& col_name = schema_->field(col)->name(); + switch (options_.quoting_header) { + case QuotingStyle::None: + if (StopAtStructuralChar(reinterpret_cast(col_name.c_str()), + col_name.length(), options_.delimiter) != + static_cast(col_name.length())) { + return Status::Invalid( + "CSV header may not contain structural characters if quoting style is " + "\"None\". See RFC4180. Invalid value: ", + col_name); + } + memcpy(next, col_name.data(), col_name.size()); + next += col_name.size(); + break; + case QuotingStyle::Needed: + case QuotingStyle::AllValid: + // QuotingStyle::Needed is defined as always quoting string/binary data, + // regardless of whether it contains structural chars. + // We use consistent semantics for header names, which are strings. 
+ *next++ = '"'; + next = Escape(schema_->field(col)->name(), next); + *next++ = '"'; + break; + } if (col != schema_->num_fields() - 1) { *next++ = options_.delimiter; } diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc index 4fccf4ddbbb..783d7631ab3 100644 --- a/cpp/src/arrow/csv/writer_test.cc +++ b/cpp/src/arrow/csv/writer_test.cc @@ -61,10 +61,12 @@ WriteOptions DefaultTestOptions(bool include_header = false, const std::string& null_string = "", QuotingStyle quoting_style = QuotingStyle::Needed, const std::string& eol = "\n", char delimiter = ',', - int batch_size = 5) { + int batch_size = 5, + QuotingStyle quoting_header = QuotingStyle::Needed) { WriteOptions options; options.batch_size = batch_size; options.include_header = include_header; + options.quoting_header = quoting_header; options.null_string = null_string; options.eol = eol; options.quoting_style = quoting_style; @@ -91,6 +93,17 @@ std::vector GenerateTestCases() { auto dummy_schema = schema({field("a", uint8())}); std::string dummy_batch_data = R"([{"a": null}])"; + auto header_without_structural_charaters = + schema({field("a ", uint64()), field("b", int32())}); + std::string expected_header_without_structural_charaters = + std::string(R"(a ,b)") + "\n"; + auto expected_status_no_quotes_with_structural_in_header = [](const char* header) { + return Status::Invalid( + "CSV header may not contain structural characters if quoting " + "style is \"None\". See RFC4180. Invalid value: ", + header); + }; + // Schema to test various types. 
auto abc_schema = schema({ field("a", uint64()), @@ -279,7 +292,20 @@ std::vector GenerateTestCases() { {schema_custom_delimiter, batch_custom_delimiter, DefaultTestOptions(/*include_header=*/false, /*null_string=*/"", QuotingStyle::Needed, /*eol=*/";", /*delimiter=*/';'), - /*expected_output*/ "", expected_status_illegal_delimiter(';')}}; + /*expected_output*/ "", expected_status_illegal_delimiter(';')}, + {header_without_structural_charaters, "[]", + DefaultTestOptions(/*include_header=*/true, /*null_string=*/"", + QuotingStyle::Needed, /*eol=*/"\n", + /*delimiter=*/',', /*batch_size=*/5, + /*quoting_header=*/QuotingStyle::None), + expected_header_without_structural_charaters}, + {abc_schema, "[]", + DefaultTestOptions(/*include_header=*/true, /*null_string=*/"", + QuotingStyle::Needed, /*eol=*/"\n", + /*delimiter=*/',', /*batch_size=*/5, + /*quoting_header=*/QuotingStyle::None), + "", expected_status_no_quotes_with_structural_in_header("b\"")}, + }; } class TestWriteCSV : public ::testing::TestWithParam { diff --git a/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in b/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in index 4573ac37185..381d819276b 100644 --- a/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in +++ b/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in @@ -30,7 +30,12 @@ set(ARROW_DATASET_REQUIRED_DEPENDENCIES "@ARROW_DATASET_REQUIRED_DEPENDENCIES@") include(CMakeFindDependencyMacro) foreach(dependency ${ARROW_DATASET_REQUIRED_DEPENDENCIES}) - find_dependency(${dependency}) + # Currently all dependencies in ARROW_DATASET_REQUIRED_DEPENDENCIES + # are created by Apache Arrow C++. So we can use CONFIG for all + # dependencies. If ARROW_DATASET_REQUIRED_DEPENDENCIES may have + # dependencies not created by Apache Arrow C++, we need to revisit + # this CONFIG. 
+ find_dependency(${dependency} CONFIG) endforeach() include("${CMAKE_CURRENT_LIST_DIR}/ArrowDatasetTargets.cmake") diff --git a/cpp/src/arrow/dataset/CMakeLists.txt b/cpp/src/arrow/dataset/CMakeLists.txt index 29d8c3cb59f..fa6875527db 100644 --- a/cpp/src/arrow/dataset/CMakeLists.txt +++ b/cpp/src/arrow/dataset/CMakeLists.txt @@ -40,8 +40,8 @@ set(ARROW_DATASET_SRCS scanner.cc scan_node.cc) -set(ARROW_DATASET_PKG_CONFIG_REQUIRES "arrow-acero") -set(ARROW_DATASET_REQUIRED_DEPENDENCIES Arrow ArrowAcero) +set(ARROW_DATASET_PKG_CONFIG_REQUIRES "arrow-acero arrow-compute") +set(ARROW_DATASET_REQUIRED_DEPENDENCIES Arrow ArrowCompute ArrowAcero) if(ARROW_PARQUET) string(APPEND ARROW_DATASET_PKG_CONFIG_REQUIRES " parquet") list(APPEND ARROW_DATASET_REQUIRED_DEPENDENCIES Parquet) @@ -94,8 +94,6 @@ add_arrow_lib(arrow_dataset ARROW_DATASET_LIBRARIES SOURCES ${ARROW_DATASET_SRCS} - PRECOMPILED_HEADERS - "$<$:arrow/dataset/pch.h>" PRIVATE_INCLUDES ${ARROW_DATASET_PRIVATE_INCLUDES} SHARED_LINK_LIBS @@ -193,8 +191,7 @@ if(ARROW_JSON) endif() if(ARROW_ORC) - add_arrow_dataset_test(file_orc_test EXTRA_LINK_LIBS ${ARROW_DATASET_TEST_LINK_LIBS} - orc::orc) + add_arrow_dataset_test(file_orc_test EXTRA_LINK_LIBS ${ARROW_DATASET_TEST_LINK_LIBS}) endif() if(ARROW_PARQUET) @@ -208,15 +205,39 @@ if(ARROW_PARQUET) endif() endif() -if(ARROW_BUILD_BENCHMARKS) - add_arrow_benchmark(file_benchmark PREFIX "arrow-dataset") - add_arrow_benchmark(scanner_benchmark PREFIX "arrow-dataset") +function(add_arrow_dataset_benchmark REL_BENCHMARK_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args EXTRA_LINK_LIBS) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) - if(ARROW_BUILD_STATIC) - target_link_libraries(arrow-dataset-file-benchmark PUBLIC arrow_dataset_static) - target_link_libraries(arrow-dataset-scanner-benchmark PUBLIC arrow_dataset_static) + if(ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) else() - 
target_link_libraries(arrow-dataset-file-benchmark PUBLIC arrow_dataset_shared) - target_link_libraries(arrow-dataset-scanner-benchmark PUBLIC arrow_dataset_shared) + set(PREFIX "arrow-dataset") endif() -endif() + + if(ARROW_TEST_LINKAGE STREQUAL "static") + set(EXTRA_LINK_LIBS arrow_dataset_static) + else() + set(EXTRA_LINK_LIBS arrow_dataset_shared) + endif() + if(ARG_EXTRA_LINK_LIBS) + list(APPEND EXTRA_LINK_LIBS ${ARG_EXTRA_LINK_LIBS}) + endif() + + # Dataset benchmarks require compute kernels initialization. + add_arrow_compute_benchmark(${REL_BENCHMARK_NAME} + PREFIX + ${PREFIX} + EXTRA_LINK_LIBS + ${EXTRA_LINK_LIBS} + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() + +add_arrow_dataset_benchmark(file_benchmark) +add_arrow_dataset_benchmark(scanner_benchmark) diff --git a/cpp/src/arrow/dataset/dataset_writer.cc b/cpp/src/arrow/dataset/dataset_writer.cc index 4ae15a7b3b9..f5104efb70b 100644 --- a/cpp/src/arrow/dataset/dataset_writer.cc +++ b/cpp/src/arrow/dataset/dataset_writer.cc @@ -29,7 +29,7 @@ #include "arrow/table.h" #include "arrow/util/future.h" #include "arrow/util/logging_internal.h" -#include "arrow/util/map.h" +#include "arrow/util/map_internal.h" #include "arrow/util/string.h" #include "arrow/util/tracing_internal.h" @@ -53,18 +53,19 @@ class Throttle { bool Unthrottled() const { return max_value_ <= 0; } - Future<> Acquire(uint64_t values) { + std::optional> Acquire(uint64_t values) { if (Unthrottled()) { return Future<>::MakeFinished(); } std::lock_guard lg(mutex_); - if (values + current_value_ > max_value_) { + if (current_value_ >= max_value_) { in_waiting_ = values; backpressure_ = Future<>::Make(); - } else { - current_value_ += values; + return backpressure_; } - return backpressure_; + current_value_ += values; + DCHECK(backpressure_.is_finished()); + return std::nullopt; } void Release(uint64_t values) { @@ -75,7 +76,7 @@ class Throttle { { std::lock_guard lg(mutex_); current_value_ -= values; - if (in_waiting_ > 0 && in_waiting_ + 
current_value_ <= max_value_) { + if (in_waiting_ > 0 && current_value_ < max_value_) { in_waiting_ = 0; to_complete = backpressure_; } @@ -131,28 +132,38 @@ Result> OpenWriter( {write_options.filesystem, filename}); } -class DatasetWriterFileQueue { +class DatasetWriterFileQueue + : public std::enable_shared_from_this { public: explicit DatasetWriterFileQueue(const std::shared_ptr& schema, const FileSystemDatasetWriteOptions& options, - DatasetWriterState* writer_state) - : options_(options), schema_(schema), writer_state_(writer_state) {} + std::shared_ptr writer_state) + : options_(options), schema_(schema), writer_state_(std::move(writer_state)) {} - void Start(util::AsyncTaskScheduler* file_tasks, const std::string& filename) { - file_tasks_ = file_tasks; + void Start(std::unique_ptr file_tasks, + std::string filename) { + file_tasks_ = std::move(file_tasks); // Because the scheduler runs one task at a time we know the writer will // be opened before any attempt to write file_tasks_->AddSimpleTask( - [this, filename] { - Executor* io_executor = options_.filesystem->io_context().executor(); - return DeferNotOk(io_executor->Submit([this, filename]() { - ARROW_ASSIGN_OR_RAISE(writer_, OpenWriter(options_, schema_, filename)); + [self = shared_from_this(), filename = std::move(filename)] { + Executor* io_executor = self->options_.filesystem->io_context().executor(); + return DeferNotOk(io_executor->Submit([self, filename = std::move(filename)]() { + ARROW_ASSIGN_OR_RAISE(self->writer_, + OpenWriter(self->options_, self->schema_, filename)); return Status::OK(); })); }, "DatasetWriter::OpenWriter"sv); } + void Abort() { + // The scheduler may be keeping this object alive through shared_ptr references + // in async closures. Make sure we break any reference cycles by losing our + // reference to the scheduler. 
+ file_tasks_.reset(); + } + Result> PopStagedBatch() { std::vector> batches_to_write; uint64_t num_rows = 0; @@ -184,7 +195,7 @@ class DatasetWriterFileQueue { void ScheduleBatch(std::shared_ptr batch) { file_tasks_->AddSimpleTask( - [self = this, batch = std::move(batch)]() { + [self = shared_from_this(), batch = std::move(batch)]() { return self->WriteNext(std::move(batch)); }, "DatasetWriter::WriteBatch"sv); @@ -217,13 +228,15 @@ class DatasetWriterFileQueue { Status Finish() { writer_state_->staged_rows_count -= rows_currently_staged_; while (!staged_batches_.empty()) { - RETURN_NOT_OK(PopAndDeliverStagedBatch()); + RETURN_NOT_OK(PopAndDeliverStagedBatch().status().OrElse( + [&](auto&&) { file_tasks_.reset(); })); } // At this point all write tasks have been added. Because the scheduler // is a 1-task FIFO we know this task will run at the very end and can // add it now. - file_tasks_->AddSimpleTask([this] { return DoFinish(); }, + file_tasks_->AddSimpleTask([self = shared_from_this()] { return self->DoFinish(); }, "DatasetWriter::FinishFile"sv); + file_tasks_.reset(); return Status::OK(); } @@ -231,7 +244,7 @@ class DatasetWriterFileQueue { Future<> WriteNext(std::shared_ptr next) { // May want to prototype / measure someday pushing the async write down further return DeferNotOk(options_.filesystem->io_context().executor()->Submit( - [self = this, batch = std::move(next)]() { + [self = shared_from_this(), batch = std::move(next)]() { int64_t rows_to_release = batch->num_rows(); Status status = self->writer_->Write(batch); self->writer_state_->rows_in_flight_throttle.Release(rows_to_release); @@ -244,21 +257,22 @@ class DatasetWriterFileQueue { std::lock_guard lg(writer_state_->visitors_mutex); RETURN_NOT_OK(options_.writer_pre_finish(writer_.get())); } - return writer_->Finish().Then([this]() { - std::lock_guard lg(writer_state_->visitors_mutex); - return options_.writer_post_finish(writer_.get()); - }); + return writer_->Finish().Then( + [self = 
shared_from_this(), writer_post_finish = options_.writer_post_finish]() { + std::lock_guard lg(self->writer_state_->visitors_mutex); + return writer_post_finish(self->writer_.get()); + }); } const FileSystemDatasetWriteOptions& options_; const std::shared_ptr& schema_; - DatasetWriterState* writer_state_; + std::shared_ptr writer_state_; std::shared_ptr writer_; // Batches are accumulated here until they are large enough to write out at which // point they are merged together and added to write_queue_ std::deque> staged_batches_; uint64_t rows_currently_staged_ = 0; - util::AsyncTaskScheduler* file_tasks_ = nullptr; + std::unique_ptr file_tasks_; }; struct WriteTask { @@ -266,18 +280,25 @@ struct WriteTask { uint64_t num_rows; }; -class DatasetWriterDirectoryQueue { +class DatasetWriterDirectoryQueue + : public std::enable_shared_from_this { public: DatasetWriterDirectoryQueue(util::AsyncTaskScheduler* scheduler, std::string directory, std::string prefix, std::shared_ptr schema, const FileSystemDatasetWriteOptions& write_options, - DatasetWriterState* writer_state) + std::shared_ptr writer_state) : scheduler_(std::move(scheduler)), directory_(std::move(directory)), prefix_(std::move(prefix)), schema_(std::move(schema)), write_options_(write_options), - writer_state_(writer_state) {} + writer_state_(std::move(writer_state)) {} + + ~DatasetWriterDirectoryQueue() { + if (latest_open_file_) { + latest_open_file_->Abort(); + } + } Result> NextWritableChunk( std::shared_ptr batch, std::shared_ptr* remainder, @@ -330,32 +351,27 @@ class DatasetWriterDirectoryQueue { Status FinishCurrentFile() { if (latest_open_file_) { - ARROW_RETURN_NOT_OK(latest_open_file_->Finish()); - latest_open_file_tasks_.reset(); - latest_open_file_ = nullptr; + auto file = std::move(latest_open_file_); + ARROW_RETURN_NOT_OK(file->Finish()); } rows_written_ = 0; return GetNextFilename().Value(¤t_filename_); } Status OpenFileQueue(const std::string& filename) { - auto file_queue = - 
std::make_unique(schema_, write_options_, writer_state_); - latest_open_file_ = file_queue.get(); - // Create a dedicated throttle for write jobs to this file and keep it alive until we - // are finished and have closed the file. - auto file_finish_task = [this, file_queue = std::move(file_queue)] { - writer_state_->open_files_throttle.Release(1); + latest_open_file_.reset( + new DatasetWriterFileQueue(schema_, write_options_, writer_state_)); + auto file_finish_task = [self = shared_from_this()] { + self->writer_state_->open_files_throttle.Release(1); return Status::OK(); }; - latest_open_file_tasks_ = util::MakeThrottledAsyncTaskGroup( - scheduler_, 1, /*queue=*/nullptr, std::move(file_finish_task)); + auto file_tasks = util::MakeThrottledAsyncTaskGroup(scheduler_, 1, /*queue=*/nullptr, + std::move(file_finish_task)); if (init_future_.is_valid()) { - latest_open_file_tasks_->AddSimpleTask( - [init_future = init_future_]() { return init_future; }, - "DatasetWriter::WaitForDirectoryInit"sv); + file_tasks->AddSimpleTask([init_future = init_future_]() { return init_future; }, + "DatasetWriter::WaitForDirectoryInit"sv); } - latest_open_file_->Start(latest_open_file_tasks_.get(), filename); + latest_open_file_->Start(std::move(file_tasks), filename); return Status::OK(); } @@ -398,14 +414,14 @@ class DatasetWriterDirectoryQueue { "DatasetWriter::InitializeDirectory"sv); } - static Result> Make( + static Result> Make( util::AsyncTaskScheduler* scheduler, const FileSystemDatasetWriteOptions& write_options, - DatasetWriterState* writer_state, std::shared_ptr schema, + std::shared_ptr writer_state, std::shared_ptr schema, std::string directory, std::string prefix) { - auto dir_queue = std::make_unique( + auto dir_queue = std::make_shared( scheduler, std::move(directory), std::move(prefix), std::move(schema), - write_options, writer_state); + write_options, std::move(writer_state)); dir_queue->PrepareDirectory(); ARROW_ASSIGN_OR_RAISE(dir_queue->current_filename_, 
dir_queue->GetNextFilename()); return dir_queue; @@ -413,26 +429,31 @@ class DatasetWriterDirectoryQueue { Status Finish() { if (latest_open_file_) { - ARROW_RETURN_NOT_OK(latest_open_file_->Finish()); - latest_open_file_tasks_.reset(); - latest_open_file_ = nullptr; + auto file = std::move(latest_open_file_); + ARROW_RETURN_NOT_OK(file->Finish()); } used_filenames_.clear(); return Status::OK(); } + void Abort() { + if (latest_open_file_) { + latest_open_file_->Abort(); + latest_open_file_.reset(); + } + } + private: util::AsyncTaskScheduler* scheduler_ = nullptr; std::string directory_; std::string prefix_; std::shared_ptr schema_; const FileSystemDatasetWriteOptions& write_options_; - DatasetWriterState* writer_state_; + std::shared_ptr writer_state_; Future<> init_future_; std::string current_filename_; std::unordered_set used_filenames_; - DatasetWriterFileQueue* latest_open_file_ = nullptr; - std::unique_ptr latest_open_file_tasks_; + std::shared_ptr latest_open_file_; uint64_t rows_written_ = 0; uint32_t file_counter_ = 0; }; @@ -520,11 +541,26 @@ class DatasetWriter::DatasetWriterImpl { return Status::OK(); })), write_options_(std::move(write_options)), - writer_state_(max_rows_queued, write_options_.max_open_files, - CalculateMaxRowsStaged(max_rows_queued)), + writer_state_(std::make_shared( + max_rows_queued, write_options_.max_open_files, + CalculateMaxRowsStaged(max_rows_queued))), pause_callback_(std::move(pause_callback)), resume_callback_(std::move(resume_callback)) {} + ~DatasetWriterImpl() { + // In case something went wrong (e.g. an IO error occurred), some tasks + // may be left dangling in a ThrottledAsyncTaskScheduler and that may + // lead to memory leaks via shared_ptr reference cycles (this can show up + // in some unit tests under Valgrind). + // To prevent this, explicitly break reference cycles at DatasetWriter + // destruction. 
+ // The alternative is to use weak_from_this() thoroughly in async callbacks, + // but that makes for less readable code. + for (const auto& directory_queue : directory_queues_) { + directory_queue.second->Abort(); + } + } + Future<> WriteAndCheckBackpressure(std::shared_ptr batch, const std::string& directory, const std::string& prefix) { @@ -592,8 +628,10 @@ class DatasetWriter::DatasetWriterImpl { "DatasetWriter::FinishAll"sv); // Reset write_tasks_ to signal that we are done adding tasks, this will allow // us to invoke the finish callback once the tasks wrap up. - std::lock_guard lg(mutex_); - write_tasks_.reset(); + { + std::lock_guard lg(mutex_); + write_tasks_.reset(); + } } protected: @@ -621,11 +659,11 @@ class DatasetWriter::DatasetWriterImpl { &directory_queues_, directory + prefix, [this, &batch, &directory, &prefix](const std::string& key) { return DatasetWriterDirectoryQueue::Make(scheduler_, write_options_, - &writer_state_, batch->schema(), + writer_state_, batch->schema(), directory, prefix); })); std::shared_ptr dir_queue = dir_queue_itr->second; - Future<> backpressure; + std::optional> backpressure; while (batch) { // Keep opening new files until batch is done. 
std::shared_ptr remainder; @@ -643,16 +681,16 @@ class DatasetWriter::DatasetWriterImpl { continue; } backpressure = - writer_state_.rows_in_flight_throttle.Acquire(next_chunk->num_rows()); - if (!backpressure.is_finished()) { + writer_state_->rows_in_flight_throttle.Acquire(next_chunk->num_rows()); + if (backpressure) { EVENT_ON_CURRENT_SPAN("DatasetWriter::Backpressure::TooManyRowsQueued"); break; } if (will_open_file) { - backpressure = writer_state_.open_files_throttle.Acquire(1); - if (!backpressure.is_finished()) { + backpressure = writer_state_->open_files_throttle.Acquire(1); + if (backpressure) { EVENT_ON_CURRENT_SPAN("DatasetWriter::Backpressure::TooManyOpenFiles"); - writer_state_.rows_in_flight_throttle.Release(next_chunk->num_rows()); + writer_state_->rows_in_flight_throttle.Release(next_chunk->num_rows()); RETURN_NOT_OK(TryCloseLargestFile()); break; } @@ -664,7 +702,7 @@ class DatasetWriter::DatasetWriterImpl { // // `open_files_throttle` will be handed by `DatasetWriterDirectoryQueue` // so we don't need to release it here. 
- writer_state_.rows_in_flight_throttle.Release(next_chunk->num_rows()); + writer_state_->rows_in_flight_throttle.Release(next_chunk->num_rows()); return s; } batch = std::move(remainder); @@ -674,7 +712,8 @@ class DatasetWriter::DatasetWriterImpl { } if (batch) { - return backpressure.Then([this, batch, directory, prefix] { + DCHECK(backpressure); + return backpressure->Then([this, batch, directory, prefix] { return DoWriteRecordBatch(batch, directory, prefix); }); } @@ -685,7 +724,7 @@ class DatasetWriter::DatasetWriterImpl { std::unique_ptr write_tasks_; Future<> finish_fut_ = Future<>::Make(); FileSystemDatasetWriteOptions write_options_; - DatasetWriterState writer_state_; + std::shared_ptr writer_state_; std::function pause_callback_; std::function resume_callback_; // Map from directory + prefix to the queue for that directory diff --git a/cpp/src/arrow/dataset/dataset_writer_test.cc b/cpp/src/arrow/dataset/dataset_writer_test.cc index 32ae8d7ee12..2f34c21aec9 100644 --- a/cpp/src/arrow/dataset/dataset_writer_test.cc +++ b/cpp/src/arrow/dataset/dataset_writer_test.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "arrow/array/builder_primitive.h" @@ -105,7 +106,8 @@ class DatasetWriterTestFixture : public testing::Test { uint64_t max_rows = kDefaultDatasetWriterMaxRowsQueued) { EXPECT_OK_AND_ASSIGN(auto dataset_writer, DatasetWriter::Make( - write_options_, scheduler_, [] {}, [] {}, [] {}, max_rows)); + write_options_, scheduler_, [this] { paused_ = true; }, + [this] { paused_ = false; }, [] {}, max_rows)); return dataset_writer; } @@ -231,6 +233,7 @@ class DatasetWriterTestFixture : public testing::Test { util::AsyncTaskScheduler* scheduler_; Future<> scheduler_finished_; FileSystemDatasetWriteOptions write_options_; + std::atomic_bool paused_{false}; uint64_t counter_ = 0; }; @@ -265,6 +268,49 @@ TEST_F(DatasetWriterTestFixture, DirectoryCreateFails) { ASSERT_FINISHES_AND_RAISES(Invalid, scheduler_finished_); } 
+TEST_F(DatasetWriterTestFixture, BatchGreaterThanMaxRowsQueued) { + auto dataset_writer = MakeDatasetWriter(/*max_rows=*/10); + dataset_writer->WriteRecordBatch(MakeBatch(35), ""); + EndWriterChecked(dataset_writer.get()); + AssertCreatedData({{"testdir/chunk-0.arrow", 0, 35}}); + ASSERT_EQ(paused_, false); +} + +TEST_F(DatasetWriterTestFixture, BatchWriteConcurrent) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif + auto dataset_writer = MakeDatasetWriter(/*max_rows=*/5); + + for (int threads = 1; threads < 5; threads++) { + for (int iter = 2; iter <= 256; iter *= 2) { + for (int batch = 2; batch <= 64; batch *= 2) { + std::vector workers; + for (int i = 0; i < threads; ++i) { + workers.push_back(std::thread([&, i = i]() { + for (int j = 0; j < iter; ++j) { + while (paused_) { + SleepABit(); + } + dataset_writer->WriteRecordBatch(MakeBatch(0, batch + i + 10 * j), ""); + } + })); + } + for (std::thread& t : workers) { + if (t.joinable()) { + t.join(); + } + while (paused_) { + SleepABit(); + } + } + } + } + } + EndWriterChecked(dataset_writer.get()); + ASSERT_EQ(paused_, false); +} + TEST_F(DatasetWriterTestFixture, MaxRowsOneWrite) { write_options_.max_rows_per_file = 10; write_options_.max_rows_per_group = 10; @@ -275,6 +321,7 @@ TEST_F(DatasetWriterTestFixture, MaxRowsOneWrite) { {"testdir/chunk-1.arrow", 10, 10}, {"testdir/chunk-2.arrow", 20, 10}, {"testdir/chunk-3.arrow", 30, 5}}); + ASSERT_EQ(paused_, false); } TEST_F(DatasetWriterTestFixture, MaxRowsOneWriteBackpresure) { diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index b502d1d1947..5686e50e3cb 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -39,6 +39,22 @@ using internal::StartsWith; namespace dataset { +namespace { + +bool StartsWithAnyOf(const std::string& path, const std::vector& prefixes) { + if (prefixes.empty()) { + return false; + } + + auto parts = 
fs::internal::SplitAbstractPath(path); + return std::any_of(parts.cbegin(), parts.cend(), [&](std::string_view part) { + return std::any_of(prefixes.cbegin(), prefixes.cend(), + [&](std::string_view prefix) { return StartsWith(part, prefix); }); + }); +} + +} // namespace + DatasetFactory::DatasetFactory() : root_partition_(compute::literal(true)) {} Result> DatasetFactory::Inspect(InspectOptions options) { @@ -157,18 +173,6 @@ Result> FileSystemDatasetFactory::Make( std::move(format), std::move(options))); } -bool StartsWithAnyOf(const std::string& path, const std::vector& prefixes) { - if (prefixes.empty()) { - return false; - } - - auto parts = fs::internal::SplitAbstractPath(path); - return std::any_of(parts.cbegin(), parts.cend(), [&](std::string_view part) { - return std::any_of(prefixes.cbegin(), prefixes.cend(), - [&](std::string_view prefix) { return StartsWith(part, prefix); }); - }); -} - Result> FileSystemDatasetFactory::Make( std::shared_ptr filesystem, fs::FileSelector selector, std::shared_ptr format, FileSystemFactoryOptions options) { diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index b084581b1af..ccc79dfa9bf 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -46,7 +46,7 @@ #include "arrow/util/iterator.h" #include "arrow/util/logging_internal.h" #include "arrow/util/macros.h" -#include "arrow/util/map.h" +#include "arrow/util/map_internal.h" #include "arrow/util/string.h" #include "arrow/util/task_group.h" #include "arrow/util/tracing_internal.h" @@ -473,9 +473,14 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio WriteNodeOptions write_node_options(write_options); write_node_options.custom_schema = custom_schema; + // preserve existing order across fragments by setting require_sequenced_output=true + bool require_sequenced_output = write_node_options.write_options.preserve_order; + // preserve existing order of sequenced scan output 
by setting implicit_order=true + bool implicit_ordering = write_node_options.write_options.preserve_order; acero::Declaration plan = acero::Declaration::Sequence({ - {"scan", ScanNodeOptions{dataset, scanner->options()}}, + {"scan", ScanNodeOptions{dataset, scanner->options(), require_sequenced_output, + implicit_ordering}}, {"filter", acero::FilterNodeOptions{scanner->options()->filter}}, {"project", acero::ProjectNodeOptions{std::move(exprs), std::move(names)}}, {"write", std::move(write_node_options)}, @@ -484,6 +489,8 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio return acero::DeclarationToStatus(std::move(plan), scanner->options()->use_threads); } +namespace { + Result MakeWriteNode(acero::ExecPlan* plan, std::vector inputs, const acero::ExecNodeOptions& options) { @@ -541,14 +548,17 @@ Result MakeWriteNode(acero::ExecPlan* plan, ARROW_ASSIGN_OR_RAISE( auto node, + // to preserve order explicitly, sequence the exec batches + // this requires exec batch index to be set upstream (e.g. by SourceNode) acero::MakeExecNode("consuming_sink", plan, std::move(inputs), - acero::ConsumingSinkNodeOptions{std::move(consumer)})); + acero::ConsumingSinkNodeOptions{ + std::move(consumer), + {}, + /*sequence_output=*/write_options.preserve_order})); return node; } -namespace { - class TeeNode : public acero::MapNode { public: TeeNode(acero::ExecPlan* plan, std::vector inputs, diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 46fc8ebc40d..e13c1312a47 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -399,6 +399,10 @@ struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { /// Partitioning used to generate fragment paths. std::shared_ptr partitioning; + /// If true the order of rows in the dataset is preserved when writing with + /// multiple threads. This may cause notable performance degradation. 
+ bool preserve_order = false; + /// Maximum number of partitions any batch may be written into, default is 1K. int max_partitions = 1024; diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc index 5691e806ccb..cede2681070 100644 --- a/cpp/src/arrow/dataset/file_csv.cc +++ b/cpp/src/arrow/dataset/file_csv.cc @@ -53,6 +53,10 @@ using internal::SerialExecutor; namespace dataset { +namespace { + +using RecordBatchGenerator = std::function>()>; + struct CsvInspectedFragment : public InspectedFragment { CsvInspectedFragment(std::vector column_names, std::shared_ptr input_stream, int64_t num_bytes) @@ -142,8 +146,6 @@ class CsvFileScanner : public FragmentScanner { int scanned_so_far_ = 0; }; -using RecordBatchGenerator = std::function>()>; - Result> GetOrderedColumnNames( const csv::ReadOptions& read_options, const csv::ParseOptions& parse_options, std::string_view first_block, MemoryPool* pool) { @@ -348,6 +350,8 @@ static RecordBatchGenerator GeneratorFromReader( return MakeFromFuture(std::move(gen_fut)); } +} // namespace + CsvFileFormat::CsvFileFormat() : FileFormat(std::make_shared()) {} bool CsvFileFormat::Equals(const FileFormat& format) const { @@ -420,6 +424,8 @@ Future> CsvFileFormat::BeginScan( exec_context->executor()); } +namespace { + Result> DoInspectFragment( const FileSource& source, const CsvFragmentScanOptions& csv_options, compute::ExecContext* exec_context) { @@ -442,6 +448,8 @@ Result> DoInspectFragment( source.Size()); } +} // namespace + Future> CsvFileFormat::InspectFragment( const FileSource& source, const FragmentScanOptions* format_options, compute::ExecContext* exec_context) const { diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index fd601b673c4..7ef60618700 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -36,6 +36,7 @@ #include "arrow/util/iterator.h" #include "arrow/util/logging_internal.h" #include 
"arrow/util/range.h" +#include "arrow/util/thread_pool.h" #include "arrow/util/tracing_internal.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/schema.h" @@ -116,6 +117,8 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties( } properties.set_coerce_int96_timestamp_unit( format.reader_options.coerce_int96_timestamp_unit); + properties.set_binary_type(format.reader_options.binary_type); + properties.set_list_type(format.reader_options.list_type); return properties; } @@ -132,6 +135,8 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties( arrow_properties.set_io_context( parquet_scan_options.arrow_reader_properties->io_context()); arrow_properties.set_use_threads(options.use_threads); + arrow_properties.set_arrow_extensions_enabled( + parquet_scan_options.arrow_reader_properties->get_arrow_extensions_enabled()); return arrow_properties; } @@ -441,7 +446,9 @@ bool ParquetFileFormat::Equals(const FileFormat& other) const { // FIXME implement comparison for decryption options return (reader_options.dict_columns == other_reader_options.dict_columns && reader_options.coerce_int96_timestamp_unit == - other_reader_options.coerce_int96_timestamp_unit); + other_reader_options.coerce_int96_timestamp_unit && + reader_options.binary_type == other_reader_options.binary_type && + reader_options.list_type == other_reader_options.list_type); } ParquetFileFormat::ParquetFileFormat(const parquet::ReaderProperties& reader_properties) @@ -636,10 +643,12 @@ Result ParquetFileFormat::ScanBatchesAsync( kParquetTypeName, options.get(), default_fragment_scan_options)); int batch_readahead = options->batch_readahead; int64_t rows_to_readahead = batch_readahead * options->batch_size; - ARROW_ASSIGN_OR_RAISE(auto generator, - reader->GetRecordBatchGenerator( - reader, row_groups, column_projection, - ::arrow::internal::GetCpuThreadPool(), rows_to_readahead)); + // Use the executor from scan options if provided. + auto cpu_executor = options->cpu_executor ? 
options->cpu_executor + : ::arrow::internal::GetCpuThreadPool(); + ARROW_ASSIGN_OR_RAISE(auto generator, reader->GetRecordBatchGenerator( + reader, row_groups, column_projection, + cpu_executor, rows_to_readahead)); RecordBatchGenerator sliced = SlicingGenerator(std::move(generator), options->batch_size); if (batch_readahead == 0) { @@ -1042,6 +1051,8 @@ static inline Result FileFromRowGroup( return filesystem->NormalizePath(std::move(path)); } +namespace { + Result> GetSchema( const parquet::FileMetaData& metadata, const parquet::ArrowReaderProperties& properties) { @@ -1051,6 +1062,8 @@ Result> GetSchema( return schema; } +} // namespace + Result> ParquetDatasetFactory::Make( const std::string& metadata_path, std::shared_ptr filesystem, std::shared_ptr format, ParquetFactoryOptions options) { diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index d496ff6eb87..1811a96bf98 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -90,6 +90,8 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { /// @{ std::unordered_set dict_columns; arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO; + Type::type binary_type = Type::BINARY; + Type::type list_type = Type::LIST; /// @} } reader_options; @@ -242,8 +244,7 @@ class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions { /// ScanOptions. std::shared_ptr reader_properties; /// Arrow reader properties. Not all properties are respected: batch_size comes from - /// ScanOptions. Additionally, dictionary columns come from - /// ParquetFileFormat::ReaderOptions::dict_columns. + /// ScanOptions. Additionally, other options come from ParquetFileFormat::ReaderOptions. 
std::shared_ptr arrow_reader_properties; /// A configuration structure that provides decryption properties for a dataset std::shared_ptr parquet_decryption_config = NULLPTR; diff --git a/cpp/src/arrow/dataset/file_parquet_encryption_test.cc b/cpp/src/arrow/dataset/file_parquet_encryption_test.cc index d2e1763c62f..91d813530d4 100644 --- a/cpp/src/arrow/dataset/file_parquet_encryption_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_encryption_test.cc @@ -34,6 +34,7 @@ #include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/util/future.h" +#include "arrow/util/secure_string.h" #include "arrow/util/thread_pool.h" #include "parquet/arrow/reader.h" #include "parquet/encryption/crypto_factory.h" @@ -41,12 +42,16 @@ #include "parquet/encryption/kms_client.h" #include "parquet/encryption/test_in_memory_kms.h" -constexpr std::string_view kFooterKeyMasterKey = "0123456789012345"; +using arrow::util::SecureString; + +const SecureString kFooterKeyMasterKey("0123456789012345"); constexpr std::string_view kFooterKeyMasterKeyId = "footer_key"; constexpr std::string_view kFooterKeyName = "footer_key"; -constexpr std::string_view kColumnMasterKey = "1234567890123450"; + +const SecureString kColumnMasterKey("1234567890123450"); constexpr std::string_view kColumnMasterKeyId = "col_key"; constexpr std::string_view kColumnKeyMapping = "col_key: a"; + constexpr std::string_view kBaseDir = ""; using arrow::internal::checked_pointer_cast; @@ -105,7 +110,7 @@ class DatasetEncryptionTestBase : public testing::TestWithParam key_map; + std::unordered_map key_map; key_map.emplace(kColumnMasterKeyId, kColumnMasterKey); key_map.emplace(kFooterKeyMasterKeyId, kFooterKeyMasterKey); @@ -145,7 +150,7 @@ class DatasetEncryptionTestBase : public testing::TestWithParam( - std::string(kFooterKeyMasterKey)) + kFooterKeyMasterKey) ->build(); auto writer_properties = std::make_unique() ->encryption(file_encryption_properties) @@ -230,7 +235,7 @@ class DatasetEncryptionTestBase : public 
testing::TestWithParam() - ->footer_key(std::string(kFooterKeyMasterKey)) + ->footer_key(kFooterKeyMasterKey) ->build(); parquet_scan_options->reader_properties->file_decryption_properties( file_decryption_properties); @@ -370,7 +375,7 @@ TEST_P(DatasetEncryptionTest, ReadSingleFile) { // Configure decryption keys via file decryption properties with static footer key. file_decryption_properties = std::make_unique() - ->footer_key(std::string(kFooterKeyMasterKey)) + ->footer_key(kFooterKeyMasterKey) ->build(); } auto reader_properties = parquet::default_reader_properties(); diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 95f00c195c2..696bda19359 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -17,6 +17,7 @@ #include "arrow/dataset/file_parquet.h" +#include #include #include #include @@ -25,6 +26,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/dataset/dataset_internal.h" #include "arrow/dataset/parquet_encryption_config.h" +#include "arrow/dataset/scanner.h" #include "arrow/dataset/test_util_internal.h" #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" @@ -133,6 +135,29 @@ class ParquetFormatHelper { } }; +class DelayedBufferReader : public ::arrow::io::BufferReader { + public: + explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer) + : ::arrow::io::BufferReader(buffer) {} + + ::arrow::Future> ReadAsync( + const ::arrow::io::IOContext& io_context, int64_t position, + int64_t nbytes) override { + read_async_count.fetch_add(1); + auto self = std::dynamic_pointer_cast(shared_from_this()); + return DeferNotOk(::arrow::io::internal::SubmitIO( + io_context, [self, position, nbytes]() -> Result> { + std::this_thread::sleep_for(std::chrono::seconds(1)); + return self->DoReadAt(position, nbytes); + })); + } + + std::atomic read_async_count{0}; +}; + +using CustomizeScanOptionsWithThreadPool = + std::function; + 
class TestParquetFileFormat : public FileFormatFixtureMixin { public: RecordBatchIterator Batches(Fragment* fragment) { @@ -183,6 +208,51 @@ class TestParquetFileFormat : public FileFormatFixtureMixin EXPECT_EQ(SingleBatch(parquet_fragment.get())->num_rows(), expected + 1); } } + + void TestMultithreadedRegression(CustomizeScanOptionsWithThreadPool customizer) { + auto reader = MakeGeneratedRecordBatch(schema({field("utf8", utf8())}), 10000, 100); + ASSERT_OK_AND_ASSIGN(auto buffer, ParquetFormatHelper::Write(reader.get())); + + std::vector> completes; + std::vector> pools; + + for (int idx = 0; idx < 2; ++idx) { + auto buffer_reader = std::make_shared(buffer); + auto source = std::make_shared(buffer_reader, buffer->size()); + auto fragment = MakeFragment(*source); + std::shared_ptr scanner; + + { + auto options = std::make_shared(); + ASSERT_OK_AND_ASSIGN(auto thread_pool, arrow::internal::ThreadPool::Make(1)); + pools.emplace_back(thread_pool); + customizer(*options, pools.back().get()); + auto fragment_scan_options = std::make_shared(); + fragment_scan_options->arrow_reader_properties->set_pre_buffer(true); + + options->fragment_scan_options = fragment_scan_options; + ScannerBuilder builder(ArithmeticDatasetFixture::schema(), fragment, options); + + ASSERT_OK(builder.UseThreads(true)); + ASSERT_OK(builder.BatchSize(10000)); + ASSERT_OK_AND_ASSIGN(scanner, builder.Finish()); + } + + ASSERT_OK_AND_ASSIGN(auto batch, scanner->Head(10000)); + [[maybe_unused]] auto fut = scanner->ScanBatchesUnorderedAsync(); + // Random ReadAsync calls, generate some futures to make the state machine + // more complex. 
+ for (int yy = 0; yy < 16; yy++) { + completes.emplace_back( + buffer_reader->ReadAsync(::arrow::io::IOContext(), 0, 1001)); + } + scanner = nullptr; + } + + for (auto& f : completes) { + f.Wait(); + } + } }; TEST_F(TestParquetFileFormat, InspectFailureWithRelevantError) { @@ -904,73 +974,25 @@ TEST(TestParquetStatistics, NoNullCount) { } } -class DelayedBufferReader : public ::arrow::io::BufferReader { - public: - explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer) - : ::arrow::io::BufferReader(buffer) {} - - ::arrow::Future> ReadAsync( - const ::arrow::io::IOContext& io_context, int64_t position, - int64_t nbytes) override { - read_async_count.fetch_add(1); - auto self = std::dynamic_pointer_cast(shared_from_this()); - return DeferNotOk(::arrow::io::internal::SubmitIO( - io_context, [self, position, nbytes]() -> Result> { - std::this_thread::sleep_for(std::chrono::seconds(1)); - return self->DoReadAt(position, nbytes); - })); - } - - std::atomic read_async_count{0}; -}; - TEST_F(TestParquetFileFormat, MultithreadedScanRegression) { // GH-38438: This test is similar to MultithreadedScan, but it try to use self // designed Executor and DelayedBufferReader to mock async execution to make // the state machine more complex. 
- auto reader = MakeGeneratedRecordBatch(schema({field("utf8", utf8())}), 10000, 100); - - ASSERT_OK_AND_ASSIGN(auto buffer, ParquetFormatHelper::Write(reader.get())); - - std::vector> completes; - std::vector> pools; - - for (int idx = 0; idx < 2; ++idx) { - auto buffer_reader = std::make_shared(buffer); - auto source = std::make_shared(buffer_reader, buffer->size()); - auto fragment = MakeFragment(*source); - std::shared_ptr scanner; - - { - auto options = std::make_shared(); - ASSERT_OK_AND_ASSIGN(auto thread_pool, arrow::internal::ThreadPool::Make(1)); - pools.emplace_back(thread_pool); - options->io_context = - ::arrow::io::IOContext(::arrow::default_memory_pool(), pools.back().get()); - auto fragment_scan_options = std::make_shared(); - fragment_scan_options->arrow_reader_properties->set_pre_buffer(true); - - options->fragment_scan_options = fragment_scan_options; - ScannerBuilder builder(ArithmeticDatasetFixture::schema(), fragment, options); - - ASSERT_OK(builder.UseThreads(true)); - ASSERT_OK(builder.BatchSize(10000)); - ASSERT_OK_AND_ASSIGN(scanner, builder.Finish()); - } - - ASSERT_OK_AND_ASSIGN(auto batch, scanner->Head(10000)); - [[maybe_unused]] auto fut = scanner->ScanBatchesUnorderedAsync(); - // Random ReadAsync calls, generate some futures to make the state machine - // more complex. 
- for (int yy = 0; yy < 16; yy++) { - completes.emplace_back(buffer_reader->ReadAsync(::arrow::io::IOContext(), 0, 1001)); - } - scanner = nullptr; - } + CustomizeScanOptionsWithThreadPool customize_io_context = + [](ScanOptions& options, arrow::internal::ThreadPool* pool) { + options.io_context = ::arrow::io::IOContext(::arrow::default_memory_pool(), pool); + }; + TestMultithreadedRegression(customize_io_context); +} - for (auto& f : completes) { - f.Wait(); - } +TEST_F(TestParquetFileFormat, MultithreadedComputeRegression) { + // GH-43694: Test similar situation as MultithreadedScanRegression but with + // the customized CPU executor instead + CustomizeScanOptionsWithThreadPool customize_cpu_executor = + [](ScanOptions& options, arrow::internal::ThreadPool* pool) { + options.cpu_executor = pool; + }; + TestMultithreadedRegression(customize_cpu_executor); } } // namespace dataset diff --git a/cpp/src/arrow/dataset/file_test.cc b/cpp/src/arrow/dataset/file_test.cc index 5d6068557f0..8904531200d 100644 --- a/cpp/src/arrow/dataset/file_test.cc +++ b/cpp/src/arrow/dataset/file_test.cc @@ -15,15 +15,22 @@ // specific language governing permissions and limitations // under the License. 
+#include +#include #include #include #include +#include #include #include #include #include +#include +#include +#include +#include #include "arrow/acero/exec_plan.h" #include "arrow/acero/test_util_internal.h" #include "arrow/array/array_primitive.h" @@ -31,6 +38,7 @@ #include "arrow/dataset/api.h" #include "arrow/dataset/partition.h" #include "arrow/dataset/plan.h" +#include "arrow/dataset/projector.h" #include "arrow/dataset/test_util_internal.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/test_util.h" @@ -353,6 +361,165 @@ TEST_F(TestFileSystemDataset, WriteProjected) { } } +// This kernel delays execution for some specific scalar values, +// which guarantees the writing phase sees out-of-order exec batches +Status delay(compute::KernelContext* ctx, const compute::ExecSpan& batch, + compute::ExecResult* out) { + const ArraySpan& input = batch[0].array; + const auto* input_values = input.GetValues(1); + uint8_t* output_values = out->array_span()->buffers[1].data; + + // Boolean data is stored in 1 bit per value + for (int64_t i = 0; i < input.length; ++i) { + if (input_values[i] % 16 == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + bit_util::SetBitTo(output_values, i, true); + } + + return Status::OK(); +} + +// A fragment with start=0 will defer ScanBatchesAsync returning a batch generator +// This guarantees a dataset of multiple fragments could produce out-of-order batches +class MockFragment : public Fragment { + public: + explicit MockFragment(uint32_t start, int64_t rows_per_batch, int num_batches, + const std::shared_ptr& schema) + : Fragment(compute::literal(true), schema), + start_(start), + rows_per_batch_(rows_per_batch), + num_batches_(num_batches) {} + + Result ScanBatchesAsync( + const std::shared_ptr& options) override { + // Fragment with start_=0 defers returning the generator + if (start_ == 0) { + std::this_thread::sleep_for(std::chrono::duration(0.1)); + } + + auto vec = 
gen::Gen({gen::Step(start_)}) + ->FailOnError() + ->RecordBatches(rows_per_batch_, num_batches_); + auto it = MakeVectorIterator(vec); + return MakeBackgroundGenerator(std::move(it), io::default_io_context().executor()); + } + + std::string type_name() const override { return "mock"; } + + protected: + Result> ReadPhysicalSchemaImpl() override { + return given_physical_schema_; + }; + + private: + uint32_t start_; + int64_t rows_per_batch_; + int num_batches_; +}; + +// This dataset consists of multiple fragments with incrementing values across the +// fragments +class MockDataset : public Dataset { + public: + explicit MockDataset(const std::shared_ptr& schema) : Dataset(schema) {} + + MockDataset(const std::shared_ptr& schema, + const compute::Expression& partition_expression) + : Dataset(schema, partition_expression) {} + + std::string type_name() const override { return "mock"; } + Result> ReplaceSchema( + std::shared_ptr schema) const override { + RETURN_NOT_OK(CheckProjectable(*schema_, *schema)); + return std::make_shared(std::move(schema)); + } + + protected: + Result GetFragmentsImpl(compute::Expression predicate) override { + FragmentVector fragments; + fragments.push_back(std::make_shared(0, 2, 1024, schema_)); + fragments.push_back(std::make_shared(2 * 1024, 2, 1024, schema_)); + return MakeVectorIterator(std::move(fragments)); + }; +}; + +TEST_F(TestFileSystemDataset, MultiThreadedWritePersistsOrder) { + // Test for GH-26818 + // + // This test uses std::this_thread::sleep_for to increase chances for batches + // to get written out-of-order in multi-threaded environment. + // With preserve_order = false, the existence of out-of-order is asserted to + // verify that the test setup reliably writes out-of-order sequences, and + // that write_options.preserve_order = preserve_order can recreate order. 
+ // + // Estimates for out_of_order == false and preserve_order == false to occur + // are 10^-62 https://github.com/apache/arrow/pull/44470#discussion_r2079049038 + // + // If this test starts to reliably fail with preserve_order == false, the test setup + // has to be revised to again reliably produce out-of-order sequences. + auto format = std::make_shared(); + FileSystemDatasetWriteOptions write_options; + write_options.file_write_options = format->DefaultWriteOptions(); + write_options.base_dir = "root"; + write_options.partitioning = std::make_shared(schema({})); + write_options.basename_template = "{i}.feather"; + + // The Mock dataset delays emitting the first fragment, which test sequenced output of + // scan node + auto dataset = std::make_shared(schema({field("f0", int32())})); + + // The delay scalar function delays some batches of all fragments, which tests implicit + // ordering + auto delay_func = std::make_shared("delay", compute::Arity(1), + compute::FunctionDoc()); + compute::ScalarKernel delay_kernel; + delay_kernel.exec = delay; + delay_kernel.signature = compute::KernelSignature::Make({int32()}, boolean()); + ASSERT_OK(delay_func->AddKernel(delay_kernel)); + ASSERT_OK(compute::GetFunctionRegistry()->AddFunction(delay_func)); + + for (bool preserve_order : {true, false}) { + ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan()); + ASSERT_OK(scanner_builder->UseThreads(true)); + ASSERT_OK( + scanner_builder->Filter(compute::call("delay", {compute::field_ref("f0")}))); + ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish()); + + auto fs = std::make_shared(fs::kNoTime); + write_options.filesystem = fs; + write_options.preserve_order = preserve_order; + + ASSERT_OK(FileSystemDataset::Write(write_options, scanner)); + + // Read the file back out and verify the order + ASSERT_OK_AND_ASSIGN(auto dataset_factory, FileSystemDatasetFactory::Make( + fs, {"root/0.feather"}, format, {})); + ASSERT_OK_AND_ASSIGN(auto written_dataset, 
dataset_factory->Finish(FinishOptions{})); + ASSERT_OK_AND_ASSIGN(scanner_builder, written_dataset->NewScan()); + ASSERT_OK(scanner_builder->UseThreads(false)); + ASSERT_OK_AND_ASSIGN(scanner, scanner_builder->Finish()); + ASSERT_OK_AND_ASSIGN(auto actual, scanner->ToTable()); + TableBatchReader reader(*actual); + std::shared_ptr batch; + ASSERT_OK(reader.ReadNext(&batch)); + int32_t prev = -1; + auto out_of_order = false; + while (batch != nullptr) { + const auto* values = batch->column(0)->data()->GetValues(1); + for (int row = 0; row < batch->num_rows(); ++row) { + int32_t value = values[row]; + if (value <= prev) { + out_of_order = true; + } + prev = value; + } + ASSERT_OK(reader.ReadNext(&batch)); + } + ASSERT_EQ(!out_of_order, preserve_order); + } +} + class FileSystemWriteTest : public testing::TestWithParam> { using PlanFactory = std::function( const FileSystemDatasetWriteOptions&, diff --git a/cpp/src/arrow/dataset/pch.h b/cpp/src/arrow/dataset/pch.h deleted file mode 100644 index a74fd96e355..00000000000 --- a/cpp/src/arrow/dataset/pch.h +++ /dev/null @@ -1,27 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Often-used headers, for precompiling. 
-// If updating this header, please make sure you check compilation speed -// before checking in. Adding headers which are not used extremely often -// may incur a slowdown, since it makes the precompiled header heavier to load. - -// This API is EXPERIMENTAL. - -#include "arrow/dataset/dataset.h" -#include "arrow/dataset/scanner.h" -#include "arrow/pch.h" diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index a8c8c6bde68..222e1323d4a 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -360,8 +360,9 @@ class OneShotFragment : public Fragment { ARROW_ASSIGN_OR_RAISE( auto background_gen, MakeBackgroundGenerator(std::move(batch_it_), options->io_context.executor())); - return MakeTransferredGenerator(std::move(background_gen), - ::arrow::internal::GetCpuThreadPool()); + auto cpu_executor = options->cpu_executor ? options->cpu_executor + : ::arrow::internal::GetCpuThreadPool(); + return MakeTransferredGenerator(std::move(background_gen), cpu_executor); } std::string type_name() const override { return "one-shot"; } @@ -387,7 +388,7 @@ Result AsyncScanner::ScanBatches() { [this](::arrow::internal::Executor* executor) { return ScanBatchesAsync(executor); }, - scan_options_->use_threads); + scan_options_->use_threads, scan_options_->cpu_executor); } Result AsyncScanner::ScanBatchesUnordered() { @@ -395,7 +396,7 @@ Result AsyncScanner::ScanBatchesUnordered() { [this](::arrow::internal::Executor* executor) { return ScanBatchesUnorderedAsync(executor); }, - scan_options_->use_threads); + scan_options_->use_threads, scan_options_->cpu_executor); } Result> AsyncScanner::ToTable() { @@ -405,7 +406,9 @@ Result> AsyncScanner::ToTable() { } Result AsyncScanner::ScanBatchesUnorderedAsync() { - return ScanBatchesUnorderedAsync(::arrow::internal::GetCpuThreadPool(), + return ScanBatchesUnorderedAsync(scan_options_->cpu_executor + ? 
scan_options_->cpu_executor + : ::arrow::internal::GetCpuThreadPool(), /*sequence_fragments=*/false); } @@ -606,7 +609,9 @@ Result> AsyncScanner::Head(int64_t num_rows) { } Result AsyncScanner::ScanBatchesAsync() { - return ScanBatchesAsync(::arrow::internal::GetCpuThreadPool()); + return ScanBatchesAsync(scan_options_->cpu_executor + ? scan_options_->cpu_executor + : ::arrow::internal::GetCpuThreadPool()); } Result AsyncScanner::ScanBatchesAsync( @@ -783,7 +788,9 @@ Future AsyncScanner::CountRowsAsync(Executor* executor) { } Future AsyncScanner::CountRowsAsync() { - return CountRowsAsync(::arrow::internal::GetCpuThreadPool()); + return CountRowsAsync(scan_options_->cpu_executor + ? scan_options_->cpu_executor + : ::arrow::internal::GetCpuThreadPool()); } Result AsyncScanner::CountRows() { diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 50310577f1e..7885b132cc9 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -35,6 +35,7 @@ #include "arrow/type_fwd.h" #include "arrow/util/async_generator_fwd.h" #include "arrow/util/iterator.h" +#include "arrow/util/thread_pool.h" #include "arrow/util/type_fwd.h" namespace arrow { @@ -104,6 +105,13 @@ struct ARROW_DS_EXPORT ScanOptions { /// Note: The IOContext executor will be ignored if use_threads is set to false io::IOContext io_context; + /// Executor for any CPU tasks + /// + /// If null, the global CPU executor will be used + /// + /// Note: The Executor will be ignored if use_threads is set to false + arrow::internal::Executor* cpu_executor = NULLPTR; + /// If true the scanner will scan in parallel /// /// Note: If true, this will use threads from both the cpu_executor and the diff --git a/cpp/src/arrow/dataset/test_util_internal.h b/cpp/src/arrow/dataset/test_util_internal.h index ee73ebc5a48..0dc8264a9cc 100644 --- a/cpp/src/arrow/dataset/test_util_internal.h +++ b/cpp/src/arrow/dataset/test_util_internal.h @@ -18,7 +18,6 @@ #pragma once #include 
-#include #include #include #include diff --git a/cpp/src/arrow/datum.cc b/cpp/src/arrow/datum.cc index d7125d1f6c9..39900780986 100644 --- a/cpp/src/arrow/datum.cc +++ b/cpp/src/arrow/datum.cc @@ -31,7 +31,7 @@ #include "arrow/table.h" #include "arrow/util/byte_size.h" #include "arrow/util/logging_internal.h" -#include "arrow/util/memory.h" +#include "arrow/util/memory_internal.h" namespace arrow { diff --git a/cpp/src/arrow/engine/ArrowSubstraitConfig.cmake.in b/cpp/src/arrow/engine/ArrowSubstraitConfig.cmake.in index 932c52ac65f..013ed928490 100644 --- a/cpp/src/arrow/engine/ArrowSubstraitConfig.cmake.in +++ b/cpp/src/arrow/engine/ArrowSubstraitConfig.cmake.in @@ -27,10 +27,10 @@ @PACKAGE_INIT@ include(CMakeFindDependencyMacro) -find_dependency(Arrow) -find_dependency(ArrowAcero) -find_dependency(ArrowDataset) -find_dependency(Parquet) +find_dependency(Arrow CONFIG) +find_dependency(ArrowAcero CONFIG) +find_dependency(ArrowDataset CONFIG) +find_dependency(Parquet CONFIG) include("${CMAKE_CURRENT_LIST_DIR}/ArrowSubstraitTargets.cmake") diff --git a/cpp/src/arrow/engine/CMakeLists.txt b/cpp/src/arrow/engine/CMakeLists.txt index 6978a8383f0..adf98087ad1 100644 --- a/cpp/src/arrow/engine/CMakeLists.txt +++ b/cpp/src/arrow/engine/CMakeLists.txt @@ -51,8 +51,6 @@ add_arrow_lib(arrow_substrait ARROW_SUBSTRAIT_LIBRARIES SOURCES ${ARROW_SUBSTRAIT_SRCS} - PRECOMPILED_HEADERS - "$<$:arrow/engine/pch.h>" SHARED_LINK_FLAGS ${ARROW_VERSION_SCRIPT_FLAGS} # Defined in cpp/arrow/CMakeLists.txt SHARED_LINK_LIBS @@ -89,6 +87,7 @@ add_arrow_test(substrait_test substrait/test_util.cc EXTRA_LINK_LIBS ${ARROW_SUBSTRAIT_TEST_LINK_LIBS} + arrow_compute_testing PREFIX "arrow-substrait" LABELS diff --git a/cpp/src/arrow/engine/pch.h b/cpp/src/arrow/engine/pch.h deleted file mode 100644 index ddb4c120f2a..00000000000 --- a/cpp/src/arrow/engine/pch.h +++ /dev/null @@ -1,23 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Often-used headers, for precompiling. -// If updating this header, please make sure you check compilation speed -// before checking in. Adding headers which are not used extremely often -// may incur a slowdown, since it makes the precompiled header heavier to load. - -#include "arrow/pch.h" diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 464e6d26703..d91edbfce6d 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -88,8 +88,6 @@ Id NormalizeFunctionName(Id id) { return {id.uri, func_name}; } -} // namespace - Status DecodeArg(const substrait::FunctionArgument& arg, int idx, SubstraitCall* call, const ExtensionSet& ext_set, const ConversionOptions& conversion_options) { @@ -136,15 +134,6 @@ Result DecodeScalarFunction( return call; } -std::string EnumToString(int value, const google::protobuf::EnumDescriptor* descriptor) { - const google::protobuf::EnumValueDescriptor* value_desc = - descriptor->FindValueByNumber(value); - if (value_desc == nullptr) { - return "unknown"; - } - return std::string(value_desc->name()); -} - Result FromProto(const substrait::Expression::ReferenceSegment* ref, const ExtensionSet& 
ext_set, const ConversionOptions& conversion_options, @@ -229,6 +218,8 @@ Result FromProto(const substrait::Expression::FieldReferenc return FromProto(&dref, ext_set, conversion_options, std::move(in_expr)); } +} // namespace + Result DirectReferenceFromProto( const substrait::Expression::FieldReference* fref, const ExtensionSet& ext_set, const ConversionOptions& conversion_options) { @@ -678,7 +669,7 @@ Result FromProto(const substrait::Expression::Literal& lit, for (int i = 0; i < map.key_values_size(); ++i) { const auto& kv = map.key_values(i); - static const std::array kMissing = {"key and value", "value", + static const std::array kMissing = {"key and value", "value", "key", nullptr}; if (auto missing = kMissing[kv.has_key() + kv.has_value() * 2]) { return Status::Invalid("While converting to MapScalar encountered missing ", @@ -1128,6 +1119,7 @@ struct ScalarToProtoImpl { ExtensionSet* ext_set_; const ConversionOptions& conversion_options_; }; + } // namespace Result> ToProto( @@ -1152,7 +1144,9 @@ Result> ToProto( return out; } -static Status AddChildToReferenceSegment( +namespace { + +Status AddChildToReferenceSegment( substrait::Expression::ReferenceSegment& segment, std::unique_ptr&& child) { auto status = Status::Invalid("Attempt to add child to incomplete reference segment"); @@ -1197,7 +1191,7 @@ static Status AddChildToReferenceSegment( // Indexes the given Substrait expression or root (if expr is empty) using the given // ReferenceSegment. -static Result> MakeDirectReference( +Result> MakeDirectReference( std::unique_ptr&& expr, std::unique_ptr&& ref_segment) { // If expr is already a selection expression, add the index to its index stack. @@ -1227,7 +1221,7 @@ static Result> MakeDirectReference( // Indexes the given Substrait struct-typed expression or root (if expr is empty) using // the given field index. 
-static Result> MakeStructFieldReference( +Result> MakeStructFieldReference( std::unique_ptr&& expr, int field) { auto struct_field = std::make_unique(); @@ -1240,7 +1234,7 @@ static Result> MakeStructFieldReference( } // Indexes the given Substrait list-typed expression using the given offset. -static Result> MakeListElementReference( +Result> MakeListElementReference( std::unique_ptr&& expr, int offset) { auto list_element = std::make_unique(); @@ -1340,6 +1334,8 @@ Result>> DatumToLiterals( return literals; } +} // namespace + Result> ToProto( const compute::Expression& expr, ExtensionSet* ext_set, const ConversionOptions& conversion_options) { diff --git a/cpp/src/arrow/engine/substrait/extension_set.cc b/cpp/src/arrow/engine/substrait/extension_set.cc index 559b618157c..4fb9b19645b 100644 --- a/cpp/src/arrow/engine/substrait/extension_set.cc +++ b/cpp/src/arrow/engine/substrait/extension_set.cc @@ -175,7 +175,7 @@ void SubstraitCall::SetValueArg(int index, compute::Expression value_arg) { value_args_[index] = std::move(value_arg); } -std::optional const*> SubstraitCall::GetOption( +std::optional*> SubstraitCall::GetOption( std::string_view option_name) const { auto opt = options_.find(std::string(option_name)); if (opt == options_.end()) { @@ -762,11 +762,11 @@ Result ParseOptionOrElse(const SubstraitCall& call, std::string_view optio const EnumParser& parser, const std::vector& implemented_options, Enum fallback) { - std::optional const*> enum_arg = call.GetOption(option_name); + std::optional*> enum_arg = call.GetOption(option_name); if (!enum_arg.has_value()) { return fallback; } - std::vector const* prefs = *enum_arg; + const std::vector* prefs = *enum_arg; for (const std::string& pref : *prefs) { ARROW_ASSIGN_OR_RAISE(Enum parsed, parser.Parse(pref)); for (Enum implemented_opt : implemented_options) { diff --git a/cpp/src/arrow/engine/substrait/extension_set.h b/cpp/src/arrow/engine/substrait/extension_set.h index c18e0cf77aa..4f631e0f193 100644 --- 
a/cpp/src/arrow/engine/substrait/extension_set.h +++ b/cpp/src/arrow/engine/substrait/extension_set.h @@ -141,7 +141,7 @@ class ARROW_ENGINE_EXPORT SubstraitCall { Result GetValueArg(int index) const; bool HasValueArg(int index) const; void SetValueArg(int index, compute::Expression value_arg); - std::optional const*> GetOption( + std::optional*> GetOption( std::string_view option_name) const; void SetOption(std::string_view option_name, const std::vector& option_preferences); diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc b/cpp/src/arrow/engine/substrait/relation_internal.cc index 4d52ba3820a..1ea143f9c58 100644 --- a/cpp/src/arrow/engine/substrait/relation_internal.cc +++ b/cpp/src/arrow/engine/substrait/relation_internal.cc @@ -94,6 +94,8 @@ Result GetEmitInfo(const RelMessage& rel, return emit_info; } +namespace { + Result ProcessEmitProject( std::optional rel_common_opt, const DeclarationInfo& project_declr, const std::shared_ptr& input_schema) { @@ -130,6 +132,8 @@ Result ProcessEmitProject( } } +} // namespace + template Result ProcessEmit(const RelMessage& rel, const DeclarationInfo& no_emit_declr, @@ -153,6 +157,7 @@ Result ProcessEmit(const RelMessage& rel, return no_emit_declr; } } + /// In the specialization, a single ProjectNode is being used to /// get the Acero relation with or without emit. 
template <> @@ -163,6 +168,8 @@ Result ProcessEmit(const substrait::ProjectRel& rel, no_emit_declr, schema); } +namespace { + Result ProcessExtensionEmit(const DeclarationInfo& no_emit_declr, const std::vector& emit_order) { const std::shared_ptr& input_schema = no_emit_declr.output_schema; @@ -289,6 +296,8 @@ Status DiscoverFilesFromDir(const std::shared_ptr& local_fs return Status::OK(); } +} // namespace + namespace internal { Result ParseAggregateMeasure( @@ -1100,8 +1109,6 @@ Result> FilterRelationConverter( return filter_rel; } -} // namespace - Status SerializeAndCombineRelations(const acero::Declaration& declaration, ExtensionSet* ext_set, std::unique_ptr* rel, @@ -1141,6 +1148,8 @@ Status SerializeAndCombineRelations(const acero::Declaration& declaration, return Status::OK(); } +} // namespace + Result> ToProto( const acero::Declaration& declr, ExtensionSet* ext_set, const ConversionOptions& conversion_options) { diff --git a/cpp/src/arrow/engine/substrait/serde.cc b/cpp/src/arrow/engine/substrait/serde.cc index db2dcb59282..5ce97cb0ccf 100644 --- a/cpp/src/arrow/engine/substrait/serde.cc +++ b/cpp/src/arrow/engine/substrait/serde.cc @@ -48,6 +48,8 @@ namespace arrow { namespace engine { +namespace { + Status ParseFromBufferImpl(const Buffer& buf, const std::string& full_name, google::protobuf::Message* message) { google::protobuf::io::ArrayInputStream buf_stream{buf.data(), @@ -59,6 +61,8 @@ Status ParseFromBufferImpl(const Buffer& buf, const std::string& full_name, return Status::Invalid("ParseFromZeroCopyStream failed for ", full_name); } +} // namespace + template Result ParseFromBuffer(const Buffer& buf) { Message message; diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 6762d1e0454..f92aee8eaad 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -1065,6 +1065,10 @@ NamedTableProvider AlwaysProvideSameTable(std::shared_ptr
table) { } TEST(Substrait, ExecReadRelWithLocalFiles) { +#ifdef _WIN32 + GTEST_SKIP() + << "GH-47490: Substrait does not properly parse PARQUET_TEST_DATA path on Windows"; +#endif ASSERT_OK_AND_ASSIGN(std::string dir_string, arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); diff --git a/cpp/src/arrow/engine/substrait/test_plan_builder.cc b/cpp/src/arrow/engine/substrait/test_plan_builder.cc index 724c58277e7..a8302145f54 100644 --- a/cpp/src/arrow/engine/substrait/test_plan_builder.cc +++ b/cpp/src/arrow/engine/substrait/test_plan_builder.cc @@ -42,7 +42,9 @@ namespace arrow { namespace engine { namespace internal { -static const ConversionOptions kPlanBuilderConversionOptions; +namespace { + +const ConversionOptions kPlanBuilderConversionOptions; Result> CreateRead(const Table& table, ExtensionSet* ext_set) { @@ -185,6 +187,8 @@ Result> CreatePlan(std::unique_ptr> CreateScanProjectSubstrait( Id function_id, const std::shared_ptr
& input_table, const std::vector& arguments, diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index b469f5fa0ba..3e8c0dda765 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -203,7 +203,7 @@ Result, bool>> FromProto( case substrait::Type::kMap: { const auto& map = type.map(); - static const std::array kMissing = {"key and value", "value", "key", + static const std::array kMissing = {"key and value", "value", "key", nullptr}; if (auto missing = kMissing[map.has_key() + map.has_value() * 2]) { return Status::Invalid("While converting to MapType encountered missing ", diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc index 1b195e4c9f5..bb7082e6976 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc @@ -28,8 +28,8 @@ #include "arrow/tensor.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging_internal.h" -#include "arrow/util/print.h" -#include "arrow/util/sort.h" +#include "arrow/util/print_internal.h" +#include "arrow/util/sort_internal.h" #include "arrow/util/string.h" #include @@ -202,7 +202,7 @@ std::shared_ptr FixedShapeTensorType::MakeArray( DCHECK_EQ(data->type->id(), Type::EXTENSION); DCHECK_EQ("arrow.fixed_shape_tensor", internal::checked_cast(*data->type).extension_name()); - return std::make_shared(data); + return std::make_shared(data); } Result> FixedShapeTensorType::MakeTensor( diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 51aea4b25fd..6d4d2de3265 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -28,7 +28,7 @@ #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" -#include 
"arrow/util/sort.h" +#include "arrow/util/sort_internal.h" namespace arrow { @@ -152,6 +152,28 @@ TEST_F(TestExtensionType, CreateFromArray) { ASSERT_EQ(ext_arr->null_count(), 0); } +TEST_F(TestExtensionType, MakeArrayCanGetCorrectScalarType) { + ASSERT_OK_AND_ASSIGN(auto tensor, + Tensor::Make(value_type_, Buffer::Wrap(values_), shape_)); + + auto exact_ext_type = internal::checked_pointer_cast(ext_type_); + ASSERT_OK_AND_ASSIGN(auto ext_arr, FixedShapeTensorArray::FromTensor(tensor)); + + auto data = ext_arr->data(); + auto array = internal::checked_pointer_cast( + exact_ext_type->MakeArray(data)); + ASSERT_EQ(array->length(), shape_[0]); + ASSERT_EQ(array->null_count(), 0); + + // Check that we can get the first element of the array + ASSERT_OK_AND_ASSIGN(auto first_element, array->GetScalar(0)); + ASSERT_EQ(*(first_element->type), + *(fixed_shape_tensor(value_type_, element_shape_, {0, 1}))); + + ASSERT_OK_AND_ASSIGN(auto tensor_from_array, array->ToTensor()); + ASSERT_TRUE(tensor->Equals(*tensor_from_array)); +} + void CheckSerializationRoundtrip(const std::shared_ptr& ext_type) { auto fst_type = internal::checked_pointer_cast(ext_type); auto serialized = fst_type->Serialize(); diff --git a/cpp/src/arrow/extension/meson.build b/cpp/src/arrow/extension/meson.build new file mode 100644 index 00000000000..663ebba4d4a --- /dev/null +++ b/cpp/src/arrow/extension/meson.build @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +canonical_extension_tests = ['bool8_test.cc', 'json_test.cc', 'uuid_test.cc'] + +if needs_json + canonical_extension_tests += [ + 'fixed_shape_tensor_test.cc', + 'opaque_test.cc', + ] +endif + +exc = executable( + 'arrow-canonical-extensions-test', + sources: canonical_extension_tests, + dependencies: [arrow_test_dep], +) +test('arrow-canonical-extensions-test', exc) + +install_headers( + ['bool8.h', 'fixed_shape_tensor.h', 'json.h', 'opaque.h', 'uuid.h'], +) diff --git a/cpp/src/arrow/extension/tensor_internal.h b/cpp/src/arrow/extension/tensor_internal.h index 069880cb17c..62b1dba6144 100644 --- a/cpp/src/arrow/extension/tensor_internal.h +++ b/cpp/src/arrow/extension/tensor_internal.h @@ -21,12 +21,11 @@ #include #include "arrow/status.h" -#include "arrow/util/print.h" +#include "arrow/util/print_internal.h" namespace arrow::internal { -ARROW_EXPORT -Status IsPermutationValid(const std::vector& permutation) { +inline Status IsPermutationValid(const std::vector& permutation) { const auto size = static_cast(permutation.size()); std::vector dim_seen(size, 0); diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc index 2f36eb3e7d1..b24f6895d0c 100644 --- a/cpp/src/arrow/extension/uuid.cc +++ b/cpp/src/arrow/extension/uuid.cc @@ -40,7 +40,7 @@ Result> UuidType::Deserialize( if (!serialized.empty()) { return Status::Invalid("Unexpected serialized metadata: '", serialized, "'"); } - if (!storage_type->Equals(*fixed_size_binary(16))) { + if (!IsSupportedStorageType(storage_type)) { return Status::Invalid("Invalid storage type for 
UuidType: ", storage_type->ToString()); } @@ -55,4 +55,8 @@ std::string UuidType::ToString(bool show_metadata) const { std::shared_ptr uuid() { return std::make_shared(); } +bool UuidType::IsSupportedStorageType(const std::shared_ptr& storage_type) { + return storage_type->Equals(*fixed_size_binary(16)); +} + } // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h index 42bb21cf0b2..8c9660c463b 100644 --- a/cpp/src/arrow/extension/uuid.h +++ b/cpp/src/arrow/extension/uuid.h @@ -53,6 +53,8 @@ class ARROW_EXPORT UuidType : public ExtensionType { /// \brief Create a UuidType instance static Result> Make() { return std::make_shared(); } + + static bool IsSupportedStorageType(const std::shared_ptr& storage_type); }; /// \brief Return a UuidType instance. diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index 5642e16bcfb..c6d730fcb93 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -70,7 +70,7 @@ enum class HierarchicalNamespaceSupport { /// account. /// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never /// returned). 
-Result CheckIfHierarchicalNamespaceIsEnabled( +ARROW_EXPORT Result CheckIfHierarchicalNamespaceIsEnabled( const Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, const arrow::fs::AzureOptions& options); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 31deb42ce0a..43d1c2afb77 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -290,9 +290,9 @@ struct PreexistingData { public: const std::string container_name; - static constexpr char const* kObjectName = "test-object-name"; + static constexpr const char* kObjectName = "test-object-name"; - static constexpr char const* kLoremIpsum = R"""( + static constexpr const char* kLoremIpsum = R"""( Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. @@ -323,7 +323,7 @@ culpa qui officia deserunt mollit anim id est laborum. 
static std::string RandomContainerName(RNG& rng) { return RandomChars(32, rng); } static std::string RandomChars(int count, RNG& rng) { - auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); + const auto fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); std::uniform_int_distribution d(0, static_cast(fillers.size()) - 1); std::string s; std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(rng)]; }); @@ -987,7 +987,7 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); - for (auto const& line : lines) { + for (const auto& line : lines) { ASSERT_OK(output->Write(line.data(), line.size())); } ASSERT_OK(output->Close()); @@ -1041,9 +1041,9 @@ class TestAzureFileSystem : public ::testing::Test { }; } - char const* kSubData = "sub data"; - char const* kSomeData = "some data"; - char const* kOtherData = "other data"; + const char* kSubData = "sub data"; + const char* kSomeData = "some data"; + const char* kOtherData = "other data"; void SetUpSmallFileSystemTree() { // Set up test containers @@ -1094,7 +1094,7 @@ class TestAzureFileSystem : public ::testing::Test { } #define ASSERT_RAISES_ERRNO(expr, expected_errno) \ - for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); \ + for (::arrow::Status _st = ::arrow::ToStatus((expr)); \ !WithErrno(_st, (expected_errno));) \ FAIL() << "'" ARROW_STRINGIFY(expr) "' did not fail with errno=" << #expected_errno \ << ": " << _st.ToString() @@ -1872,7 +1872,7 @@ class TestAzureFileSystem : public ::testing::Test { FileInfo _src_info; \ ASSERT_OK( \ CheckExpectedErrno(_src, _dest, _expected_errno, #expected_errno, &_src_info)); \ - auto _move_st = ::arrow::internal::GenericToStatus(fs()->Move(_src, _dest)); \ + auto _move_st = ::arrow::ToStatus(fs()->Move(_src, _dest)); \ if (_expected_errno.has_value()) { \ if 
(WithErrno(_move_st, *_expected_errno)) { \ /* If the Move failed, the source should remain unchanged. */ \ @@ -3126,8 +3126,8 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileMixedReadVsReadAt) { } // Verify random reads interleave too. - auto const index = PreexistingData::RandomIndex(kLineCount, rng_); - auto const position = index * kLineWidth; + const auto index = PreexistingData::RandomIndex(kLineCount, rng_); + const auto position = index * kLineWidth; ASSERT_OK_AND_ASSIGN(size, file->ReadAt(position, buffer.size(), buffer.data())); EXPECT_EQ(size, kLineWidth); auto actual = std::string{buffer.begin(), buffer.end()}; @@ -3160,8 +3160,8 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileRandomSeek) { for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. - auto const index = PreexistingData::RandomIndex(kLineCount, rng_); - auto const position = index * kLineWidth; + const auto index = PreexistingData::RandomIndex(kLineCount, rng_); + const auto position = index * kLineWidth; ASSERT_OK(file->Seek(position)); ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); EXPECT_EQ(lines[index], actual->ToString()); @@ -3197,7 +3197,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileInfo) { auto constexpr kStart = 16; ASSERT_OK_AND_ASSIGN(size, file->ReadAt(kStart, buffer.size(), buffer.data())); - auto const expected = std::string(PreexistingData::kLoremIpsum).substr(kStart); + const auto expected = std::string(PreexistingData::kLoremIpsum).substr(kStart); EXPECT_EQ(std::string(buffer.data(), size), expected); } diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index c02512c4930..8281bed7ce1 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -991,11 +991,6 @@ Result> FileSystemFromUriOrPath( return FileSystemFromUri(uri_string, io_context, out_path); } -Status FileSystemFromUri(const std::string& uri, 
std::shared_ptr* out_fs, - std::string* out_path) { - return FileSystemFromUri(uri, out_path).Value(out_fs); -} - Status Initialize(const FileSystemGlobalOptions& options) { internal::global_options = options; return Status::OK(); diff --git a/cpp/src/arrow/filesystem/filesystem_library.h b/cpp/src/arrow/filesystem/filesystem_library.h index d610c72237a..1d656901308 100644 --- a/cpp/src/arrow/filesystem/filesystem_library.h +++ b/cpp/src/arrow/filesystem/filesystem_library.h @@ -26,7 +26,9 @@ extern "C" { // _declspec(dllexport)/[[gnu::visibility("default")]] even when // this header is #included by a non-arrow source, as in a third // party filesystem implementation. -ARROW_FORCE_EXPORT void* arrow_filesystem_get_registry() { +ARROW_FORCE_EXPORT void* arrow_filesystem_get_registry(); + +void* arrow_filesystem_get_registry() { // In the case where libarrow is linked statically both to the executable and to a // dynamically loaded filesystem implementation library, the library contains a // duplicate definition of the registry into which the library's instances of diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 9869687a8b9..82d8a2a13f8 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -62,7 +62,7 @@ struct GcsPath { return Status::Invalid( "Expected a GCS object path of the form 'bucket/key...', got a URI: '", s, "'"); } - auto const first_sep = s.find_first_of(internal::kSep); + const auto first_sep = s.find_first_of(internal::kSep); if (first_sep == 0) { return Status::Invalid("Path cannot start with a separator ('", s, "')"); } @@ -353,12 +353,16 @@ class GcsFileSystem::Impl { // matches the prefix we assume it is a directory. 
std::string canonical = internal::EnsureTrailingSlash(path.object); auto list_result = client_.ListObjects(path.bucket, gcs::Prefix(canonical)); - if (list_result.begin() != list_result.end()) { - // If there is at least one result it indicates this is a directory (at - // least one object exists that starts with "path/") + + for (auto&& object_metadata : list_result) { + if (!object_metadata) { + continue; + } + // If there is at least one valid result, it indicates this is a + // directory (at least one object exists that starts with "path/") return FileInfo(path.full_path, FileType::Directory); } - // Return the original not-found info if there was no match. + // Return the original not-found info if there was no valid result. return info; } @@ -380,7 +384,7 @@ class GcsFileSystem::Impl { auto include_trailing = select.recursive ? gcs::IncludeTrailingDelimiter(false) : gcs::IncludeTrailingDelimiter(true); FileInfoVector result; - for (auto const& o : + for (const auto& o : client_.ListObjects(p.bucket, prefix, delimiter, include_trailing)) { if (!o.ok()) { if (select.allow_not_found && @@ -437,7 +441,7 @@ class GcsFileSystem::Impl { } Status CreateDirMarkerRecursive(const std::string& bucket, const std::string& name) { - auto get_parent = [](std::string const& path) { + auto get_parent = [](const std::string& path) { return std::move(internal::GetAbstractPathParent(path).first); }; // Find the list of missing parents. In the process we discover if any elements in @@ -474,7 +478,7 @@ class GcsFileSystem::Impl { // Note that the list of parents are sorted from deepest to most shallow, this is // convenient because as soon as we find a directory we can stop the iteration. 
- for (auto const& d : missing_parents) { + for (const auto& d : missing_parents) { auto o = CreateDirMarker(bucket, d); if (o) { if (IsDirectory(*o)) continue; diff --git a/cpp/src/arrow/filesystem/gcsfs_internal.cc b/cpp/src/arrow/filesystem/gcsfs_internal.cc index d155aec827d..721f4da88a8 100644 --- a/cpp/src/arrow/filesystem/gcsfs_internal.cc +++ b/cpp/src/arrow/filesystem/gcsfs_internal.cc @@ -168,7 +168,7 @@ Result ToObjectMetadata( return gcs::WithObjectMetadata{}; } - static auto const setters = [] { + static const auto setters = [] { using setter = std::function; return std::unordered_map{ {"Cache-Control", @@ -237,7 +237,7 @@ Result ToObjectMetadata( } Result> FromObjectMetadata( - gcs::ObjectMetadata const& m) { + const gcs::ObjectMetadata& m) { auto format_time = [](std::chrono::system_clock::time_point tp) { return absl::FormatTime(absl::RFC3339_full, absl::FromChrono(tp), absl::UTCTimeZone()); @@ -278,7 +278,7 @@ Result> FromObjectMetadata( result->Append("Content-Disposition", m.content_disposition()); result->Append("Content-Language", m.content_language()); result->Append("Cache-Control", m.cache_control()); - for (auto const& kv : m.metadata()) { + for (const auto& kv : m.metadata()) { result->Append("metadata." 
+ kv.first, kv.second); } // Skip "acl" because it is overly complex diff --git a/cpp/src/arrow/filesystem/gcsfs_internal.h b/cpp/src/arrow/filesystem/gcsfs_internal.h index a3aed099a3e..e06b5681896 100644 --- a/cpp/src/arrow/filesystem/gcsfs_internal.h +++ b/cpp/src/arrow/filesystem/gcsfs_internal.h @@ -60,7 +60,7 @@ ARROW_EXPORT Result ToObjectMetadata const std::shared_ptr& metadata); ARROW_EXPORT Result> FromObjectMetadata( - google::cloud::storage::ObjectMetadata const& m); + const google::cloud::storage::ObjectMetadata& m); ARROW_EXPORT std::int64_t Depth(std::string_view path); diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 0e457b2d502..e174638b535 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -54,7 +54,7 @@ using ::testing::Pair; using ::testing::UnorderedElementsAre; using ::testing::UnorderedElementsAreArray; -auto const* kLoremIpsum = R"""( +const auto* kLoremIpsum = R"""( Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
@@ -227,7 +227,7 @@ class GcsIntegrationTest : public ::testing::Test { constexpr auto kFilesPerFolder = 2; auto base_dir = internal::ConcatAbstractPath(PreexistingBucketPath(), "b"); auto result = Hierarchy{base_dir, {}}; - for (auto const* f : kTestFolders) { + for (const auto* f : kTestFolders) { const auto folder = internal::ConcatAbstractPath(PreexistingBucketPath(), f); RETURN_NOT_OK(fs->CreateDir(folder, true)); result.contents.push_back(arrow::fs::Dir(folder)); @@ -246,7 +246,7 @@ class GcsIntegrationTest : public ::testing::Test { std::vector static CleanupDirectoryNames( std::vector expected) { std::transform(expected.begin(), expected.end(), expected.begin(), - [](FileInfo const& info) { + [](const FileInfo& info) { if (!info.IsDirectory()) return info; return Dir(std::string(internal::RemoveTrailingSlash(info.path()))); }); @@ -255,7 +255,7 @@ class GcsIntegrationTest : public ::testing::Test { private: std::string RandomChars(std::size_t count) { - auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); + const auto fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); std::uniform_int_distribution d(0, fillers.size() - 1); std::string s; std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(generator_)]; }); @@ -423,7 +423,7 @@ TEST(GcsFileSystem, OptionsAsGoogleCloudOptions) { a.retry_limit_seconds = 40.5; a.project_id = "test-only-invalid-project-id"; - auto const o1 = internal::AsGoogleCloudOptions(a); + const auto o1 = internal::AsGoogleCloudOptions(a); EXPECT_TRUE(o1.has()); EXPECT_TRUE(o1.has()); EXPECT_EQ(o1.get(), "http://localhost:8080"); @@ -434,7 +434,7 @@ TEST(GcsFileSystem, OptionsAsGoogleCloudOptions) { a.retry_limit_seconds.reset(); a.project_id.reset(); - auto const o2 = internal::AsGoogleCloudOptions(a); + const auto o2 = internal::AsGoogleCloudOptions(a); EXPECT_TRUE(o2.has()); EXPECT_FALSE(o2.has()); EXPECT_FALSE(o2.has()); @@ -629,6 +629,38 @@ TEST_F(GcsIntegrationTest, GetFileInfoBucket) { 
ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingBucketName())); } +// We intentionally do not test invalid permissions with storage-testbench, +// because the testbench does not enforce any permission checks (ACL/IAM). +// See: +// https://github.com/googleapis/storage-testbench?tab=readme-ov-file#what-is-this-testbench +// +// In real GCS environments, trying to access a bucket without permission +// results in: +// - Error code: 5 +// - Status : NOT_FOUND +// +// The following test depends on real GCS access and is not suitable for CI/CD +// environments. To run this test manually, set the environment variable: +// ARROW_TEST_GCS_USE_REAL_SERVICE=1 +// +// Example: +// ARROW_TEST_GCS_USE_REAL_SERVICE=1 ./debug/arrow-gcsfs-test +TEST_F(GcsIntegrationTest, GetFileInfoWithoutPermission) { + if (!std::getenv("ARROW_TEST_GCS_USE_REAL_SERVICE")) { + GTEST_SKIP() << "Skipping test that requires real GCS access. " + << "Set ARROW_TEST_GCS_USE_REAL_SERVICE=1 to enable."; + } + auto options = GcsOptions::Anonymous(); + options.retry_limit_seconds = 15; + options.project_id = "test-only-invalid-project-id"; + + ASSERT_OK_AND_ASSIGN(auto fs, GcsFileSystem::Make(options)); + + AssertFileInfo(fs.get(), PreexistingBucketPath() + "dir/foo/", FileType::NotFound); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "dir/foo/bar.txt", + FileType::NotFound); +} + TEST_F(GcsIntegrationTest, GetFileInfoObjectWithNestedStructure) { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). 
@@ -853,7 +885,7 @@ TEST_F(GcsIntegrationTest, DeleteDirSuccess) { ASSERT_OK(fs->DeleteDir(hierarchy.base_dir)); arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketName(), FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File); - for (auto const& info : hierarchy.contents) { + for (const auto& info : hierarchy.contents) { const auto expected_type = fs::internal::IsAncestorOf(hierarchy.base_dir, info.path()) ? FileType::NotFound : info.type(); @@ -880,7 +912,7 @@ TEST_F(GcsIntegrationTest, DeleteDirContentsSuccess) { arrow::fs::AssertFileInfo(fs.get(), hierarchy.base_dir, FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketName(), FileType::Directory); arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File); - for (auto const& info : hierarchy.contents) { + for (const auto& info : hierarchy.contents) { auto expected_type = FileType::NotFound; if (info.path() == hierarchy.base_dir || !fs::internal::IsAncestorOf(hierarchy.base_dir, info.path())) { @@ -1264,7 +1296,7 @@ TEST_F(GcsIntegrationTest, OpenInputFileMixedReadVsReadAt) { PreexistingBucketPath() + "OpenInputFileMixedReadVsReadAt/object-name"; std::shared_ptr output; ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); - for (auto const& line : lines) { + for (const auto& line : lines) { ASSERT_OK(output->Write(line.data(), line.size())); } ASSERT_OK(output->Close()); @@ -1288,8 +1320,8 @@ TEST_F(GcsIntegrationTest, OpenInputFileMixedReadVsReadAt) { } // Verify random reads interleave too. 
- auto const index = RandomIndex(kLineCount); - auto const position = index * kLineWidth; + const auto index = RandomIndex(kLineCount); + const auto position = index * kLineWidth; ASSERT_OK_AND_ASSIGN(size, file->ReadAt(position, buffer.size(), buffer.data())); EXPECT_EQ(size, kLineWidth); auto actual = std::string{buffer.begin(), buffer.end()}; @@ -1315,7 +1347,7 @@ TEST_F(GcsIntegrationTest, OpenInputFileRandomSeek) { const auto path = PreexistingBucketPath() + "OpenInputFileRandomSeek/object-name"; std::shared_ptr output; ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); - for (auto const& line : lines) { + for (const auto& line : lines) { ASSERT_OK(output->Write(line.data(), line.size())); } ASSERT_OK(output->Close()); @@ -1325,8 +1357,8 @@ TEST_F(GcsIntegrationTest, OpenInputFileRandomSeek) { for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. - auto const index = RandomIndex(kLineCount); - auto const position = index * kLineWidth; + const auto index = RandomIndex(kLineCount); + const auto position = index * kLineWidth; ASSERT_OK(file->Seek(position)); ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); EXPECT_EQ(lines[index], actual->ToString()); @@ -1363,7 +1395,7 @@ TEST_F(GcsIntegrationTest, OpenInputFileInfo) { auto constexpr kStart = 16; ASSERT_OK_AND_ASSIGN(size, file->ReadAt(kStart, buffer.size(), buffer.data())); - auto const expected = std::string(kLoremIpsum).substr(kStart); + const auto expected = std::string(kLoremIpsum).substr(kStart); EXPECT_EQ(std::string(buffer.data(), size), expected); } diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index d59b2a342d7..adb8b0d50d3 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -363,8 +363,14 @@ Result HdfsOptions::FromUri(const Uri& uri) { options_map.emplace(kv.first, kv.second); } + // Special case host = "default" or "hdfs://default" as 
stated by GH-47560. + // If given the string "default", libhdfs selects the default filesystem + // from `core-site.xml`. std::string host; - host = uri.scheme() + "://" + uri.host(); + if (uri.host() == "default") + host = uri.host(); + else + host = uri.scheme() + "://" + uri.host(); // configure endpoint const auto port = uri.port(); diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index 0b19cc74b14..4060d83a5fa 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -160,6 +160,12 @@ FileInfo StatToFileInfo(const struct stat& s) { # ifdef __APPLE__ // macOS doesn't use the POSIX-compliant spelling info.set_mtime(ToTimePoint(s.st_mtimespec)); +# elif defined(_AIX) && defined(_ALL_SOURCE) + // In AIX with _ALL_SOURCE, stat struct member st_mtim is of type st_timespec_t. + struct timespec times; + times.tv_sec = s.st_mtim.tv_sec; + times.tv_nsec = static_cast(s.st_mtim.tv_nsec); + info.set_mtime(ToTimePoint(times)); # else info.set_mtime(ToTimePoint(s.st_mtim)); # endif diff --git a/cpp/src/arrow/filesystem/meson.build b/cpp/src/arrow/filesystem/meson.build index 964eedd2cfa..99c0905e3c6 100644 --- a/cpp/src/arrow/filesystem/meson.build +++ b/cpp/src/arrow/filesystem/meson.build @@ -34,6 +34,12 @@ install_headers( subdir: 'arrow/filesystem', ) +arrow_filesystem_dep = declare_dependency( + include_directories: include_directories('.'), + dependencies: arrow_dep, +) +meson.override_dependency('arrow-filesystem', arrow_filesystem_dep) + pkg.generate( filebase: 'arrow-filesystem', name: 'Apache Arrow Filesystem', diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index f0e4050f232..15bc3f9b212 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -108,10 +108,7 @@ struct Directory { return p.second; } - void AssignEntry(const std::string& s, std::unique_ptr entry) { - DCHECK(!s.empty()); - entries[s] = std::move(entry); - 
} + void AssignEntry(const std::string& s, std::unique_ptr entry); bool DeleteEntry(const std::string& s) { return entries.erase(s) > 0; } @@ -187,6 +184,11 @@ class Entry : public EntryBase { ARROW_DISALLOW_COPY_AND_ASSIGN(Entry); }; +void Directory::AssignEntry(const std::string& s, std::unique_ptr entry) { + DCHECK(!s.empty()); + entries[s] = std::move(entry); +} + //////////////////////////////////////////////////////////////////////////// // Streams @@ -255,12 +257,12 @@ class MockFSInputStream : public io::BufferReader { } // namespace -std::ostream& operator<<(std::ostream& os, const MockDirInfo& di) { +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const MockDirInfo& di) { return os << "'" << di.full_path << "' [mtime=" << di.mtime.time_since_epoch().count() << "]"; } -std::ostream& operator<<(std::ostream& os, const MockFileInfo& di) { +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const MockFileInfo& di) { return os << "'" << di.full_path << "' [mtime=" << di.mtime.time_since_epoch().count() << ", size=" << di.data.length() << "]"; } diff --git a/cpp/src/arrow/filesystem/s3_internal.h b/cpp/src/arrow/filesystem/s3_internal.h index 772387e5fb6..6f2780ddc05 100644 --- a/cpp/src/arrow/filesystem/s3_internal.h +++ b/cpp/src/arrow/filesystem/s3_internal.h @@ -37,30 +37,9 @@ #include "arrow/status.h" #include "arrow/util/base64.h" #include "arrow/util/logging.h" -#include "arrow/util/print.h" +#include "arrow/util/print_internal.h" #include "arrow/util/string.h" -#ifndef ARROW_AWS_SDK_VERSION_CHECK -// AWS_SDK_VERSION_{MAJOR,MINOR,PATCH} are available since 1.9.7. -# if defined(AWS_SDK_VERSION_MAJOR) && defined(AWS_SDK_VERSION_MINOR) && \ - defined(AWS_SDK_VERSION_PATCH) -// Redundant "(...)" are for suppressing "Weird number of spaces at -// line-start. Are you using a 2-space indent? [whitespace/indent] -// [3]" errors... 
-# define ARROW_AWS_SDK_VERSION_CHECK(major, minor, patch) \ - ((AWS_SDK_VERSION_MAJOR > (major) || \ - (AWS_SDK_VERSION_MAJOR == (major) && AWS_SDK_VERSION_MINOR > (minor)) || \ - ((AWS_SDK_VERSION_MAJOR == (major) && AWS_SDK_VERSION_MINOR == (minor) && \ - AWS_SDK_VERSION_PATCH >= (patch))))) -# else -# define ARROW_AWS_SDK_VERSION_CHECK(major, minor, patch) 0 -# endif -#endif // !ARROW_AWS_SDK_VERSION_CHECK - -#if ARROW_AWS_SDK_VERSION_CHECK(1, 9, 201) -# define ARROW_S3_HAS_SSE_CUSTOMER_KEY -#endif - namespace arrow { namespace fs { namespace internal { @@ -210,8 +189,12 @@ Status ErrorToStatus(const std::string& prefix, const std::string& operation, "'."; } } + + auto request_id = error.GetRequestId(); + auto request_str = request_id.empty() ? "" : (" (Request ID: " + request_id + ")"); + return Status::IOError(prefix, "AWS Error ", ss.str(), " during ", operation, - " operation: ", error.GetMessage(), + " operation: ", error.GetMessage(), request_str, wrong_region_msg.value_or("")); } @@ -350,14 +333,9 @@ inline Result> GetSSECustomerKeyHeaders( if (sse_customer_key.empty()) { return std::nullopt; } -#ifdef ARROW_S3_HAS_SSE_CUSTOMER_KEY ARROW_ASSIGN_OR_RAISE(auto md5, internal::CalculateSSECustomerKeyMD5(sse_customer_key)); return SSECustomerKeyHeaders{arrow::util::base64_encode(sse_customer_key), md5, "AES256"}; -#else - return Status::NotImplemented( - "SSE customer key not supported by this version of the AWS SDK"); -#endif } template @@ -366,16 +344,11 @@ Status SetSSECustomerKey(S3RequestType* request, const std::string& sse_customer if (!maybe_headers.has_value()) { return Status::OK(); } -#ifdef ARROW_S3_HAS_SSE_CUSTOMER_KEY auto headers = std::move(maybe_headers).value(); request->SetSSECustomerKey(headers.sse_customer_key); request->SetSSECustomerKeyMD5(headers.sse_customer_key_md5); request->SetSSECustomerAlgorithm(headers.sse_customer_algorithm); return Status::OK(); -#else - return Status::NotImplemented( - "SSE customer key not supported by this 
version of the AWS SDK"); -#endif } } // namespace internal diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 1c978f8c5ad..04774171503 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -55,8 +55,13 @@ #include #include #include +#include +#include +#include #include #include +#include +#include #include #include #include @@ -78,42 +83,18 @@ #include #include -// AWS_SDK_VERSION_{MAJOR,MINOR,PATCH} are available since 1.9.7. -#if defined(AWS_SDK_VERSION_MAJOR) && defined(AWS_SDK_VERSION_MINOR) && \ - defined(AWS_SDK_VERSION_PATCH) // Redundant "(...)" are for suppressing "Weird number of spaces at // line-start. Are you using a 2-space indent? [whitespace/indent] // [3]" errors... -# define ARROW_AWS_SDK_VERSION_CHECK(major, minor, patch) \ - ((AWS_SDK_VERSION_MAJOR > (major) || \ - (AWS_SDK_VERSION_MAJOR == (major) && AWS_SDK_VERSION_MINOR > (minor)) || \ - ((AWS_SDK_VERSION_MAJOR == (major) && AWS_SDK_VERSION_MINOR == (minor) && \ - AWS_SDK_VERSION_PATCH >= (patch))))) -#else -# define ARROW_AWS_SDK_VERSION_CHECK(major, minor, patch) 0 -#endif - -// This feature is available since 1.9.0 but -// AWS_SDK_VERSION_{MAJOR,MINOR,PATCH} are available since 1.9.7. So -// we can't use this feature for [1.9.0,1.9.6]. If it's a problem, -// please report it to our issue tracker. 
-#if ARROW_AWS_SDK_VERSION_CHECK(1, 9, 0) -# define ARROW_S3_HAS_CRT -#endif - -#if ARROW_AWS_SDK_VERSION_CHECK(1, 10, 0) -# define ARROW_S3_HAS_S3CLIENT_CONFIGURATION -#endif - -#ifdef ARROW_S3_HAS_CRT -# include -# include -# include -#endif - -#ifdef ARROW_S3_HAS_S3CLIENT_CONFIGURATION -# include -# include +#define ARROW_AWS_SDK_VERSION_CHECK(major, minor, patch) \ + ((AWS_SDK_VERSION_MAJOR > (major) || \ + (AWS_SDK_VERSION_MAJOR == (major) && AWS_SDK_VERSION_MINOR > (minor)) || \ + ((AWS_SDK_VERSION_MAJOR == (major) && AWS_SDK_VERSION_MINOR == (minor) && \ + AWS_SDK_VERSION_PATCH >= (patch))))) + +// Keep this in sync with ThirdPartyToolChain.cmake +#if !defined(AWS_SDK_VERSION_MAJOR) || !ARROW_AWS_SDK_VERSION_CHECK(1, 11, 0) +# error "AWS SDK version 1.11.0 or later is required" #endif #include "arrow/util/windows_fixup.h" @@ -401,6 +382,9 @@ Result S3Options::FromUri(const Uri& uri, std::string* out_path) { options.scheme = kv.second; } else if (kv.first == "endpoint_override") { options.endpoint_override = kv.second; + } else if (kv.first == "allow_delayed_open") { + ARROW_ASSIGN_OR_RAISE(options.allow_delayed_open, + ::arrow::internal::ParseBoolean(kv.second)); } else if (kv.first == "allow_bucket_creation") { ARROW_ASSIGN_OR_RAISE(options.allow_bucket_creation, ::arrow::internal::ParseBoolean(kv.second)); @@ -414,6 +398,8 @@ Result S3Options::FromUri(const Uri& uri, std::string* out_path) { } else if (kv.first == "tls_verify_certificates") { ARROW_ASSIGN_OR_RAISE(options.tls_verify_certificates, ::arrow::internal::ParseBoolean(kv.second)); + } else if (kv.first == "smart_defaults") { + options.smart_defaults = kv.second; } else { return Status::Invalid("Unexpected query parameter in S3 URI: '", kv.first, "'"); } @@ -440,7 +426,8 @@ bool S3Options::Equals(const S3Options& other) const { default_metadata_size ? 
(other.default_metadata && other.default_metadata->Equals(*default_metadata)) : (!other.default_metadata || other.default_metadata->size() == 0); - return (region == other.region && connect_timeout == other.connect_timeout && + return (smart_defaults == other.smart_defaults && region == other.region && + connect_timeout == other.connect_timeout && request_timeout == other.request_timeout && endpoint_override == other.endpoint_override && scheme == other.scheme && role_arn == other.role_arn && session_name == other.session_name && @@ -448,6 +435,7 @@ bool S3Options::Equals(const S3Options& other) const { proxy_options.Equals(other.proxy_options) && credentials_kind == other.credentials_kind && background_writes == other.background_writes && + allow_delayed_open == other.allow_delayed_open && allow_bucket_creation == other.allow_bucket_creation && allow_bucket_deletion == other.allow_bucket_deletion && tls_ca_file_path == other.tls_ca_file_path && @@ -788,22 +776,6 @@ class S3Client : public Aws::S3::S3Client { std::shared_ptr s3_retry_strategy_; }; -// In AWS SDK < 1.8, Aws::Client::ClientConfiguration::followRedirects is a bool. -template -void DisableRedirectsImpl(bool* followRedirects) { - *followRedirects = false; -} - -// In AWS SDK >= 1.8, it's a Aws::Client::FollowRedirectsPolicy scoped enum. -template -void DisableRedirectsImpl(PolicyEnum* followRedirects) { - *followRedirects = Never; -} - -void DisableRedirects(Aws::Client::ClientConfiguration* c) { - DisableRedirectsImpl(&c->followRedirects); -} - // ----------------------------------------------------------------------- // S3 client protection against use after finalization // @@ -974,8 +946,6 @@ Result> GetClientHolder( // ----------------------------------------------------------------------- // S3 client factory: build S3Client from S3Options -#ifdef ARROW_S3_HAS_S3CLIENT_CONFIGURATION - // GH-40279: standard initialization of S3Client creates a new `S3EndpointProvider` // every time. 
Its construction takes 1ms, which makes instantiating every S3Client // very costly (see upstream bug report @@ -1100,11 +1070,17 @@ class EndpointProviderCache { std::unordered_map cache_; }; -#endif // ARROW_S3_HAS_S3CLIENT_CONFIGURATION - class ClientBuilder { public: - explicit ClientBuilder(S3Options options) : options_(std::move(options)) {} + // Make sure the default S3ClientConfiguration constructor is never invoked (see below) + ClientBuilder() = delete; + + explicit ClientBuilder(S3Options options) + : options_(std::move(options)), + // The S3ClientConfiguration constructor always does EC2 metadata lookups, + // unless IMDS is disabled (GH-46214). + client_config_(/*useSmartDefaults=*/true, options_.smart_defaults.c_str(), + /*shouldDisableIMDS=*/true) {} const Aws::Client::ClientConfiguration& config() const { return client_config_; } @@ -1184,17 +1160,10 @@ class ClientBuilder { const bool use_virtual_addressing = options_.endpoint_override.empty() || options_.force_virtual_addressing; -#ifdef ARROW_S3_HAS_S3CLIENT_CONFIGURATION client_config_.useVirtualAddressing = use_virtual_addressing; auto endpoint_provider = EndpointProviderCache::Instance()->Lookup(client_config_); auto client = std::make_shared(credentials_provider_, endpoint_provider, client_config_); -#else - auto client = std::make_shared( - credentials_provider_, client_config_, - Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, - use_virtual_addressing); -#endif client->s3_retry_strategy_ = options_.retry_strategy; return GetClientHolder(std::move(client)); } @@ -1203,11 +1172,7 @@ class ClientBuilder { protected: S3Options options_; -#ifdef ARROW_S3_HAS_S3CLIENT_CONFIGURATION Aws::S3::S3ClientConfiguration client_config_; -#else - Aws::Client::ClientConfiguration client_config_; -#endif std::shared_ptr credentials_provider_; }; @@ -1271,7 +1236,8 @@ class RegionResolver { Status Init() { DCHECK(builder_.options().endpoint_override.empty()); // On Windows with AWS SDK >= 1.8, it 
is necessary to disable redirects (ARROW-10085). - DisableRedirects(builder_.mutable_config()); + builder_.mutable_config()->followRedirects = + Aws::Client::FollowRedirectsPolicy::NEVER; return builder_.BuildClient().Value(&holder_); } @@ -2387,8 +2353,6 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisFinalize(); -#ifdef ARROW_S3_HAS_S3CLIENT_CONFIGURATION EndpointProviderCache::Instance()->Reset(); -#endif Aws::ShutdownAPI(aws_options_); } } @@ -3525,7 +3487,6 @@ struct AwsInstance { #undef LOG_LEVEL_CASE -#ifdef ARROW_S3_HAS_CRT aws_options_.ioOptions.clientBootstrap_create_fn = [ev_threads = options.num_event_loop_threads]() { // https://github.com/aws/aws-sdk-cpp/blob/1.11.15/src/aws-cpp-sdk-core/source/Aws.cpp#L65 @@ -3537,18 +3498,14 @@ struct AwsInstance { client_bootstrap->EnableBlockingShutdown(); return client_bootstrap; }; -#endif aws_options_.loggingOptions.logLevel = aws_log_level; // By default the AWS SDK logs to files, log to console instead aws_options_.loggingOptions.logger_create_fn = [this] { return std::make_shared( aws_options_.loggingOptions.logLevel); }; -#if ARROW_AWS_SDK_VERSION_CHECK(1, 9, 272) // ARROW-18290: escape all special chars for compatibility with non-AWS S3 backends. - // This configuration options is only available with AWS SDK 1.9.272 and later. aws_options_.httpOptions.compliantRfc3986Encoding = true; -#endif aws_options_.httpOptions.installSigPipeHandler = options.install_sigpipe_handler; Aws::InitAPI(aws_options_); } diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 05451b2312d..158d70a93fc 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -96,6 +96,12 @@ class ARROW_EXPORT S3RetryStrategy { /// Options for the S3FileSystem implementation. 
struct ARROW_EXPORT S3Options { + /// \brief Smart defaults for option values + /// + /// The possible values for this setting are explained in the AWS docs: + /// https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html + std::string smart_defaults = "standard"; + /// \brief AWS region to connect to. /// /// If unset, the AWS SDK will choose a default value. The exact algorithm diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 370f3b26852..f0a5d0e2e49 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -307,6 +307,7 @@ TEST_F(S3OptionsTest, FromUri) { ASSERT_EQ(options.region, ""); ASSERT_EQ(options.scheme, "https"); ASSERT_EQ(options.endpoint_override, ""); + ASSERT_EQ(options.smart_defaults, "standard"); ASSERT_EQ(path, ""); ASSERT_OK_AND_ASSIGN(options, S3Options::FromUri("s3:", &path)); @@ -330,6 +331,12 @@ TEST_F(S3OptionsTest, FromUri) { ASSERT_EQ(options.endpoint_override, ""); ASSERT_EQ(path, "mybucket/foo/bar"); + ASSERT_OK_AND_ASSIGN( + options, S3Options::FromUri( + "s3://?allow_bucket_creation=true&smart_defaults=legacy", &path)); + ASSERT_TRUE(options.allow_bucket_creation); + ASSERT_EQ(options.smart_defaults, "legacy"); + // Region resolution with a well-known bucket ASSERT_OK_AND_ASSIGN( options, S3Options::FromUri("s3://aws-earth-mo-atmospheric-ukv-prd/", &path)); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index efe7cff4958..da73a8ec16b 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -97,6 +97,16 @@ void AssertRaisesWithErrno(int expected_errno, const Result& result) { AssertRaisesWithErrno(expected_errno, result.status()); } +void GetSortedInfos(FileSystem* fs, FileSelector s, std::vector& infos) { + ASSERT_OK_AND_ASSIGN(infos, fs->GetFileInfo(s)); + // Clear mtime & size for easier testing. 
+ for_each(infos.begin(), infos.end(), [](FileInfo& info) { + info.set_mtime(kNoTime); + info.set_size(kNoSize); + }); + SortInfos(&infos); +} + }; // namespace void AssertFileContents(FileSystem* fs, const std::string& path, @@ -862,16 +872,6 @@ void GenericFileSystemTest::TestGetFileInfoGenerator(FileSystem* fs) { ASSERT_EQ(infos.size(), 0); } -void GetSortedInfos(FileSystem* fs, FileSelector s, std::vector& infos) { - ASSERT_OK_AND_ASSIGN(infos, fs->GetFileInfo(s)); - // Clear mtime & size for easier testing. - for_each(infos.begin(), infos.end(), [](FileInfo& info) { - info.set_mtime(kNoTime); - info.set_size(kNoSize); - }); - SortInfos(&infos); -} - void GenericFileSystemTest::TestGetFileInfoSelectorWithRecursion(FileSystem* fs) { ASSERT_OK(fs->CreateDir("01/02/03/04")); ASSERT_OK(fs->CreateDir("AA")); diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index 3a643b7e9f0..3217cc8ca36 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -25,7 +25,7 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/filesystem/mockfs.h" #include "arrow/testing/visibility.h" -#include "arrow/util/counting_semaphore.h" +#include "arrow/util/counting_semaphore_internal.h" namespace arrow { namespace fs { diff --git a/cpp/src/arrow/flight/ArrowFlightConfig.cmake.in b/cpp/src/arrow/flight/ArrowFlightConfig.cmake.in index 92803289f1e..61f99040253 100644 --- a/cpp/src/arrow/flight/ArrowFlightConfig.cmake.in +++ b/cpp/src/arrow/flight/ArrowFlightConfig.cmake.in @@ -29,7 +29,7 @@ set(ARROW_FLIGHT_SYSTEM_DEPENDENCIES "@ARROW_FLIGHT_SYSTEM_DEPENDENCIES@") include(CMakeFindDependencyMacro) -find_dependency(Arrow) +find_dependency(Arrow CONFIG) if(ARROW_BUILD_STATIC) arrow_find_dependencies("${ARROW_FLIGHT_SYSTEM_DEPENDENCIES}") diff --git a/cpp/src/arrow/flight/ArrowFlightTestingConfig.cmake.in b/cpp/src/arrow/flight/ArrowFlightTestingConfig.cmake.in index 3c043b05a6b..c77d9d168e6 100644 --- 
a/cpp/src/arrow/flight/ArrowFlightTestingConfig.cmake.in +++ b/cpp/src/arrow/flight/ArrowFlightTestingConfig.cmake.in @@ -27,8 +27,8 @@ @PACKAGE_INIT@ include(CMakeFindDependencyMacro) -find_dependency(ArrowFlight) -find_dependency(ArrowTesting) +find_dependency(ArrowFlight CONFIG) +find_dependency(ArrowTesting CONFIG) include("${CMAKE_CURRENT_LIST_DIR}/ArrowFlightTestingTargets.cmake") diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index 564d300ffa6..359f6d37277 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -113,7 +113,23 @@ add_custom_command(OUTPUT ${FLIGHT_GENERATED_PROTO_FILES} "--plugin=protoc-gen-grpc=$" "${FLIGHT_PROTO}") -set_source_files_properties(${FLIGHT_GENERATED_PROTO_FILES} PROPERTIES GENERATED TRUE) +# Set common properties for C++ sources files generated by protoc +function(arrow_set_generated_proto_files_properties) + set(GENERATED_FILES ${ARGN}) + set_source_files_properties(${GENERATED_FILES} PROPERTIES GENERATED TRUE) + if(MSVC) + # Suppress missing dll-interface warning + set_source_files_properties(${GENERATED_FILES} + PROPERTIES COMPILE_OPTIONS "/wd4251" + SKIP_UNITY_BUILD_INCLUSION TRUE) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Disable -Wmissing-declarations + set_source_files_properties(${GENERATED_FILES} PROPERTIES COMPILE_OPTIONS + "-Wno-missing-declarations") + endif() +endfunction() + +arrow_set_generated_proto_files_properties(${FLIGHT_GENERATED_PROTO_FILES}) add_custom_target(flight_grpc_gen ALL DEPENDS ${FLIGHT_GENERATED_PROTO_FILES}) @@ -173,20 +189,21 @@ set(ARROW_FLIGHT_SRCS transport/grpc/util_internal.cc types.cc) -if(ARROW_WITH_OPENTELEMETRY) - list(APPEND ARROW_FLIGHT_SRCS otel_logging.cc) +# Handle Unity build header conflicts on Windows. 
+if(CMAKE_UNITY_BUILD AND WIN32) + set_source_files_properties(client.cc + cookie_internal.cc + serialization_internal.cc + server.cc + transport/grpc/serialization_internal.cc + transport/grpc/protocol_grpc_internal.cc + transport/grpc/util_internal.cc + types.cc + PROPERTIES SKIP_UNITY_BUILD_INCLUSION TRUE) endif() -if(MSVC) - # Protobuf generated files trigger spurious warnings on MSVC. - foreach(GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/Flight.pb.cc" - "${CMAKE_CURRENT_BINARY_DIR}/Flight.pb.h") - # Suppress missing dll-interface warning - set_source_files_properties("${GENERATED_SOURCE}" - PROPERTIES COMPILE_OPTIONS "/wd4251" - GENERATED TRUE - SKIP_UNITY_BUILD_INCLUSION TRUE) - endforeach() +if(ARROW_WITH_OPENTELEMETRY) + list(APPEND ARROW_FLIGHT_SRCS otel_logging.cc) endif() add_arrow_lib(arrow_flight @@ -198,8 +215,6 @@ add_arrow_lib(arrow_flight ARROW_FLIGHT_LIBRARIES SOURCES ${ARROW_FLIGHT_SRCS} - PRECOMPILED_HEADERS - "$<$:arrow/flight/pch.h>" DEPENDENCIES flight_grpc_gen SHARED_LINK_FLAGS @@ -269,7 +284,9 @@ if(ARROW_TESTING) ArrowTesting::arrow_testing_static) endif() list(APPEND ARROW_FLIGHT_TESTING_SHARED_LINK_LIBS ${ARROW_FLIGHT_TEST_INTERFACE_LIBS}) + list(APPEND ARROW_FLIGHT_TESTING_SHARED_LINK_LIBS ${ARROW_GTEST_GMOCK}) list(APPEND ARROW_FLIGHT_TESTING_STATIC_LINK_LIBS ${ARROW_FLIGHT_TEST_INTERFACE_LIBS}) + list(APPEND ARROW_FLIGHT_TESTING_STATIC_LINK_LIBS ${ARROW_GTEST_GMOCK}) add_arrow_lib(arrow_flight_testing CMAKE_PACKAGE_NAME ArrowFlightTesting diff --git a/cpp/src/arrow/flight/client.cc b/cpp/src/arrow/flight/client.cc index 2003d778052..e33b6c3fd07 100644 --- a/cpp/src/arrow/flight/client.cc +++ b/cpp/src/arrow/flight/client.cc @@ -259,6 +259,14 @@ class ClientStreamReader : public FlightStreamReader { } return batches; } + + arrow::ipc::ReadStats stats() const override { + if (batch_reader_ == nullptr) { + return ipc::ReadStats{}; + } + return batch_reader_->stats(); + } + arrow::Result> ToTable() override { return 
ToTable(stop_token_); } @@ -278,7 +286,7 @@ class ClientStreamReader : public FlightStreamReader { StopToken stop_token_; std::shared_ptr memory_manager_; std::shared_ptr peekable_reader_; - std::shared_ptr batch_reader_; + std::shared_ptr batch_reader_; std::shared_ptr app_metadata_; }; diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index ae6011b117a..3ad9f26275b 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -141,6 +141,10 @@ class ARROW_FLIGHT_EXPORT FlightStreamReader : public MetadataRecordBatchReader using MetadataRecordBatchReader::ToTable; /// \brief Consume entire stream as a Table arrow::Result> ToTable(const StopToken& stop_token); + + using MetadataRecordBatchReader::stats; + /// \brief Return current read statistics + virtual arrow::ipc::ReadStats stats() const = 0; }; // Silence warning diff --git a/cpp/src/arrow/flight/flight_benchmark.cc b/cpp/src/arrow/flight/flight_benchmark.cc index 49e54d98f66..aa6e16820ef 100644 --- a/cpp/src/arrow/flight/flight_benchmark.cc +++ b/cpp/src/arrow/flight/flight_benchmark.cc @@ -31,8 +31,8 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/compression.h" #include "arrow/util/config.h" -#include "arrow/util/stopwatch.h" -#include "arrow/util/tdigest.h" +#include "arrow/util/stopwatch_internal.h" +#include "arrow/util/tdigest_internal.h" #include "arrow/util/thread_pool.h" #include "arrow/flight/api.h" diff --git a/cpp/src/arrow/flight/flight_internals_test.cc b/cpp/src/arrow/flight/flight_internals_test.cc index ab2f8c78307..bb14ddd6655 100644 --- a/cpp/src/arrow/flight/flight_internals_test.cc +++ b/cpp/src/arrow/flight/flight_internals_test.cc @@ -238,6 +238,7 @@ TEST(FlightTypes, FlightInfo) { MakeFlightInfo(schema1, desc1, {endpoint1}, -1, 42, true, ""), MakeFlightInfo(schema1, desc2, {endpoint1, endpoint2}, 64, -1, false, "\xDE\xAD\xC0\xDE"), + MakeFlightInfo(desc1, {}, -1, -1, false, ""), }; std::vector reprs = { " " @@ -257,6 +258,8 @@ 
TEST(FlightTypes, FlightInfo) { "locations=[grpc+tcp://localhost:1234] expiration_time=null " "app_metadata='CAFED00D'>] " "total_records=64 total_bytes=-1 ordered=false app_metadata='DEADC0DE'>", + " " + "endpoints=[] total_records=-1 total_bytes=-1 ordered=false app_metadata=''>", }; ASSERT_NO_FATAL_FAILURE(TestRoundtrip(values, reprs)); diff --git a/cpp/src/arrow/flight/flight_test.cc b/cpp/src/arrow/flight/flight_test.cc index 863f21f8db5..16a4909828b 100644 --- a/cpp/src/arrow/flight/flight_test.cc +++ b/cpp/src/arrow/flight/flight_test.cc @@ -70,7 +70,9 @@ // > other API headers. This approach efficiently avoids the conflict // > between the two different versions of Abseil. #include "arrow/util/tracing_internal.h" -#ifdef ARROW_WITH_OPENTELEMETRY +// When running with OTel, ASAN reports false-positives that can't be easily suppressed. +// Disable OTel for ASAN. See GH-46509. +#if defined(ARROW_WITH_OPENTELEMETRY) && !defined(ADDRESS_SANITIZER) # include # include # include @@ -95,7 +97,9 @@ const char kAuthHeader[] = "authorization"; class OtelEnvironment : public ::testing::Environment { public: void SetUp() override { -#ifdef ARROW_WITH_OPENTELEMETRY +// When running with OTel, ASAN reports false-positives that can't be easily suppressed. +// Disable OTel for ASAN. See GH-46509. +#if defined(ARROW_WITH_OPENTELEMETRY) && !defined(ADDRESS_SANITIZER) // The default tracer always generates no-op spans which have no // span/trace ID. Set up a different tracer. Note, this needs to be run // before Arrow uses OTel as GetTracer() gets a tracer once and keeps it @@ -1682,7 +1686,9 @@ class TracingTestServer : public FlightServerBase { auto* middleware = reinterpret_cast(call_context.GetMiddleware("tracing")); if (!middleware) return Status::Invalid("Could not find middleware"); -#ifdef ARROW_WITH_OPENTELEMETRY +// When running with OTel, ASAN reports false-positives that can't be easily suppressed. +// Disable OTel for ASAN. See GH-46509. 
+#if defined(ARROW_WITH_OPENTELEMETRY) && !defined(ADDRESS_SANITIZER) // Ensure the trace context is present (but the value is random so // we cannot assert any particular value) EXPECT_FALSE(middleware->GetTraceContext().empty()); @@ -1731,7 +1737,9 @@ class TestTracing : public ::testing::Test { std::unique_ptr server_; }; -#ifdef ARROW_WITH_OPENTELEMETRY +// When running with OTel, ASAN reports false-positives that can't be easily suppressed. +// Disable OTel for ASAN. See GH-46509. +#if defined(ARROW_WITH_OPENTELEMETRY) && !defined(ADDRESS_SANITIZER) // Must define it ourselves to avoid a linker error constexpr size_t kSpanIdSize = opentelemetry::trace::SpanId::kSize; constexpr size_t kTraceIdSize = opentelemetry::trace::TraceId::kSize; diff --git a/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc b/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc index 15318a8d7a4..83c4a30902f 100644 --- a/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc +++ b/cpp/src/arrow/flight/integration_tests/flight_integration_test.cc @@ -53,6 +53,8 @@ TEST(FlightIntegration, AuthBasicProto) { ASSERT_OK(RunScenario("auth:basic_prot TEST(FlightIntegration, Middleware) { ASSERT_OK(RunScenario("middleware")); } +TEST(FlightIntegration, Alignment) { ASSERT_OK(RunScenario("alignment")); } + TEST(FlightIntegration, Ordered) { ASSERT_OK(RunScenario("ordered")); } TEST(FlightIntegration, ExpirationTimeDoGet) { diff --git a/cpp/src/arrow/flight/integration_tests/test_integration.cc b/cpp/src/arrow/flight/integration_tests/test_integration.cc index f38076822c7..d9209f3e6ac 100644 --- a/cpp/src/arrow/flight/integration_tests/test_integration.cc +++ b/cpp/src/arrow/flight/integration_tests/test_integration.cc @@ -45,6 +45,7 @@ #include "arrow/table.h" #include "arrow/table_builder.h" #include "arrow/testing/gtest_util.h" +#include "arrow/util/align_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/string.h" #include 
"arrow/util/value_parsing.h" @@ -281,6 +282,137 @@ class MiddlewareScenario : public Scenario { std::shared_ptr client_middleware_; }; +/// \brief The server used for testing FlightClient data alignment. +/// +/// The server always returns the same data of various byte widths. +/// The client should return data that is aligned according to the data type +/// if FlightCallOptions.read_options.ensure_memory_alignment is true. +/// +/// This scenario is passed only when the client returns aligned data. +class AlignmentServer : public FlightServerBase { + Status GetFlightInfo(const ServerCallContext& context, + const FlightDescriptor& descriptor, + std::unique_ptr* result) override { + auto schema = BuildSchema(); + std::vector endpoints{ + FlightEndpoint{{"align-data"}, {}, std::nullopt, ""}}; + ARROW_ASSIGN_OR_RAISE( + auto info, FlightInfo::Make(*schema, descriptor, endpoints, -1, -1, false)); + *result = std::make_unique(info); + return Status::OK(); + } + + Status DoGet(const ServerCallContext& context, const Ticket& request, + std::unique_ptr* stream) override { + if (request.ticket != "align-data") { + return Status::KeyError("Could not find flight: ", request.ticket); + } + auto record_batch = RecordBatchFromJSON(BuildSchema(), R"([ + [1, 1, false], + [2, 2, true], + [3, 3, false] + ])"); + std::vector> record_batches{record_batch}; + ARROW_ASSIGN_OR_RAISE(auto record_batch_reader, + RecordBatchReader::Make(record_batches)); + *stream = std::make_unique(record_batch_reader); + return Status::OK(); + } + + private: + std::shared_ptr BuildSchema() { + return arrow::schema({ + arrow::field("int32", arrow::int32(), false), + arrow::field("int64", arrow::int64(), false), + arrow::field("bool", arrow::boolean(), false), + }); + } +}; + +/// \brief The alignment scenario. +/// +/// This tests that the client provides aligned data if requested. 
+class AlignmentScenario : public Scenario { + Status MakeServer(std::unique_ptr* server, + FlightServerOptions* options) override { + server->reset(new AlignmentServer()); + return Status::OK(); + } + + Status MakeClient(FlightClientOptions* options) override { return Status::OK(); } + + arrow::Result> GetTable(FlightClient* client, + const FlightCallOptions& call_options) { + ARROW_ASSIGN_OR_RAISE(auto info, + client->GetFlightInfo(FlightDescriptor::Command("alignment"))); + std::vector> tables; + for (const auto& endpoint : info->endpoints()) { + if (!endpoint.locations.empty()) { + std::stringstream ss; + ss << "["; + for (const auto& location : endpoint.locations) { + if (ss.str().size() != 1) { + ss << ", "; + } + ss << location.ToString(); + } + ss << "]"; + return Status::Invalid( + "Expected to receive empty locations to use the original service: ", + ss.str()); + } + ARROW_ASSIGN_OR_RAISE(auto reader, client->DoGet(call_options, endpoint.ticket)); + ARROW_ASSIGN_OR_RAISE(auto table, reader->ToTable()); + tables.push_back(table); + } + return ConcatenateTables(tables); + } + + Status RunClient(std::unique_ptr client) override { + for (ipc::Alignment ensure_alignment : + {ipc::Alignment::kAnyAlignment, ipc::Alignment::kDataTypeSpecificAlignment, + ipc::Alignment::k64ByteAlignment}) { + auto call_options = FlightCallOptions(); + call_options.read_options.ensure_alignment = ensure_alignment; + ARROW_ASSIGN_OR_RAISE(auto table, GetTable(client.get(), call_options)); + + // Check read data + auto expected_row_count = 3; + if (table->num_rows() != expected_row_count) { + return Status::Invalid("Read table size isn't expected\n", "Expected rows:\n", + expected_row_count, "Actual rows:\n", table->num_rows()); + } + auto expected_column_count = 3; + if (table->num_columns() != expected_column_count) { + return Status::Invalid("Read table size isn't expected\n", "Expected columns:\n", + expected_column_count, "Actual columns:\n", + table->num_columns()); + } + // 
Check data alignment + std::vector needs_alignment; + if (ensure_alignment == ipc::Alignment::kAnyAlignment) { + // this is not a requirement but merely an observation: + // with ensure_alignment=false, flight client returns mis-aligned data + // if this is not the case any more, feel free to remove this assertion + if (util::CheckAlignment(*table, arrow::util::kValueAlignment, + &needs_alignment)) { + return Status::Invalid( + "Read table has aligned data, which is good, but unprecedented"); + } + } else { + // with ensure_alignment != kValueAlignment, we require data to be aligned + // the value of the Alignment enum provides us with the byte alignment value + if (!util::CheckAlignment(*table, static_cast(ensure_alignment), + &needs_alignment)) { + return Status::Invalid("Read table has unaligned data"); + } + } + } + + return Status::OK(); + } +}; + /// \brief The server used for testing FlightInfo.ordered. /// /// If the given command is "ordered", the server sets @@ -316,25 +448,16 @@ class OrderedServer : public FlightServerBase { Status DoGet(const ServerCallContext& context, const Ticket& request, std::unique_ptr* stream) override { - ARROW_ASSIGN_OR_RAISE(auto builder, RecordBatchBuilder::Make( - BuildSchema(), arrow::default_memory_pool())); - auto number_builder = builder->GetFieldAs(0); + std::shared_ptr record_batch; if (request.ticket == "1") { - ARROW_RETURN_NOT_OK(number_builder->Append(1)); - ARROW_RETURN_NOT_OK(number_builder->Append(2)); - ARROW_RETURN_NOT_OK(number_builder->Append(3)); + record_batch = RecordBatchFromJSON(BuildSchema(), "[[1], [2], [3]]"); } else if (request.ticket == "2") { - ARROW_RETURN_NOT_OK(number_builder->Append(10)); - ARROW_RETURN_NOT_OK(number_builder->Append(20)); - ARROW_RETURN_NOT_OK(number_builder->Append(30)); + record_batch = RecordBatchFromJSON(BuildSchema(), "[[10], [20], [30]]"); } else if (request.ticket == "3") { - ARROW_RETURN_NOT_OK(number_builder->Append(100)); - 
ARROW_RETURN_NOT_OK(number_builder->Append(200)); - ARROW_RETURN_NOT_OK(number_builder->Append(300)); + record_batch = RecordBatchFromJSON(BuildSchema(), "[[100], [200], [300]]"); } else { return Status::KeyError("Could not find flight: ", request.ticket); } - ARROW_ASSIGN_OR_RAISE(auto record_batch, builder->Flush()); std::vector> record_batches{record_batch}; ARROW_ASSIGN_OR_RAISE(auto record_batch_reader, RecordBatchReader::Make(record_batches)); @@ -390,19 +513,9 @@ class OrderedScenario : public Scenario { // Build expected table auto schema = arrow::schema({arrow::field("number", arrow::int32(), false)}); - ARROW_ASSIGN_OR_RAISE(auto builder, - RecordBatchBuilder::Make(schema, arrow::default_memory_pool())); - auto number_builder = builder->GetFieldAs(0); - ARROW_RETURN_NOT_OK(number_builder->Append(1)); - ARROW_RETURN_NOT_OK(number_builder->Append(2)); - ARROW_RETURN_NOT_OK(number_builder->Append(3)); - ARROW_RETURN_NOT_OK(number_builder->Append(10)); - ARROW_RETURN_NOT_OK(number_builder->Append(20)); - ARROW_RETURN_NOT_OK(number_builder->Append(30)); - ARROW_RETURN_NOT_OK(number_builder->Append(100)); - ARROW_RETURN_NOT_OK(number_builder->Append(200)); - ARROW_RETURN_NOT_OK(number_builder->Append(300)); - ARROW_ASSIGN_OR_RAISE(auto expected_record_batch, builder->Flush()); + auto expected_record_batch = RecordBatchFromJSON(schema, R"([ + [1], [2], [3], [10], [20], [30], [100], [200], [300] + ])"); std::vector> expected_record_batches{ expected_record_batch}; ARROW_ASSIGN_OR_RAISE(auto expected_table, @@ -490,11 +603,8 @@ class ExpirationTimeServer : public FlightServerBase { } } status.num_gets++; - ARROW_ASSIGN_OR_RAISE(auto builder, RecordBatchBuilder::Make( - BuildSchema(), arrow::default_memory_pool())); - auto number_builder = builder->GetFieldAs(0); - ARROW_RETURN_NOT_OK(number_builder->Append(index)); - ARROW_ASSIGN_OR_RAISE(auto record_batch, builder->Flush()); + auto record_batch = + RecordBatchFromJSON(BuildSchema(), "[[" + std::to_string(index) + 
"]]"); std::vector> record_batches{record_batch}; ARROW_ASSIGN_OR_RAISE(auto record_batch_reader, RecordBatchReader::Make(record_batches)); @@ -621,13 +731,7 @@ class ExpirationTimeDoGetScenario : public Scenario { // Build expected table auto schema = arrow::schema({arrow::field("number", arrow::uint32(), false)}); - ARROW_ASSIGN_OR_RAISE(auto builder, - RecordBatchBuilder::Make(schema, arrow::default_memory_pool())); - auto number_builder = builder->GetFieldAs(0); - ARROW_RETURN_NOT_OK(number_builder->Append(0)); - ARROW_RETURN_NOT_OK(number_builder->Append(1)); - ARROW_RETURN_NOT_OK(number_builder->Append(2)); - ARROW_ASSIGN_OR_RAISE(auto expected_record_batch, builder->Flush()); + auto expected_record_batch = RecordBatchFromJSON(schema, "[[0], [1], [2]]"); std::vector> expected_record_batches{ expected_record_batch}; ARROW_ASSIGN_OR_RAISE(auto expected_table, @@ -1167,6 +1271,7 @@ const std::shared_ptr& GetQuerySchema() { .IsSearchable(true) .CatalogName("catalog_test") .Precision(100) + .Remarks("test column") .Build() .metadata_map())}); return kSchema; @@ -1187,6 +1292,7 @@ std::shared_ptr GetQueryWithTransactionSchema() { .IsSearchable(true) .CatalogName("catalog_test") .Precision(100) + .Remarks("test column") .Build() .metadata_map())}); return kSchema; @@ -2382,6 +2488,9 @@ Status GetScenario(const std::string& scenario_name, std::shared_ptr* } else if (scenario_name == "middleware") { *out = std::make_shared(); return Status::OK(); + } else if (scenario_name == "alignment") { + *out = std::make_shared(); + return Status::OK(); } else if (scenario_name == "ordered") { *out = std::make_shared(); return Status::OK(); diff --git a/cpp/src/arrow/flight/meson.build b/cpp/src/arrow/flight/meson.build new file mode 100644 index 00000000000..9ffe3413dfc --- /dev/null +++ b/cpp/src/arrow/flight/meson.build @@ -0,0 +1,226 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers( + [ + 'api.h', + 'client_auth.h', + 'client_cookie_middleware.h', + 'client.h', + 'client_middleware.h', + 'client_tracing_middleware.h', + 'middleware.h', + 'otel_logging.h', + 'platform.h', + 'server_auth.h', + 'server.h', + 'server_middleware.h', + 'server_tracing_middleware.h', + 'test_auth_handlers.h', + 'test_definitions.h', + 'test_flight_server.h', + 'test_util.h', + 'transport.h', + 'transport_server.h', + 'type_fwd.h', + 'types_async.h', + 'types.h', + 'visibility.h', + ], + subdir: 'arrow/flight', +) + +grpc_dep = dependency('grpc++') +protobuf_dep = dependency('protobuf') +abseil_sync_dep = dependency('absl_synchronization') + +fs = import('fs') +protoc = find_program('protoc') + +flight_proto_path = fs.parent(meson.project_source_root()) / 'format' + +# To ensure messages from proto files are created correctly, we need to +# pass in dllexport_decl=<...> . Unfortunately, it doesn't appear that we +# can just pass in dllexport_decl=ARROW_FLIGHT_EXPORT, as the visibility +# macro won't be easily available to the generated proto file. 
See also +# https://github.com/protocolbuffers/protobuf/issues/19422 +if cpp_compiler.get_id() == 'msvc' + if get_option('default_library') != 'static' + proto_visibility = 'dllexport_decl=__declspec(dllexport):' + else + proto_visibility = '' + endif +else + proto_visibility = 'dllexport_decl=__attribute__((visibility("default"))):' +endif + +flight_proto_files = custom_target( + 'arrow-flight-proto-files', + input: [flight_proto_path / 'Flight.proto'], + output: ['Flight.pb.cc', 'Flight.pb.h'], + command: [ + protoc, + '--proto_path=' + flight_proto_path, + '--cpp_out=@0@@1@'.format(proto_visibility, meson.current_build_dir()), + '@INPUT@', + ], +) + +grpc_cpp_plugin = find_program('grpc_cpp_plugin') +flight_proto_grpc_files = custom_target( + 'arrow-flight-proto-grpc-files', + input: [flight_proto_path / 'Flight.proto'], + output: ['Flight.grpc.pb.cc', 'Flight.grpc.pb.h'], + command: [ + protoc, + '--proto_path=' + flight_proto_path, + '--grpc_out=' + meson.current_build_dir(), + '--plugin=protoc-gen-grpc=' + grpc_cpp_plugin.full_path(), + '@INPUT@', + ], +) + +arrow_flight_srcs = [ + 'client.cc', + 'client_cookie_middleware.cc', + 'client_tracing_middleware.cc', + 'cookie_internal.cc', + 'middleware.cc', + 'serialization_internal.cc', + 'server.cc', + 'server_auth.cc', + 'server_tracing_middleware.cc', + 'transport.cc', + 'transport_server.cc', + 'transport/grpc/grpc_client.cc', + 'transport/grpc/grpc_server.cc', + 'transport/grpc/serialization_internal.cc', + 'transport/grpc/protocol_grpc_internal.cc', + 'transport/grpc/util_internal.cc', + 'types.cc', +] + +thread_dep = dependency('threads') + +arrow_flight = library( + 'arrow-flight', + # We intentionally index flight_proto_grpc_files[1] so as to avoid + # adding 'Flight.grpc.pb.cc' to the sources. 
This is required + # because protocol_grpc_internal.cc includes the source file + # directly; using as a source here will cause a ODR violation + sources: arrow_flight_srcs + [ + flight_proto_files, + flight_proto_grpc_files[1], + ], + dependencies: [ + arrow_dep, + grpc_dep, + protobuf_dep, + abseil_sync_dep, + thread_dep, + ], + cpp_shared_args: ['-DARROW_FLIGHT_EXPORTING'], + cpp_static_args: ['-DARROW_FLIGHT_STATIC'], + gnu_symbol_visibility: 'inlineshidden', +) + +arrow_flight_dep = declare_dependency( + link_with: arrow_flight, + dependencies: [grpc_dep, protobuf_dep, abseil_sync_dep], +) + +if needs_testing + arrow_flight_testing_lib = library( + 'arrow-flight-testing', + sources: [ + 'test_auth_handlers.cc', + 'test_definitions.cc', + 'test_flight_server.cc', + 'test_util.cc', + ], + dependencies: [arrow_test_dep, arrow_flight_dep, thread_dep], + cpp_shared_args: ['-DARROW_FLIGHT_EXPORTING'], + cpp_static_args: ['-DARROW_FLIGHT_STATIC'], + gnu_symbol_visibility: 'inlineshidden', + ) + + arrow_flight_test_dep = declare_dependency( + link_with: arrow_flight_testing_lib, + dependencies: [arrow_flight_dep], + ) +else + arrow_flight_test_dep = disabler() +endif + +flight_tests = ['flight_internals_test', 'flight_test'] +foreach flight_test : flight_tests + test_name = '@0@'.format(flight_test.replace('_', '-')) + exc = executable( + test_name, + sources: [ + '@0@.cc'.format(flight_test), + # flight_internals_test.cc transitively includes Flight.grpc.pb.h + # so we must declare that here to avoid a race condition + flight_proto_grpc_files[1], + ], + dependencies: [arrow_test_dep, arrow_flight_test_dep], + ) + test(test_name, exc) +endforeach + +flight_test_dep_no_main = [ + arrow_dep, + arrow_flight_test_dep, + gtest_dep, + gmock_dep, + gflags_dep, +] + +if needs_tests or needs_benchmarks + executable( + 'flight-test-server', + sources: ['test_server.cc'], + dependencies: flight_test_dep_no_main, + ) +endif + +if needs_benchmarks + server_proto_path = 
meson.project_source_root() / 'src' / 'arrow' / 'flight' + flight_proto_files = custom_target( + 'arrow-flight-benchmark-perf-proto-files', + input: [server_proto_path / 'perf.proto'], + output: ['perf.pb.cc', 'perf.pb.h'], + command: [ + protoc, + '--proto_path=' + meson.current_source_dir(), + '--cpp_out=' + meson.current_build_dir(), + '@INPUT@', + ], + ) + + executable( + 'arrow-flight-perf-server', + sources: ['perf_server.cc'] + flight_proto_files, + dependencies: [flight_test_dep_no_main, arrow_testing_dep], + ) + + executable( + 'arrow-flight-benchmark', + sources: ['flight_benchmark.cc'] + flight_proto_files, + dependencies: [flight_test_dep_no_main, arrow_testing_dep], + ) +endif diff --git a/cpp/src/arrow/flight/pch.h b/cpp/src/arrow/flight/pch.h deleted file mode 100644 index fff107fa8fc..00000000000 --- a/cpp/src/arrow/flight/pch.h +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Often-used headers, for precompiling. -// If updating this header, please make sure you check compilation speed -// before checking in. Adding headers which are not used extremely often -// may incur a slowdown, since it makes the precompiled header heavier to load. 
- -#include "arrow/flight/client.h" -#include "arrow/flight/server.h" -#include "arrow/flight/types.h" -#include "arrow/pch.h" diff --git a/cpp/src/arrow/flight/serialization_internal.cc b/cpp/src/arrow/flight/serialization_internal.cc index a64ab713ddd..34fcef1f837 100644 --- a/cpp/src/arrow/flight/serialization_internal.cc +++ b/cpp/src/arrow/flight/serialization_internal.cc @@ -457,6 +457,8 @@ Status ToPayload(const FlightDescriptor& descr, std::shared_ptr* out) { return Status::OK(); } +namespace { + // SessionOptionValue Status FromProto(const pb::SessionOptionValue& pb_val, SessionOptionValue* val) { @@ -524,6 +526,8 @@ Status ToProto(const std::map& map, return Status::OK(); } +} // namespace + // SetSessionOptionsRequest Status FromProto(const pb::SetSessionOptionsRequest& pb_request, diff --git a/cpp/src/arrow/flight/serialization_internal.h b/cpp/src/arrow/flight/serialization_internal.h index 827bde1674f..4d07efad815 100644 --- a/cpp/src/arrow/flight/serialization_internal.h +++ b/cpp/src/arrow/flight/serialization_internal.h @@ -99,69 +99,86 @@ Status UnpackProtoAction(const Action& action, google::protobuf::Message* out); // These functions depend on protobuf types which are not exported in the Flight DLL. 
-Status FromProto(const google::protobuf::Timestamp& pb_timestamp, Timestamp* timestamp); -Status FromProto(const pb::ActionType& pb_type, ActionType* type); -Status FromProto(const pb::Action& pb_action, Action* action); -Status FromProto(const pb::Result& pb_result, Result* result); -Status FromProto(const pb::CancelFlightInfoResult& pb_result, - CancelFlightInfoResult* result); -Status FromProto(const pb::Criteria& pb_criteria, Criteria* criteria); -Status FromProto(const pb::Location& pb_location, Location* location); -Status FromProto(const pb::Ticket& pb_ticket, Ticket* ticket); -Status FromProto(const pb::FlightData& pb_data, FlightDescriptor* descriptor, - std::unique_ptr* message); -Status FromProto(const pb::FlightDescriptor& pb_descr, FlightDescriptor* descr); -Status FromProto(const pb::FlightEndpoint& pb_endpoint, FlightEndpoint* endpoint); -Status FromProto(const pb::RenewFlightEndpointRequest& pb_request, - RenewFlightEndpointRequest* request); -Status FromProto(const pb::FlightInfo& pb_info, FlightInfo::Data* info); -Status FromProto(const pb::FlightInfo& pb_info, std::unique_ptr* info); -Status FromProto(const pb::PollInfo& pb_info, PollInfo* info); -Status FromProto(const pb::PollInfo& pb_info, std::unique_ptr* info); -Status FromProto(const pb::CancelFlightInfoRequest& pb_request, - CancelFlightInfoRequest* request); -Status FromProto(const pb::SchemaResult& pb_result, SchemaResult* result); -Status FromProto(const pb::BasicAuth& pb_basic_auth, BasicAuth* info); -Status FromProto(const pb::SetSessionOptionsRequest& pb_request, - SetSessionOptionsRequest* request); -Status FromProto(const pb::SetSessionOptionsResult& pb_result, - SetSessionOptionsResult* result); -Status FromProto(const pb::GetSessionOptionsRequest& pb_request, - GetSessionOptionsRequest* request); -Status FromProto(const pb::GetSessionOptionsResult& pb_result, - GetSessionOptionsResult* result); -Status FromProto(const pb::CloseSessionRequest& pb_request, CloseSessionRequest* 
request); -Status FromProto(const pb::CloseSessionResult& pb_result, CloseSessionResult* result); - -Status ToProto(const Timestamp& timestamp, google::protobuf::Timestamp* pb_timestamp); -Status ToProto(const FlightDescriptor& descr, pb::FlightDescriptor* pb_descr); -Status ToProto(const FlightEndpoint& endpoint, pb::FlightEndpoint* pb_endpoint); -Status ToProto(const RenewFlightEndpointRequest& request, - pb::RenewFlightEndpointRequest* pb_request); -Status ToProto(const FlightInfo& info, pb::FlightInfo* pb_info); -Status ToProto(const PollInfo& info, pb::PollInfo* pb_info); -Status ToProto(const CancelFlightInfoRequest& request, - pb::CancelFlightInfoRequest* pb_request); -Status ToProto(const ActionType& type, pb::ActionType* pb_type); -Status ToProto(const Action& action, pb::Action* pb_action); -Status ToProto(const Result& result, pb::Result* pb_result); -Status ToProto(const CancelFlightInfoResult& result, - pb::CancelFlightInfoResult* pb_result); -Status ToProto(const Criteria& criteria, pb::Criteria* pb_criteria); -Status ToProto(const Location& location, pb::Location* pb_location); -Status ToProto(const SchemaResult& result, pb::SchemaResult* pb_result); -Status ToProto(const Ticket& ticket, pb::Ticket* pb_ticket); -Status ToProto(const BasicAuth& basic_auth, pb::BasicAuth* pb_basic_auth); -Status ToProto(const SetSessionOptionsRequest& request, - pb::SetSessionOptionsRequest* pb_request); -Status ToProto(const SetSessionOptionsResult& result, - pb::SetSessionOptionsResult* pb_result); -Status ToProto(const GetSessionOptionsRequest& request, - pb::GetSessionOptionsRequest* pb_request); -Status ToProto(const GetSessionOptionsResult& result, - pb::GetSessionOptionsResult* pb_result); -Status ToProto(const CloseSessionRequest& request, pb::CloseSessionRequest* pb_request); -Status ToProto(const CloseSessionResult& result, pb::CloseSessionResult* pb_result); +ARROW_FLIGHT_EXPORT Status FromProto(const google::protobuf::Timestamp& pb_timestamp, + Timestamp* 
timestamp); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::ActionType& pb_type, ActionType* type); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::Action& pb_action, Action* action); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::Result& pb_result, Result* result); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::CancelFlightInfoResult& pb_result, + CancelFlightInfoResult* result); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::Criteria& pb_criteria, Criteria* criteria); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::Location& pb_location, Location* location); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::Ticket& pb_ticket, Ticket* ticket); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::FlightData& pb_data, + FlightDescriptor* descriptor, + std::unique_ptr* message); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::FlightDescriptor& pb_descr, + FlightDescriptor* descr); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::FlightEndpoint& pb_endpoint, + FlightEndpoint* endpoint); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::RenewFlightEndpointRequest& pb_request, + RenewFlightEndpointRequest* request); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::FlightInfo& pb_info, + FlightInfo::Data* info); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::FlightInfo& pb_info, + std::unique_ptr* info); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::PollInfo& pb_info, PollInfo* info); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::PollInfo& pb_info, + std::unique_ptr* info); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::CancelFlightInfoRequest& pb_request, + CancelFlightInfoRequest* request); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::SchemaResult& pb_result, + SchemaResult* result); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::BasicAuth& pb_basic_auth, BasicAuth* info); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::SetSessionOptionsRequest& pb_request, + SetSessionOptionsRequest* request); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::SetSessionOptionsResult& 
pb_result, + SetSessionOptionsResult* result); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::GetSessionOptionsRequest& pb_request, + GetSessionOptionsRequest* request); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::GetSessionOptionsResult& pb_result, + GetSessionOptionsResult* result); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::CloseSessionRequest& pb_request, + CloseSessionRequest* request); +ARROW_FLIGHT_EXPORT Status FromProto(const pb::CloseSessionResult& pb_result, + CloseSessionResult* result); + +ARROW_FLIGHT_EXPORT Status ToProto(const Timestamp& timestamp, + google::protobuf::Timestamp* pb_timestamp); +ARROW_FLIGHT_EXPORT Status ToProto(const FlightDescriptor& descr, + pb::FlightDescriptor* pb_descr); +ARROW_FLIGHT_EXPORT Status ToProto(const FlightEndpoint& endpoint, + pb::FlightEndpoint* pb_endpoint); +ARROW_FLIGHT_EXPORT Status ToProto(const RenewFlightEndpointRequest& request, + pb::RenewFlightEndpointRequest* pb_request); +ARROW_FLIGHT_EXPORT Status ToProto(const FlightInfo& info, pb::FlightInfo* pb_info); +ARROW_FLIGHT_EXPORT Status ToProto(const PollInfo& info, pb::PollInfo* pb_info); +ARROW_FLIGHT_EXPORT Status ToProto(const CancelFlightInfoRequest& request, + pb::CancelFlightInfoRequest* pb_request); +ARROW_FLIGHT_EXPORT Status ToProto(const ActionType& type, pb::ActionType* pb_type); +ARROW_FLIGHT_EXPORT Status ToProto(const Action& action, pb::Action* pb_action); +ARROW_FLIGHT_EXPORT Status ToProto(const Result& result, pb::Result* pb_result); +ARROW_FLIGHT_EXPORT Status ToProto(const CancelFlightInfoResult& result, + pb::CancelFlightInfoResult* pb_result); +ARROW_FLIGHT_EXPORT Status ToProto(const Criteria& criteria, pb::Criteria* pb_criteria); +ARROW_FLIGHT_EXPORT Status ToProto(const Location& location, pb::Location* pb_location); +ARROW_FLIGHT_EXPORT Status ToProto(const SchemaResult& result, + pb::SchemaResult* pb_result); +ARROW_FLIGHT_EXPORT Status ToProto(const Ticket& ticket, pb::Ticket* pb_ticket); +ARROW_FLIGHT_EXPORT Status 
ToProto(const BasicAuth& basic_auth, + pb::BasicAuth* pb_basic_auth); +ARROW_FLIGHT_EXPORT Status ToProto(const SetSessionOptionsRequest& request, + pb::SetSessionOptionsRequest* pb_request); +ARROW_FLIGHT_EXPORT Status ToProto(const SetSessionOptionsResult& result, + pb::SetSessionOptionsResult* pb_result); +ARROW_FLIGHT_EXPORT Status ToProto(const GetSessionOptionsRequest& request, + pb::GetSessionOptionsRequest* pb_request); +ARROW_FLIGHT_EXPORT Status ToProto(const GetSessionOptionsResult& result, + pb::GetSessionOptionsResult* pb_result); +ARROW_FLIGHT_EXPORT Status ToProto(const CloseSessionRequest& request, + pb::CloseSessionRequest* pb_request); +ARROW_FLIGHT_EXPORT Status ToProto(const CloseSessionResult& result, + pb::CloseSessionResult* pb_result); Status ToPayload(const FlightDescriptor& descr, std::shared_ptr* out); diff --git a/cpp/src/arrow/flight/server.cc b/cpp/src/arrow/flight/server.cc index adbdfb85f29..fa7f99e0126 100644 --- a/cpp/src/arrow/flight/server.cc +++ b/cpp/src/arrow/flight/server.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,7 @@ #include "arrow/flight/transport_server.h" #include "arrow/flight/types.h" #include "arrow/status.h" +#include "arrow/type.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/uri.h" @@ -275,78 +277,99 @@ Status FlightServerBase::GetSchema(const ServerCallContext& context, class RecordBatchStream::RecordBatchStreamImpl { public: - // Stages of the stream when producing payloads - enum class Stage { - NEW, // The stream has been created, but Next has not been called yet - DICTIONARY, // Dictionaries have been collected, and are being sent - RECORD_BATCH // Initial have been sent - }; - RecordBatchStreamImpl(const std::shared_ptr& reader, const ipc::IpcWriteOptions& options) - : reader_(reader), mapper_(*reader_->schema()), ipc_options_(options) {} + : reader_(reader), options_(options) {} std::shared_ptr schema() { return 
reader_->schema(); } Status GetSchemaPayload(FlightPayload* payload) { - return ipc::GetSchemaPayload(*reader_->schema(), ipc_options_, mapper_, - &payload->ipc_message); + if (!writer_) { + // Create the IPC writer on first call + auto payload_writer = + std::make_unique(&payload_deque_); + ARROW_ASSIGN_OR_RAISE( + writer_, ipc::internal::OpenRecordBatchWriter(std::move(payload_writer), + reader_->schema(), options_)); + } + + // Return the expected schema payload. + if (payload_deque_.empty()) { + return Status::UnknownError("No schema payload generated"); + } + *payload = std::move(payload_deque_.front()); + payload_deque_.pop_front(); + return Status::OK(); } Status Next(FlightPayload* payload) { - if (stage_ == Stage::NEW) { - RETURN_NOT_OK(reader_->ReadNext(¤t_batch_)); - if (!current_batch_) { - // Signal that iteration is over + // If we have previous payloads (dictionary messages or previous record batches) + // we will return them before reading the next record batch. + if (payload_deque_.empty()) { + std::shared_ptr batch; + RETURN_NOT_OK(reader_->ReadNext(&batch)); + if (!batch) { + // End of stream + if (writer_) { + RETURN_NOT_OK(writer_->Close()); + } payload->ipc_message.metadata = nullptr; return Status::OK(); } - ARROW_ASSIGN_OR_RAISE(dictionaries_, - ipc::CollectDictionaries(*current_batch_, mapper_)); - stage_ = Stage::DICTIONARY; + if (!writer_) { + return Status::UnknownError( + "Writer should be initialized before reading Next batches"); + } + // One WriteRecordBatch call might generate multiple payloads, so we + // need to collect them in a deque. 
+ RETURN_NOT_OK(writer_->WriteRecordBatch(*batch)); } - if (stage_ == Stage::DICTIONARY) { - if (dictionary_index_ == static_cast(dictionaries_.size())) { - stage_ = Stage::RECORD_BATCH; - return ipc::GetRecordBatchPayload(*current_batch_, ipc_options_, - &payload->ipc_message); - } else { - return GetNextDictionary(payload); - } + // There must be at least one payload generated after WriteRecordBatch or + // from previous calls to WriteRecordBatch. + if (payload_deque_.empty()) { + return Status::UnknownError("IPC writer didn't produce any payloads"); } - RETURN_NOT_OK(reader_->ReadNext(¤t_batch_)); + *payload = std::move(payload_deque_.front()); + payload_deque_.pop_front(); + return Status::OK(); + } - // TODO(ARROW-10787): Delta dictionaries - if (!current_batch_) { - // Signal that iteration is over - payload->ipc_message.metadata = nullptr; - return Status::OK(); - } else { - return ipc::GetRecordBatchPayload(*current_batch_, ipc_options_, - &payload->ipc_message); + Status Close() { + if (writer_) { + RETURN_NOT_OK(writer_->Close()); } + return reader_->Close(); } - Status Close() { return reader_->Close(); } - private: - Status GetNextDictionary(FlightPayload* payload) { - const auto& it = dictionaries_[dictionary_index_++]; - return ipc::GetDictionaryPayload(it.first, it.second, ipc_options_, - &payload->ipc_message); - } + // Simple payload writer that uses a deque to store generated payloads. 
+ class ServerRecordBatchPayloadWriter : public ipc::internal::IpcPayloadWriter { + public: + explicit ServerRecordBatchPayloadWriter(std::deque* payload_deque) + : payload_deque_(payload_deque) {} - Stage stage_ = Stage::NEW; - std::shared_ptr reader_; - ipc::DictionaryFieldMapper mapper_; - ipc::IpcWriteOptions ipc_options_; - std::shared_ptr current_batch_; - std::vector>> dictionaries_; + Status Start() override { return Status::OK(); } - // Index of next dictionary to send - int dictionary_index_ = 0; + Status WritePayload(const ipc::IpcPayload& ipc_payload) override { + FlightPayload payload; + payload.ipc_message = ipc_payload; + + payload_deque_->push_back(std::move(payload)); + return Status::OK(); + } + + Status Close() override { return Status::OK(); } + + private: + std::deque* payload_deque_; + }; + + std::shared_ptr reader_; + ipc::IpcWriteOptions options_; + std::unique_ptr writer_; + std::deque payload_deque_; }; FlightMetadataWriter::~FlightMetadataWriter() = default; diff --git a/cpp/src/arrow/flight/server_tracing_middleware.cc b/cpp/src/arrow/flight/server_tracing_middleware.cc index 6884308c7ff..f3001162002 100644 --- a/cpp/src/arrow/flight/server_tracing_middleware.cc +++ b/cpp/src/arrow/flight/server_tracing_middleware.cc @@ -190,7 +190,7 @@ std::vector TracingServerMiddleware::GetTrace const { return impl_->GetTraceContext(); } -constexpr char const TracingServerMiddleware::kMiddlewareName[]; +constexpr const char TracingServerMiddleware::kMiddlewareName[]; std::shared_ptr MakeTracingServerMiddlewareFactory() { return std::make_shared(); diff --git a/cpp/src/arrow/flight/server_tracing_middleware.h b/cpp/src/arrow/flight/server_tracing_middleware.h index 581c8354368..50c8294a63b 100644 --- a/cpp/src/arrow/flight/server_tracing_middleware.h +++ b/cpp/src/arrow/flight/server_tracing_middleware.h @@ -42,7 +42,7 @@ class ARROW_FLIGHT_EXPORT TracingServerMiddleware : public ServerMiddleware { public: ~TracingServerMiddleware(); - static constexpr 
char const kMiddlewareName[] = + static constexpr const char kMiddlewareName[] = "arrow::flight::TracingServerMiddleware"; std::string name() const override { return kMiddlewareName; } diff --git a/cpp/src/arrow/flight/sql/ArrowFlightSqlConfig.cmake.in b/cpp/src/arrow/flight/sql/ArrowFlightSqlConfig.cmake.in index 3a70dbdeda6..a282215af69 100644 --- a/cpp/src/arrow/flight/sql/ArrowFlightSqlConfig.cmake.in +++ b/cpp/src/arrow/flight/sql/ArrowFlightSqlConfig.cmake.in @@ -27,7 +27,7 @@ @PACKAGE_INIT@ include(CMakeFindDependencyMacro) -find_dependency(ArrowFlight) +find_dependency(ArrowFlight CONFIG) include("${CMAKE_CURRENT_LIST_DIR}/ArrowFlightSqlTargets.cmake") diff --git a/cpp/src/arrow/flight/sql/CMakeLists.txt b/cpp/src/arrow/flight/sql/CMakeLists.txt index 796cb9da18e..6fcdaba2ec8 100644 --- a/cpp/src/arrow/flight/sql/CMakeLists.txt +++ b/cpp/src/arrow/flight/sql/CMakeLists.txt @@ -48,7 +48,8 @@ add_custom_command(OUTPUT ${FLIGHT_SQL_GENERATED_PROTO_FILES} COMMAND ${FLIGHT_SQL_PROTOC_COMMAND} DEPENDS ${PROTO_DEPENDS}) -set_source_files_properties(${FLIGHT_SQL_GENERATED_PROTO_FILES} PROPERTIES GENERATED TRUE) +arrow_set_generated_proto_files_properties(${FLIGHT_SQL_GENERATED_PROTO_FILES}) + add_custom_target(flight_sql_protobuf_gen ALL DEPENDS ${FLIGHT_SQL_GENERATED_PROTO_FILES}) set(ARROW_FLIGHT_SQL_SRCS @@ -117,8 +118,9 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_EXAMPLES) set(ARROW_FLIGHT_SQL_TEST_SRCS server_test.cc server_session_middleware_internals_test.cc) - set(ARROW_FLIGHT_SQL_TEST_LIBS ${SQLite3_LIBRARIES}) + set(ARROW_FLIGHT_SQL_TEST_LIBS ${SQLite3_LIBRARIES} Boost::headers) set(ARROW_FLIGHT_SQL_ACERO_SRCS example/acero_server.cc) + set(ARROW_FLIGHT_SQL_TEST_EXTRA_LINK_LIBS "") if(ARROW_COMPUTE AND ARROW_PARQUET @@ -129,6 +131,7 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_EXAMPLES) else() list(APPEND ARROW_FLIGHT_SQL_TEST_LIBS arrow_substrait_shared) endif() + list(APPEND ARROW_FLIGHT_SQL_TEST_EXTRA_LINK_LIBS arrow_compute_testing) if(ARROW_BUILD_EXAMPLES) 
add_executable(acero-flight-sql-server ${ARROW_FLIGHT_SQL_ACERO_SRCS} @@ -146,6 +149,8 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_EXAMPLES) STATIC_LINK_LIBS ${ARROW_FLIGHT_SQL_TEST_LINK_LIBS} ${ARROW_FLIGHT_SQL_TEST_LIBS} + EXTRA_LINK_LIBS + ${ARROW_FLIGHT_SQL_TEST_EXTRA_LINK_LIBS} EXTRA_INCLUDES "${CMAKE_CURRENT_BINARY_DIR}/../" LABELS @@ -154,12 +159,12 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_EXAMPLES) add_executable(flight-sql-test-server test_server_cli.cc ${ARROW_FLIGHT_SQL_TEST_SERVER_SRCS}) target_link_libraries(flight-sql-test-server - PRIVATE ${ARROW_FLIGHT_SQL_TEST_LINK_LIBS} ${GFLAGS_LIBRARIES} - ${SQLite3_LIBRARIES}) + PRIVATE ${ARROW_FLIGHT_SQL_TEST_LINK_LIBS} + ${ARROW_FLIGHT_SQL_TEST_LIBS} ${GFLAGS_LIBRARIES}) add_executable(flight-sql-test-app test_app_cli.cc) target_link_libraries(flight-sql-test-app PRIVATE ${ARROW_FLIGHT_SQL_TEST_LINK_LIBS} - ${GFLAGS_LIBRARIES}) + Boost::headers ${GFLAGS_LIBRARIES}) if(ARROW_FLIGHT_TEST_LINKAGE STREQUAL "static" AND ARROW_BUILD_STATIC) foreach(TEST_TARGET arrow-flight-sql-test flight-sql-test-server flight-sql-test-app) @@ -168,3 +173,7 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_EXAMPLES) endforeach() endif() endif() + +if(ARROW_FLIGHT_SQL_ODBC) + add_subdirectory(odbc) +endif() diff --git a/cpp/src/arrow/flight/sql/client.cc b/cpp/src/arrow/flight/sql/client.cc index fe087cc947d..6fe4a405667 100644 --- a/cpp/src/arrow/flight/sql/client.cc +++ b/cpp/src/arrow/flight/sql/client.cc @@ -40,6 +40,7 @@ namespace flight { namespace sql { namespace { + arrow::Result GetFlightDescriptorForCommand( const google::protobuf::Message& command) { FlightDescriptor descriptor; @@ -96,6 +97,25 @@ Status ReadResult(ResultStream* results, google::protobuf::Message* message) { } return Status::OK(); } + +arrow::Result> BindParameters(FlightClient* client, + const FlightCallOptions& options, + const FlightDescriptor& descriptor, + RecordBatchReader* params) { + ARROW_ASSIGN_OR_RAISE(auto stream, + client->DoPut(options, descriptor, 
params->schema())); + while (true) { + ARROW_ASSIGN_OR_RAISE(auto batch, params->Next()); + if (!batch) break; + ARROW_RETURN_NOT_OK(stream.writer->WriteRecordBatch(*batch)); + } + ARROW_RETURN_NOT_OK(stream.writer->DoneWriting()); + std::shared_ptr metadata; + ARROW_RETURN_NOT_OK(stream.reader->ReadMetadata(&metadata)); + ARROW_RETURN_NOT_OK(stream.writer->Close()); + return metadata; +} + } // namespace const Transaction& no_transaction() { @@ -615,24 +635,6 @@ arrow::Result> PreparedStatement::ParseRespon parameter_schema); } -arrow::Result> BindParameters(FlightClient* client, - const FlightCallOptions& options, - const FlightDescriptor& descriptor, - RecordBatchReader* params) { - ARROW_ASSIGN_OR_RAISE(auto stream, - client->DoPut(options, descriptor, params->schema())); - while (true) { - ARROW_ASSIGN_OR_RAISE(auto batch, params->Next()); - if (!batch) break; - ARROW_RETURN_NOT_OK(stream.writer->WriteRecordBatch(*batch)); - } - ARROW_RETURN_NOT_OK(stream.writer->DoneWriting()); - std::shared_ptr metadata; - ARROW_RETURN_NOT_OK(stream.reader->ReadMetadata(&metadata)); - ARROW_RETURN_NOT_OK(stream.writer->Close()); - return metadata; -} - arrow::Result> PreparedStatement::Execute( const FlightCallOptions& options) { if (is_closed_) { diff --git a/cpp/src/arrow/flight/sql/column_metadata.cc b/cpp/src/arrow/flight/sql/column_metadata.cc index c855e2f09af..30f557084b2 100644 --- a/cpp/src/arrow/flight/sql/column_metadata.cc +++ b/cpp/src/arrow/flight/sql/column_metadata.cc @@ -55,6 +55,7 @@ const char* ColumnMetadata::kIsAutoIncrement = "ARROW:FLIGHT:SQL:IS_AUTO_INCREME const char* ColumnMetadata::kIsCaseSensitive = "ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE"; const char* ColumnMetadata::kIsReadOnly = "ARROW:FLIGHT:SQL:IS_READ_ONLY"; const char* ColumnMetadata::kIsSearchable = "ARROW:FLIGHT:SQL:IS_SEARCHABLE"; +const char* ColumnMetadata::kRemarks = "ARROW:FLIGHT:SQL:REMARKS"; ColumnMetadata::ColumnMetadata( std::shared_ptr metadata_map) @@ -98,20 +99,24 @@ arrow::Result 
ColumnMetadata::GetIsAutoIncrement() const { arrow::Result ColumnMetadata::GetIsCaseSensitive() const { std::string is_case_sensitive; - ARROW_ASSIGN_OR_RAISE(is_case_sensitive, metadata_map_->Get(kIsAutoIncrement)); + ARROW_ASSIGN_OR_RAISE(is_case_sensitive, metadata_map_->Get(kIsCaseSensitive)); return StringToBoolean(is_case_sensitive); } arrow::Result ColumnMetadata::GetIsReadOnly() const { std::string is_read_only; - ARROW_ASSIGN_OR_RAISE(is_read_only, metadata_map_->Get(kIsAutoIncrement)); + ARROW_ASSIGN_OR_RAISE(is_read_only, metadata_map_->Get(kIsReadOnly)); return StringToBoolean(is_read_only); } arrow::Result ColumnMetadata::GetIsSearchable() const { - std::string is_case_sensitive; - ARROW_ASSIGN_OR_RAISE(is_case_sensitive, metadata_map_->Get(kIsAutoIncrement)); - return StringToBoolean(is_case_sensitive); + std::string is_searchable; + ARROW_ASSIGN_OR_RAISE(is_searchable, metadata_map_->Get(kIsSearchable)); + return StringToBoolean(is_searchable); +} + +arrow::Result ColumnMetadata::GetRemarks() const { + return metadata_map_->Get(kRemarks); } ColumnMetadata::ColumnMetadataBuilder ColumnMetadata::Builder() { @@ -185,6 +190,12 @@ ColumnMetadata::ColumnMetadataBuilder::IsSearchable(bool is_searchable) { return *this; } +ColumnMetadata::ColumnMetadataBuilder& ColumnMetadata::ColumnMetadataBuilder::Remarks( + const std::string& remarks) { + metadata_map_->Append(ColumnMetadata::kRemarks, remarks); + return *this; +} + ColumnMetadata::ColumnMetadataBuilder::ColumnMetadataBuilder() : metadata_map_(std::make_shared()) {} diff --git a/cpp/src/arrow/flight/sql/column_metadata.h b/cpp/src/arrow/flight/sql/column_metadata.h index 0eb53f3e0bb..fe29df90401 100644 --- a/cpp/src/arrow/flight/sql/column_metadata.h +++ b/cpp/src/arrow/flight/sql/column_metadata.h @@ -66,6 +66,9 @@ class ARROW_FLIGHT_SQL_EXPORT ColumnMetadata { /// \brief Constant variable to hold the value of the key that /// will be used in the KeyValueMetadata class. 
static const char* kIsSearchable; + /// \brief Constant variable to hold the value of the key that + /// will be used in the KeyValueMetadata class. + static const char* kRemarks; /// \brief Static initializer. static ColumnMetadataBuilder Builder(); @@ -110,6 +113,10 @@ class ARROW_FLIGHT_SQL_EXPORT ColumnMetadata { /// \return The IsSearchable. arrow::Result GetIsSearchable() const; + /// \brief Return the Remarks set in the KeyValueMetadata. + /// \return The Remarks. + arrow::Result GetRemarks() const; + /// \brief Return the KeyValueMetadata. /// \return The KeyValueMetadata. const std::shared_ptr& metadata_map() const; @@ -169,6 +176,11 @@ class ARROW_FLIGHT_SQL_EXPORT ColumnMetadata { /// \return A ColumnMetadataBuilder. ColumnMetadataBuilder& IsSearchable(bool is_searchable); + /// \brief Set the column description in the KeyValueMetadata object. + /// \param[in] remarks The comment describing column. + /// \return A ColumnMetadataBuilder. + ColumnMetadataBuilder& Remarks(const std::string& remarks); + ColumnMetadata Build() const; private: diff --git a/cpp/src/arrow/flight/sql/example/sqlite_tables_schema_batch_reader.cc b/cpp/src/arrow/flight/sql/example/sqlite_tables_schema_batch_reader.cc index 55345ad477a..85332e6c4df 100644 --- a/cpp/src/arrow/flight/sql/example/sqlite_tables_schema_batch_reader.cc +++ b/cpp/src/arrow/flight/sql/example/sqlite_tables_schema_batch_reader.cc @@ -102,7 +102,10 @@ Status SqliteTablesWithSchemaBatchReader::ReadNext(std::shared_ptr* std::shared_ptr schema_array; ARROW_RETURN_NOT_OK(schema_builder.Finish(&schema_array)); - ARROW_ASSIGN_OR_RAISE(*batch, first_batch->AddColumn(4, "table_schema", schema_array)); + std::shared_ptr schema_field = + arrow::field("table_schema", schema_array->type(), false); + + ARROW_ASSIGN_OR_RAISE(*batch, first_batch->AddColumn(4, schema_field, schema_array)); return Status::OK(); } diff --git a/cpp/src/arrow/flight/sql/odbc/ArrowFlightSqlOdbcConfig.cmake.in 
b/cpp/src/arrow/flight/sql/odbc/ArrowFlightSqlOdbcConfig.cmake.in new file mode 100644 index 00000000000..da6d44ebc82 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/ArrowFlightSqlOdbcConfig.cmake.in @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# This config sets the following variables in your project:: +# +# ArrowFlightSqlOdbc_FOUND - true if Arrow Flight SQL ODBC found on the system +# +# This config sets the following targets in your project:: +# +# ArrowFlightSqlOdbc::arrow_flight_sql_odbc_shared - for linked as shared library if shared library is built +# ArrowFlightSqlOdbc::arrow_flight_sql_odbc_static - for linked as static library if static library is built + +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +find_dependency(ArrowFlightSql) + +include("${CMAKE_CURRENT_LIST_DIR}/ArrowFlightSqlOdbcTargets.cmake") + +arrow_keep_backward_compatibility(ArrowFlightSqlOdbc arrow_flight_sql_odbc) + +check_required_components(ArrowFlightSqlOdbc) + +arrow_show_details(ArrowFlightSqlOdbc ARROW_FLIGHT_SQL_ODBC) diff --git a/cpp/src/arrow/flight/sql/odbc/CMakeLists.txt b/cpp/src/arrow/flight/sql/odbc/CMakeLists.txt new file mode 100644 index 00000000000..165428cd70c --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/CMakeLists.txt @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Use C++ 20 for ODBC and its subdirectory +# GH-44792: Arrow will switch to C++ 20 +set(CMAKE_CXX_STANDARD 20) + +add_custom_target(arrow_flight_sql_odbc) + +if(WIN32) + if(MSVC_VERSION GREATER_EQUAL 1900) + set(ODBCINST legacy_stdio_definitions odbccp32 shlwapi) + elseif(MINGW) + set(ODBCINST odbccp32 shlwapi) + endif() +elseif(APPLE) + set(ODBCINST iodbcinst) +else() + set(ODBCINST odbcinst) +endif() + +add_subdirectory(flight_sql) +add_subdirectory(odbcabstraction) + +arrow_install_all_headers("arrow/flight/sql/odbc") + +set(ARROW_FLIGHT_SQL_ODBC_SRCS entry_points.cc odbc_api.cc) + +if(WIN32) + list(APPEND ARROW_FLIGHT_SQL_ODBC_SRCS odbc.def) +endif() + +add_arrow_lib(arrow_flight_sql_odbc + CMAKE_PACKAGE_NAME + ArrowFlightSqlOdbc + PKG_CONFIG_NAME + arrow-flight-sql-odbc + OUTPUTS + ARROW_FLIGHT_SQL_ODBC_LIBRARIES + SOURCES + ${ARROW_FLIGHT_SQL_ODBC_SRCS} + DEPENDENCIES + arrow_flight_sql + DEFINITIONS + FMT_HEADER_ONLY + SHARED_LINK_FLAGS + ${ARROW_VERSION_SCRIPT_FLAGS} # Defined in cpp/arrow/CMakeLists.txt + SHARED_LINK_LIBS + arrow_flight_sql_shared + SHARED_INSTALL_INTERFACE_LIBS + ArrowFlight::arrow_flight_sql_shared + STATIC_LINK_LIBS + arrow_flight_sql_static + STATIC_INSTALL_INTERFACE_LIBS + ArrowFlight::arrow_flight_sql_static + SHARED_PRIVATE_LINK_LIBS + ODBC::ODBC + ${ODBCINST} + odbcabstraction + arrow_odbc_spi_impl) + +foreach(LIB_TARGET ${ARROW_FLIGHT_SQL_ODBC_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_FLIGHT_SQL_ODBC_EXPORTING) +endforeach() diff --git a/cpp/src/arrow/flight/sql/odbc/arrow-flight-sql-odbc.pc.in b/cpp/src/arrow/flight/sql/odbc/arrow-flight-sql-odbc.pc.in new file mode 100644 index 00000000000..78959034954 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/arrow-flight-sql-odbc.pc.in @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prefix=@CMAKE_INSTALL_PREFIX@ +includedir=@ARROW_PKG_CONFIG_INCLUDEDIR@ +libdir=@ARROW_PKG_CONFIG_LIBDIR@ + +Name: Apache Arrow Flight SQL ODBC +Description: Apache Arrow Flight SQL ODBC extension +Version: @ARROW_VERSION@ +Requires: arrow-flight-sql +Libs: -L${libdir} -larrow_flight_sql_odbc +Cflags.private: -DARROW_FLIGHT_SQL_ODBC_STATIC diff --git a/cpp/src/arrow/flight/sql/odbc/entry_points.cc b/cpp/src/arrow/flight/sql/odbc/entry_points.cc new file mode 100644 index 00000000000..6801868a3cc --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/entry_points.cc @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// platform.h includes windows.h, so it needs to be included first +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include +#include +#include + +#include "arrow/flight/sql/odbc/odbc_api_internal.h" +#include "arrow/flight/sql/odbc/visibility.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_connection.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_descriptor.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_environment.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_statement.h" + +#include "arrow/util/logging.h" + +SQLRETURN SQL_API SQLAllocHandle(SQLSMALLINT type, SQLHANDLE parent, SQLHANDLE* result) { + return SQL_INVALID_HANDLE; +} diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/CMakeLists.txt b/cpp/src/arrow/flight/sql/odbc/flight_sql/CMakeLists.txt new file mode 100644 index 00000000000..02bb58c4b82 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/CMakeLists.txt @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +add_library(arrow_odbc_spi_impl + include/flight_sql/flight_sql_driver.h + accessors/binary_array_accessor.cc + accessors/binary_array_accessor.h + accessors/boolean_array_accessor.cc + accessors/boolean_array_accessor.h + accessors/common.h + accessors/date_array_accessor.cc + accessors/date_array_accessor.h + accessors/decimal_array_accessor.cc + accessors/decimal_array_accessor.h + accessors/main.h + accessors/primitive_array_accessor.cc + accessors/primitive_array_accessor.h + accessors/string_array_accessor.cc + accessors/string_array_accessor.h + accessors/time_array_accessor.cc + accessors/time_array_accessor.h + accessors/timestamp_array_accessor.cc + accessors/timestamp_array_accessor.h + address_info.cc + address_info.h + flight_sql_auth_method.cc + flight_sql_auth_method.h + flight_sql_connection.cc + flight_sql_connection.h + flight_sql_driver.cc + flight_sql_get_tables_reader.cc + flight_sql_get_tables_reader.h + flight_sql_get_type_info_reader.cc + flight_sql_get_type_info_reader.h + flight_sql_result_set.cc + flight_sql_result_set.h + flight_sql_result_set_accessors.cc + flight_sql_result_set_accessors.h + flight_sql_result_set_column.cc + flight_sql_result_set_column.h + flight_sql_result_set_metadata.cc + flight_sql_result_set_metadata.h + flight_sql_ssl_config.cc + flight_sql_ssl_config.h + flight_sql_statement.cc + flight_sql_statement.h + flight_sql_statement_get_columns.cc + flight_sql_statement_get_columns.h + flight_sql_statement_get_tables.cc + flight_sql_statement_get_tables.h + flight_sql_statement_get_type_info.cc + flight_sql_statement_get_type_info.h + flight_sql_stream_chunk_buffer.cc + flight_sql_stream_chunk_buffer.h + get_info_cache.cc + get_info_cache.h + json_converter.cc + json_converter.h + record_batch_transformer.cc + record_batch_transformer.h + scalar_function_reporter.cc + scalar_function_reporter.h + 
system_trust_store.cc + system_trust_store.h + utils.cc) +target_include_directories(arrow_odbc_spi_impl + PUBLIC include include/flight_sql + ${CMAKE_SOURCE_DIR}/odbcabstraction/include) +target_include_directories(arrow_odbc_spi_impl PUBLIC ${CMAKE_CURRENT_LIST_DIR}) + +if(WIN32) + target_sources(arrow_odbc_spi_impl + PRIVATE include/flight_sql/config/configuration.h + include/flight_sql/config/connection_string_parser.h + include/flight_sql/ui/add_property_window.h + include/flight_sql/ui/custom_window.h + include/flight_sql/ui/dsn_configuration_window.h + include/flight_sql/ui/window.h + config/configuration.cc + config/connection_string_parser.cc + ui/custom_window.cc + ui/window.cc + ui/dsn_configuration_window.cc + ui/add_property_window.cc + system_dsn.cc) +endif() + +target_link_libraries(arrow_odbc_spi_impl PUBLIC odbcabstraction arrow_flight_sql_shared + arrow_compute_shared Boost::locale) + +# Link libraries on MINGW64 and macOS +if(MINGW OR APPLE) + target_link_libraries(arrow_odbc_spi_impl PUBLIC ${ODBCINST}) +endif() + +set_target_properties(arrow_odbc_spi_impl + PROPERTIES ARCHIVE_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/$/lib + LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/$/lib + RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/$/lib) + +# CLI +add_executable(arrow_odbc_spi_impl_cli main.cc) +set_target_properties(arrow_odbc_spi_impl_cli + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/$/bin) +target_link_libraries(arrow_odbc_spi_impl_cli arrow_odbc_spi_impl) + +# Unit tests +add_arrow_test(odbc_spi_impl_test + SOURCES + accessors/boolean_array_accessor_test.cc + accessors/binary_array_accessor_test.cc + accessors/date_array_accessor_test.cc + accessors/decimal_array_accessor_test.cc + accessors/primitive_array_accessor_test.cc + accessors/string_array_accessor_test.cc + accessors/time_array_accessor_test.cc + accessors/timestamp_array_accessor_test.cc + flight_sql_connection_test.cc + parse_table_types_test.cc + json_converter_test.cc + 
record_batch_transformer_test.cc + utils_test.cc + EXTRA_LINK_LIBS + arrow_odbc_spi_impl) diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.cc new file mode 100644 index 00000000000..659b7638cd3 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.h" + +#include +#include +#include "arrow/array.h" + +namespace driver { +namespace flight_sql { + +using arrow::BinaryArray; +using odbcabstraction::RowStatus; + +namespace { + +inline RowStatus MoveSingleCellToBinaryBuffer(ColumnBinding* binding, BinaryArray* array, + int64_t arrow_row, int64_t i, + int64_t& value_offset, + bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics) { + RowStatus result = odbcabstraction::RowStatus_SUCCESS; + + const char* value = array->Value(arrow_row).data(); + size_t size_in_bytes = array->value_length(arrow_row); + + size_t remaining_length = static_cast(size_in_bytes - value_offset); + size_t value_length = std::min(remaining_length, binding->buffer_length); + + auto* byte_buffer = + static_cast(binding->buffer) + i * binding->buffer_length; + memcpy(byte_buffer, ((char*)value) + value_offset, value_length); + + if (remaining_length > binding->buffer_length) { + result = odbcabstraction::RowStatus_SUCCESS_WITH_INFO; + diagnostics.AddTruncationWarning(); + if (update_value_offset) { + value_offset += value_length; + } + } else if (update_value_offset) { + value_offset = -1; + } + + if (binding->str_len_buffer) { + binding->str_len_buffer[i] = static_cast(remaining_length); + } + + return result; +} + +} // namespace + +template +BinaryArrayFlightSqlAccessor::BinaryArrayFlightSqlAccessor(Array* array) + : FlightSqlAccessor>(array) {} + +template <> +RowStatus +BinaryArrayFlightSqlAccessor::MoveSingleCellImpl( + ColumnBinding* binding, int64_t arrow_row, int64_t i, int64_t& value_offset, + bool update_value_offset, odbcabstraction::Diagnostics& diagnostics) { + return MoveSingleCellToBinaryBuffer(binding, this->GetArray(), arrow_row, i, + value_offset, update_value_offset, diagnostics); +} + +template +size_t BinaryArrayFlightSqlAccessor::GetCellLengthImpl( + ColumnBinding* binding) const { + return binding->buffer_length; +} + +template 
class BinaryArrayFlightSqlAccessor; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.h new file mode 100644 index 00000000000..7b742881189 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/type_fwd.h" + +namespace driver { +namespace flight_sql { + +using arrow::BinaryArray; +using odbcabstraction::RowStatus; + +template +class BinaryArrayFlightSqlAccessor + : public FlightSqlAccessor> { + public: + explicit BinaryArrayFlightSqlAccessor(Array* array); + + RowStatus MoveSingleCellImpl(ColumnBinding* binding, int64_t arrow_row, int64_t i, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics); + + size_t GetCellLengthImpl(ColumnBinding* binding) const; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor_test.cc new file mode 100644 index 00000000000..51d8b9bbb6b --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor_test.cc @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { + +using arrow::BinaryType; +using odbcabstraction::OdbcVersion; + +using arrow::ArrayFromVector; + +TEST(BinaryArrayAccessor, Test_CDataType_BINARY_Basic) { + std::vector values = {"foo", "barx", "baz123"}; + std::shared_ptr array; + ArrayFromVector(values, &array); + + BinaryArrayFlightSqlAccessor accessor(array.get()); + + size_t max_str_len = 64; + std::vector buffer(values.size() * max_str_len); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_BINARY, 0, 0, buffer.data(), + max_str_len, str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (int i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i].length(), str_len_buffer[i]); + // Beware that CDataType_BINARY values are not null terminated. + // It's safe to create a std::string from this data because we know it's + // ASCII, this doesn't work with arbitrary binary data. 
+ ASSERT_EQ(values[i], + std::string(buffer.data() + i * max_str_len, + buffer.data() + i * max_str_len + str_len_buffer[i])); + } +} + +TEST(BinaryArrayAccessor, Test_CDataType_BINARY_Truncation) { + std::vector values = {"ABCDEFABCDEFABCDEFABCDEFABCDEFABCDEFABCDEF"}; + std::shared_ptr array; + ArrayFromVector(values, &array); + + BinaryArrayFlightSqlAccessor accessor(array.get()); + + size_t max_str_len = 8; + std::vector buffer(values.size() * max_str_len); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_BINARY, 0, 0, buffer.data(), + max_str_len, str_len_buffer.data()); + + std::stringstream ss; + int64_t value_offset = 0; + + // Construct the whole string by concatenating smaller chunks from + // GetColumnarData + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + do { + diagnostics.Clear(); + int64_t original_value_offset = value_offset; + ASSERT_EQ(1, accessor.GetColumnarData(&binding, 0, 1, value_offset, true, diagnostics, + nullptr)); + ASSERT_EQ(values[0].length() - original_value_offset, str_len_buffer[0]); + + int64_t chunk_length = 0; + if (value_offset == -1) { + chunk_length = str_len_buffer[0]; + } else { + chunk_length = max_str_len; + } + + // Beware that CDataType_BINARY values are not null terminated. + // It's safe to create a std::string from this data because we know it's + // ASCII, this doesn't work with arbitrary binary data. 
+ ss << std::string(buffer.data(), buffer.data() + chunk_length); + } while (value_offset < static_cast(values[0].length()) && value_offset != -1); + + ASSERT_EQ(values[0], ss.str()); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.cc new file mode 100644 index 00000000000..ea4d1ba72c2 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.cc @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.h" + +namespace driver { +namespace flight_sql { + +using arrow::BooleanArray; +using odbcabstraction::RowStatus; + +template +BooleanArrayFlightSqlAccessor::BooleanArrayFlightSqlAccessor(Array* array) + : FlightSqlAccessor>(array) {} + +template +RowStatus BooleanArrayFlightSqlAccessor::MoveSingleCellImpl( + ColumnBinding* binding, int64_t arrow_row, int64_t i, int64_t& value_offset, + bool update_value_offset, odbcabstraction::Diagnostics& diagnostics) { + typedef unsigned char c_type; + bool value = this->GetArray()->Value(arrow_row); + + auto* buffer = static_cast(binding->buffer); + buffer[i] = value ? 1 : 0; + + if (binding->str_len_buffer) { + binding->str_len_buffer[i] = static_cast(GetCellLengthImpl(binding)); + } + + return odbcabstraction::RowStatus_SUCCESS; +} + +template +size_t BooleanArrayFlightSqlAccessor::GetCellLengthImpl( + ColumnBinding* binding) const { + return sizeof(unsigned char); +} + +template class BooleanArrayFlightSqlAccessor; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.h new file mode 100644 index 00000000000..217cc0845c6 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/type_fwd.h" + +namespace driver { +namespace flight_sql { + +using arrow::BooleanArray; +using odbcabstraction::RowStatus; + +template +class BooleanArrayFlightSqlAccessor + : public FlightSqlAccessor> { + public: + explicit BooleanArrayFlightSqlAccessor(Array* array); + + RowStatus MoveSingleCellImpl(ColumnBinding* binding, int64_t arrow_row, int64_t i, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics); + + size_t GetCellLengthImpl(ColumnBinding* binding) const; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor_test.cc new file mode 100644 index 00000000000..31688200a56 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor_test.cc @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.h" +#include "arrow/testing/builder.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { + +using arrow::BooleanType; +using odbcabstraction::OdbcVersion; + +using arrow::ArrayFromVector; + +TEST(BooleanArrayFlightSqlAccessor, Test_BooleanArray_CDataType_BIT) { + const std::vector values = {true, false, true}; + std::shared_ptr array; + ArrayFromVector(values, &array); + + BooleanArrayFlightSqlAccessor accessor(array.get()); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_BIT, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (int i = 0; i < values.size(); ++i) { + ASSERT_EQ(sizeof(unsigned char), str_len_buffer[i]); + ASSERT_EQ(values[i] ? 1 : 0, buffer[i]); + } +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/common.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/common.h new file mode 100644 index 00000000000..8b74028a9c8 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/common.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include "arrow/array.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/scalar.h" + +namespace driver { +namespace flight_sql { + +template +inline size_t CopyFromArrayValuesToBinding(ARRAY_TYPE* array, ColumnBinding* binding, + int64_t starting_row, int64_t cells) { + constexpr ssize_t element_size = sizeof(typename ARRAY_TYPE::value_type); + + if (binding->str_len_buffer) { + for (int64_t i = 0; i < cells; ++i) { + int64_t current_row = starting_row + i; + if (array->IsNull(current_row)) { + binding->str_len_buffer[i] = odbcabstraction::NULL_DATA; + } else { + binding->str_len_buffer[i] = element_size; + } + } + } else { + // Duplicate this loop to avoid null checks within the loop. + for (int64_t i = starting_row; i < starting_row + cells; ++i) { + if (array->IsNull(i)) { + throw odbcabstraction::NullWithoutIndicatorException(); + } + } + } + + // Copy the entire array to the bound ODBC buffers. 
+ // Note that the array should already have been sliced down to the same number + // of elements in the ODBC data array by the point in which this function is called. + const auto* values = array->raw_values(); + memcpy(binding->buffer, &values[starting_row], element_size * cells); + + return cells; +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.cc new file mode 100644 index 00000000000..b4f39b69de4 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.cc @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.h" +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/calendar_utils.h" + +using arrow::Date32Array; +using arrow::Date64Array; + +namespace { +template +int64_t convertDate(typename T::value_type value) { + return value; +} + +/// Converts the value from the array, which is in milliseconds, to seconds. +/// \param value the value extracted from the array in milliseconds. 
+/// \return the converted value in seconds. +template <> +int64_t convertDate(int64_t value) { + return value / driver::odbcabstraction::MILLI_TO_SECONDS_DIVISOR; +} + +/// Converts the value from the array, which is in days, to seconds. +/// \param value the value extracted from the array in days. +/// \return the converted value in seconds. +template <> +int64_t convertDate(int32_t value) { + return value * driver::odbcabstraction::DAYS_TO_SECONDS_MULTIPLIER; +} +} // namespace + +namespace driver { +namespace flight_sql { + +using odbcabstraction::DATE_STRUCT; +using odbcabstraction::RowStatus; + +using odbcabstraction::GetTimeForSecondsSinceEpoch; + +template +DateArrayFlightSqlAccessor::DateArrayFlightSqlAccessor( + Array* array) + : FlightSqlAccessor>(array) {} + +template +RowStatus DateArrayFlightSqlAccessor::MoveSingleCellImpl( + ColumnBinding* binding, int64_t arrow_row, int64_t cell_counter, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics) { + auto* buffer = static_cast(binding->buffer); + auto value = convertDate(this->GetArray()->Value(arrow_row)); + tm date{}; + + GetTimeForSecondsSinceEpoch(value, date); + + buffer[cell_counter].year = 1900 + (date.tm_year); + buffer[cell_counter].month = date.tm_mon + 1; + buffer[cell_counter].day = date.tm_mday; + + if (binding->str_len_buffer) { + binding->str_len_buffer[cell_counter] = + static_cast(GetCellLengthImpl(binding)); + } + + return odbcabstraction::RowStatus_SUCCESS; +} + +template +size_t DateArrayFlightSqlAccessor::GetCellLengthImpl( + ColumnBinding* binding) const { + return sizeof(DATE_STRUCT); +} + +template class DateArrayFlightSqlAccessor; +template class DateArrayFlightSqlAccessor; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.h new file mode 100644 index 00000000000..42f3d4ba220 --- 
/dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/type_fwd.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using odbcabstraction::RowStatus; + +template +class DateArrayFlightSqlAccessor + : public FlightSqlAccessor> { + public: + explicit DateArrayFlightSqlAccessor(Array* array); + + RowStatus MoveSingleCellImpl(ColumnBinding* binding, int64_t arrow_row, + int64_t cell_counter, int64_t& value_offset, + bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics); + + size_t GetCellLengthImpl(ColumnBinding* binding) const; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor_test.cc new file mode 100644 index 00000000000..b4a0a5d1128 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor_test.cc @@ -0,0 
+1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/calendar_utils.h" +#include "arrow/testing/builder.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { + +using arrow::Date32Array; +using arrow::Date32Type; +using arrow::Date64Array; +using arrow::Date64Type; +using arrow::NumericArray; + +using odbcabstraction::DATE_STRUCT; +using odbcabstraction::OdbcVersion; +using odbcabstraction::tagDATE_STRUCT; + +using arrow::ArrayFromVector; +using odbcabstraction::GetTimeForSecondsSinceEpoch; + +TEST(DateArrayAccessor, Test_Date32Array_CDataType_DATE) { + std::vector values = {7589, 12320, 18980, 19095, -1, 0}; + std::vector expected = { + {1990, 10, 12}, {2003, 9, 25}, {2021, 12, 19}, + {2022, 4, 13}, {1969, 12, 31}, {1970, 1, 1}, + }; + + std::shared_ptr array; + ArrayFromVector(values, &array); + + DateArrayFlightSqlAccessor accessor( + dynamic_cast*>(array.get())); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + 
ColumnBinding binding(odbcabstraction::CDataType_DATE, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(sizeof(DATE_STRUCT), str_len_buffer[i]); + + ASSERT_EQ(expected[i].year, buffer[i].year); + ASSERT_EQ(expected[i].month, buffer[i].month); + ASSERT_EQ(expected[i].day, buffer[i].day); + } +} + +TEST(DateArrayAccessor, Test_Date64Array_CDataType_DATE) { + std::vector values = { + 86400000, 172800000, 259200000, 1649793238110, 0, 345600000, 432000000, + 518400000, -86400000, -17987443200000, -24268068949000}; + std::vector expected = { + /* year(16), month(u16), day(u16) */ + {1970, 1, 2}, {1970, 1, 3}, {1970, 1, 4}, {2022, 4, 12}, + {1970, 1, 1}, {1970, 1, 5}, {1970, 1, 6}, {1970, 1, 7}, + {1969, 12, 31}, {1400, 1, 1}, {1200, 12, 22}, + }; + + std::shared_ptr array; + ArrayFromVector(values, &array); + + DateArrayFlightSqlAccessor accessor( + dynamic_cast*>(array.get())); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_DATE, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(sizeof(DATE_STRUCT), str_len_buffer[i]); + ASSERT_EQ(expected[i].year, buffer[i].year); + ASSERT_EQ(expected[i].month, buffer[i].month); + ASSERT_EQ(expected[i].day, buffer[i].day); + } +} + +} // namespace flight_sql +} // namespace driver diff --git 
a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.cc new file mode 100644 index 00000000000..f093e152fdb --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.cc @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.h" + +#include "arrow/array.h" +#include "arrow/scalar.h" + +namespace driver { +namespace flight_sql { + +using arrow::Decimal128; +using arrow::Decimal128Array; +using arrow::Decimal128Type; +using arrow::Status; + +using odbcabstraction::DriverException; +using odbcabstraction::NUMERIC_STRUCT; +using odbcabstraction::RowStatus; + +template +DecimalArrayFlightSqlAccessor::DecimalArrayFlightSqlAccessor( + Array* array) + : FlightSqlAccessor>(array), + data_type_(static_cast(array->type().get())) {} + +template <> +RowStatus +DecimalArrayFlightSqlAccessor:: + MoveSingleCellImpl(ColumnBinding* binding, int64_t arrow_row, int64_t i, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics) { + auto result = &(static_cast(binding->buffer)[i]); + int32_t original_scale = data_type_->scale(); + + const uint8_t* bytes = this->GetArray()->Value(arrow_row); + Decimal128 value(bytes); + if (original_scale != binding->scale) { + const Status& status = value.Rescale(original_scale, binding->scale).Value(&value); + ThrowIfNotOK(status); + } + if (!value.FitsInPrecision(binding->precision)) { + throw DriverException("Decimal value doesn't fit in precision " + + std::to_string(binding->precision)); + } + + result->sign = value.IsNegative() ? 0 : 1; + + // Take the absolute value since the ODBC SQL_NUMERIC_STRUCT holds + // a positive-only number. 
+ if (value.IsNegative()) { + Decimal128 abs_value = Decimal128::Abs(value); + abs_value.ToBytes(result->val); + } else { + value.ToBytes(result->val); + } + result->precision = static_cast(binding->precision); + result->scale = static_cast(binding->scale); + + result->precision = data_type_->precision(); + + if (binding->str_len_buffer) { + binding->str_len_buffer[i] = static_cast(GetCellLengthImpl(binding)); + } + + return odbcabstraction::RowStatus_SUCCESS; +} + +template +size_t DecimalArrayFlightSqlAccessor::GetCellLengthImpl( + ColumnBinding* binding) const { + return sizeof(NUMERIC_STRUCT); +} + +template class DecimalArrayFlightSqlAccessor; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.h new file mode 100644 index 00000000000..235e48446e2 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/type_fwd.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using arrow::Decimal128Type; +using odbcabstraction::RowStatus; + +template +class DecimalArrayFlightSqlAccessor + : public FlightSqlAccessor> { + public: + explicit DecimalArrayFlightSqlAccessor(Array* array); + + RowStatus MoveSingleCellImpl(ColumnBinding* binding, int64_t arrow_row, int64_t i, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics); + + size_t GetCellLengthImpl(ColumnBinding* binding) const; + + private: + Decimal128Type* data_type_; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor_test.cc new file mode 100644 index 00000000000..7abdc160efd --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor_test.cc @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.h" +#include "arrow/builder.h" +#include "arrow/testing/builder.h" +#include "arrow/util/decimal.h" +#include "gtest/gtest.h" + +namespace { + +using arrow::ArrayFromVector; +using arrow::Decimal128; +using arrow::Decimal128Array; + +using driver::odbcabstraction::NUMERIC_STRUCT; +using driver::odbcabstraction::OdbcVersion; + +using driver::flight_sql::ThrowIfNotOK; + +std::vector MakeDecimalVector(const std::vector& values, + int32_t scale) { + std::vector ret; + for (const auto& str : values) { + Decimal128 str_value; + int32_t str_precision; + int32_t str_scale; + + ThrowIfNotOK(Decimal128::FromString(str, &str_value, &str_precision, &str_scale)); + + Decimal128 scaled_value; + if (str_scale == scale) { + scaled_value = str_value; + } else { + scaled_value = str_value.Rescale(str_scale, scale).ValueOrDie(); + } + ret.push_back(scaled_value); + } + return ret; +} + +std::string ConvertNumericToString(NUMERIC_STRUCT& numeric) { + auto v = reinterpret_cast(numeric.val); + auto decimal = Decimal128(v[1], v[0]); + if (numeric.sign == 0) { + decimal.Negate(); + } + const std::string& string = decimal.ToString(numeric.scale); + + return string; +} +} // namespace + +namespace driver { +namespace flight_sql { + +void AssertNumericOutput(int input_precision, int input_scale, + const std::vector& values_str, int output_precision, + int output_scale, + const std::vector& expected_values_str) { + auto decimal_type = + std::make_shared(input_precision, input_scale); + const std::vector& values = + MakeDecimalVector(values_str, decimal_type->scale()); + + std::shared_ptr array; + ArrayFromVector(decimal_type, values, &array); + + DecimalArrayFlightSqlAccessor + accessor(array.get()); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + ColumnBinding 
binding(odbcabstraction::CDataType_NUMERIC, output_precision, + output_scale, buffer.data(), 0, str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (int i = 0; i < values.size(); ++i) { + ASSERT_EQ(sizeof(NUMERIC_STRUCT), str_len_buffer[i]); + + ASSERT_EQ(output_precision, buffer[i].precision); + ASSERT_EQ(output_scale, buffer[i].scale); + ASSERT_STREQ(expected_values_str[i].c_str(), + ConvertNumericToString(buffer[i]).c_str()); + } +} + +TEST(DecimalArrayFlightSqlAccessor, Test_Decimal128Array_CDataType_NUMERIC_SameScale) { + const std::vector& input_values = {"25.212", "-25.212", "-123456789.123", + "123456789.123"}; + const std::vector& output_values = + input_values; // String values should be the same + + AssertNumericOutput(38, 3, input_values, 38, 3, output_values); +} + +TEST(DecimalArrayFlightSqlAccessor, + Test_Decimal128Array_CDataType_NUMERIC_IncreasingScale) { + const std::vector& input_values = {"25.212", "-25.212", "-123456789.123", + "123456789.123"}; + const std::vector& output_values = {"25.2120", "-25.2120", + "-123456789.1230", "123456789.1230"}; + + AssertNumericOutput(38, 3, input_values, 38, 4, output_values); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/main.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/main.h new file mode 100644 index 00000000000..638dfa30246 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/main.h @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/flight_sql/accessors/binary_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/boolean_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/date_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/decimal_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.h" diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.cc new file mode 100644 index 00000000000..9bc135e9de2 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.cc @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.h" + +namespace driver { +namespace flight_sql { + +using arrow::DoubleArray; +using arrow::FloatArray; +using arrow::Int16Array; +using arrow::Int32Array; +using arrow::Int64Array; +using arrow::Int8Array; +using arrow::UInt16Array; +using arrow::UInt32Array; +using arrow::UInt64Array; +using arrow::UInt8Array; + +template +PrimitiveArrayFlightSqlAccessor< + ARROW_ARRAY, TARGET_TYPE>::PrimitiveArrayFlightSqlAccessor(Array* array) + : FlightSqlAccessor>( + array) {} + +template +size_t PrimitiveArrayFlightSqlAccessor::GetColumnarDataImpl( + ColumnBinding* binding, int64_t starting_row, int64_t cells, int64_t& value_offset, + bool update_value_offset, odbcabstraction::Diagnostics& diagnostics, + uint16_t* row_status_array) { + return CopyFromArrayValuesToBinding(this->GetArray(), binding, + starting_row, cells); +} + +template +size_t PrimitiveArrayFlightSqlAccessor::GetCellLengthImpl( + ColumnBinding* binding) const { + return sizeof(typename ARROW_ARRAY::TypeClass::c_type); +} + +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; +template class PrimitiveArrayFlightSqlAccessor; + +} // 
namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.h new file mode 100644 index 00000000000..30fc0465bb8 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/array.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/common.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/scalar.h" + +namespace driver { +namespace flight_sql { + +template +class PrimitiveArrayFlightSqlAccessor + : public FlightSqlAccessor< + ARROW_ARRAY, TARGET_TYPE, + PrimitiveArrayFlightSqlAccessor> { + public: + explicit PrimitiveArrayFlightSqlAccessor(Array* array); + + size_t GetColumnarDataImpl(ColumnBinding* binding, int64_t starting_row, int64_t cells, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics, + uint16_t* row_status_array); + + size_t GetCellLengthImpl(ColumnBinding* binding) const; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor_test.cc new file mode 100644 index 00000000000..d291b9e08f9 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor_test.cc @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/primitive_array_accessor.h" +#include +#include "arrow/testing/builder.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { + +using arrow::DoubleArray; +using arrow::FloatArray; +using arrow::Int16Array; +using arrow::Int32Array; +using arrow::Int64Array; +using arrow::Int8Array; +using arrow::UInt16Array; +using arrow::UInt32Array; +using arrow::UInt64Array; +using arrow::UInt8Array; + +using arrow::ArrayFromVector; + +template +void TestPrimitiveArraySqlAccessor() { + typedef typename ARROW_ARRAY::TypeClass::c_type c_type; + + std::vector values = {0, 1, 2, 3, 127}; + + std::shared_ptr array; + ArrayFromVector(values, &array); + + PrimitiveArrayFlightSqlAccessor accessor(array.get()); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(TARGET_TYPE, 0, 0, buffer.data(), values.size(), + str_len_buffer.data()); + + int64_t value_offset = 0; + driver::odbcabstraction::Diagnostics diagnostics("Dummy", "Dummy", + odbcabstraction::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (int i = 0; i < values.size(); ++i) { + ASSERT_EQ(sizeof(c_type), str_len_buffer[i]); + ASSERT_EQ(values[i], buffer[i]); + } +} + +using odbcabstraction::CDataType; + +TEST(PrimitiveArrayFlightSqlAccessor, Test_Int64Array_CDataType_SBIGINT) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, 
Test_Int32Array_CDataType_SLONG) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, Test_Int16Array_CDataType_SSHORT) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, Test_Int8Array_CDataType_STINYINT) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, Test_UInt64Array_CDataType_UBIGINT) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, Test_UInt32Array_CDataType_ULONG) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, Test_UInt16Array_CDataType_USHORT) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, Test_UInt8Array_CDataType_UTINYINT) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, Test_FloatArray_CDataType_FLOAT) { + TestPrimitiveArraySqlAccessor(); +} + +TEST(PrimitiveArrayFlightSqlAccessor, Test_DoubleArray_CDataType_DOUBLE) { + TestPrimitiveArraySqlAccessor(); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.cc new file mode 100644 index 00000000000..fc1e97a4765 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.cc @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.h" + +#include +#include "arrow/array.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/encoding.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using arrow::StringArray; +using odbcabstraction::RowStatus; + +namespace { + +#if defined _WIN32 || defined _WIN64 +std::string Utf8ToCLocale(const char* utf8_str, int len) { + thread_local boost::locale::generator g; + g.locale_cache_enabled(true); + std::locale loc = g(boost::locale::util::get_system_locale()); + return boost::locale::conv::from_utf(utf8_str, utf8_str + len, loc); +} +#endif + +template +inline RowStatus MoveSingleCellToCharBuffer(std::vector& buffer, + int64_t& last_retrieved_arrow_row, +#if defined _WIN32 || defined _WIN64 + std::string& clocale_str, +#endif + ColumnBinding* binding, StringArray* array, + int64_t arrow_row, int64_t i, + int64_t& value_offset, + bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics) { + RowStatus result = odbcabstraction::RowStatus_SUCCESS; + + // Arrow strings come as UTF-8 + const char* raw_value = array->Value(arrow_row).data(); + const size_t raw_value_length = array->value_length(arrow_row); + const void* value; + + size_t size_in_bytes; + if (sizeof(CHAR_TYPE) > sizeof(char)) { + if (last_retrieved_arrow_row != arrow_row) { + odbcabstraction::Utf8ToWcs(raw_value, raw_value_length, &buffer); + last_retrieved_arrow_row = arrow_row; + } + value = buffer.data(); + size_in_bytes = buffer.size(); + } else { 
+#if defined _WIN32 || defined _WIN64 + // Convert to C locale string + if (last_retrieved_arrow_row != arrow_row) { + clocale_str = Utf8ToCLocale(raw_value, raw_value_length); + last_retrieved_arrow_row = arrow_row; + } + const char* clocale_data = clocale_str.data(); + size_t clocale_length = clocale_str.size(); + + value = clocale_data; + size_in_bytes = clocale_length; +#else + value = raw_value; + size_in_bytes = raw_value_length; +#endif + } + + size_t remaining_length = static_cast(size_in_bytes - value_offset); + size_t value_length = std::min(remaining_length, binding->buffer_length); + + auto* byte_buffer = static_cast(binding->buffer) + i * binding->buffer_length; + auto* char_buffer = (CHAR_TYPE*)byte_buffer; + memcpy(char_buffer, ((char*)value) + value_offset, value_length); + + // Write a NUL terminator + if (binding->buffer_length >= remaining_length + sizeof(CHAR_TYPE)) { + // The entire remainder of the data was consumed. + char_buffer[remaining_length / sizeof(CHAR_TYPE)] = '\0'; + if (update_value_offset) { + // Mark that there's no data remaining. + value_offset = -1; + } + } else { + result = odbcabstraction::RowStatus_SUCCESS_WITH_INFO; + diagnostics.AddTruncationWarning(); + size_t chars_written = binding->buffer_length / sizeof(CHAR_TYPE); + // If we failed to even write one char, the buffer is too small to hold a + // NUL-terminator. 
+ if (chars_written > 0) { + char_buffer[(chars_written - 1)] = '\0'; + if (update_value_offset) { + value_offset += binding->buffer_length - sizeof(CHAR_TYPE); + } + } + } + + if (binding->str_len_buffer) { + binding->str_len_buffer[i] = static_cast(remaining_length); + } + + return result; +} + +} // namespace + +template +StringArrayFlightSqlAccessor::StringArrayFlightSqlAccessor( + Array* array) + : FlightSqlAccessor>(array), + last_arrow_row_(-1) {} + +template +RowStatus StringArrayFlightSqlAccessor::MoveSingleCellImpl( + ColumnBinding* binding, int64_t arrow_row, int64_t i, int64_t& value_offset, + bool update_value_offset, odbcabstraction::Diagnostics& diagnostics) { + return MoveSingleCellToCharBuffer(buffer_, last_arrow_row_, +#if defined _WIN32 || defined _WIN64 + clocale_str_, +#endif + binding, this->GetArray(), arrow_row, i, + value_offset, update_value_offset, + diagnostics); +} + +template +size_t StringArrayFlightSqlAccessor::GetCellLengthImpl( + ColumnBinding* binding) const { + return binding->buffer_length; +} + +template class StringArrayFlightSqlAccessor; +template class StringArrayFlightSqlAccessor; +template class StringArrayFlightSqlAccessor; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.h new file mode 100644 index 00000000000..ed1a9e83ab4 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/encoding.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/type_fwd.h" + +namespace driver { +namespace flight_sql { + +using arrow::StringArray; +using odbcabstraction::CDataType; +using odbcabstraction::DriverException; +using odbcabstraction::RowStatus; + +using odbcabstraction::GetSqlWCharSize; + +template +class StringArrayFlightSqlAccessor + : public FlightSqlAccessor> { + public: + explicit StringArrayFlightSqlAccessor(Array* array); + + RowStatus MoveSingleCellImpl(ColumnBinding* binding, int64_t arrow_row, int64_t i, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics); + + size_t GetCellLengthImpl(ColumnBinding* binding) const; + + private: + std::vector buffer_; +#if defined _WIN32 || defined _WIN64 + std::string clocale_str_; +#endif + int64_t last_arrow_row_; +}; + +inline Accessor* CreateWCharStringArrayAccessor(arrow::Array* array) { + switch (GetSqlWCharSize()) { + case sizeof(char16_t): + return new StringArrayFlightSqlAccessor( + array); + case sizeof(char32_t): + return new StringArrayFlightSqlAccessor( + array); + default: + assert(false); + throw DriverException("Encoding is unsupported, SQLWCHAR size: " + + std::to_string(GetSqlWCharSize())); + } +} + +} // namespace flight_sql +} // namespace driver diff --git 
a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor_test.cc new file mode 100644 index 00000000000..3289914ffb5 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor_test.cc @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/accessors/string_array_accessor.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/encoding.h" +#include "arrow/testing/builder.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { +using arrow::StringType; +using odbcabstraction::OdbcVersion; + +using arrow::ArrayFromVector; +using odbcabstraction::GetSqlWCharSize; +using odbcabstraction::Utf8ToWcs; + +TEST(StringArrayAccessor, Test_CDataType_CHAR_Basic) { + std::vector values = {"foo", "barx", "baz123"}; + std::shared_ptr array; + ArrayFromVector(values, &array); + + StringArrayFlightSqlAccessor accessor( + array.get()); + + size_t max_str_len = 64; + std::vector buffer(values.size() * max_str_len); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_CHAR, 0, 0, buffer.data(), max_str_len, + str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (int i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i].length(), str_len_buffer[i]); + ASSERT_EQ(values[i], std::string(buffer.data() + i * max_str_len)); + } +} + +TEST(StringArrayAccessor, Test_CDataType_CHAR_Truncation) { + std::vector values = {"ABCDEFABCDEFABCDEFABCDEFABCDEFABCDEFABCDEF"}; + std::shared_ptr array; + ArrayFromVector(values, &array); + + StringArrayFlightSqlAccessor accessor( + array.get()); + + size_t max_str_len = 8; + std::vector buffer(values.size() * max_str_len); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_CHAR, 0, 0, buffer.data(), max_str_len, + str_len_buffer.data()); + + std::stringstream ss; + int64_t value_offset = 0; + + // Construct the whole string by concatenating smaller chunks from + // GetColumnarData + odbcabstraction::Diagnostics 
diagnostics("Foo", "Foo", OdbcVersion::V_3); + do { + diagnostics.Clear(); + int64_t original_value_offset = value_offset; + ASSERT_EQ(1, accessor.GetColumnarData(&binding, 0, 1, value_offset, true, diagnostics, + nullptr)); + ASSERT_EQ(values[0].length() - original_value_offset, str_len_buffer[0]); + + ss << buffer.data(); + } while (value_offset < static_cast(values[0].length()) && value_offset != -1); + + ASSERT_EQ(values[0], ss.str()); +} + +TEST(StringArrayAccessor, Test_CDataType_WCHAR_Basic) { + std::vector values = {"foo", "barx", "baz123"}; + std::shared_ptr array; + ArrayFromVector(values, &array); + + auto accessor = CreateWCharStringArrayAccessor(array.get()); + + size_t max_str_len = 64; + std::vector buffer(values.size() * max_str_len); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_WCHAR, 0, 0, buffer.data(), + max_str_len, str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor->GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (int i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i].length() * GetSqlWCharSize(), str_len_buffer[i]); + std::vector expected; + Utf8ToWcs(values[i].c_str(), &expected); + uint8_t* start = buffer.data() + i * max_str_len; + auto actual = std::vector(start, start + str_len_buffer[i]); + ASSERT_EQ(expected, actual); + } +} + +TEST(StringArrayAccessor, Test_CDataType_WCHAR_Truncation) { + std::vector values = {"ABCDEFA"}; + std::shared_ptr array; + ArrayFromVector(values, &array); + + auto accessor = CreateWCharStringArrayAccessor(array.get()); + + size_t max_str_len = 8; + std::vector buffer(values.size() * max_str_len); + std::vector str_len_buffer(values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_WCHAR, 0, 0, buffer.data(), + max_str_len, str_len_buffer.data()); + + std::basic_stringstream 
ss; + int64_t value_offset = 0; + + // Construct the whole string by concatenating smaller chunks from + // GetColumnarData + std::vector finalStr; + driver::odbcabstraction::Diagnostics diagnostics("Dummy", "Dummy", + odbcabstraction::V_3); + do { + int64_t original_value_offset = value_offset; + ASSERT_EQ(1, accessor->GetColumnarData(&binding, 0, 1, value_offset, true, + diagnostics, nullptr)); + ASSERT_EQ(values[0].length() * GetSqlWCharSize() - original_value_offset, + str_len_buffer[0]); + + size_t length = value_offset - original_value_offset; + if (value_offset == -1) { + length = buffer.size(); + } + finalStr.insert(finalStr.end(), buffer.data(), buffer.data() + length); + } while (value_offset < static_cast(values[0].length() * GetSqlWCharSize()) && + value_offset != -1); + + // Trim final null bytes + finalStr.resize(values[0].length() * GetSqlWCharSize()); + + std::vector expected; + Utf8ToWcs(values[0].c_str(), &expected); + ASSERT_EQ(expected, finalStr); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.cc new file mode 100644 index 00000000000..0ffa9fc84a7 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.cc @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/calendar_utils.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using arrow::Time32Array; +using arrow::Time64Array; +using arrow::TimeType; +using arrow::TimeUnit; + +using odbcabstraction::DriverException; +using odbcabstraction::GetTimeForSecondsSinceEpoch; +using odbcabstraction::TIME_STRUCT; + +Accessor* CreateTimeAccessor(arrow::Array* array, arrow::Type::type type) { + auto time_type = arrow::internal::checked_pointer_cast(array->type()); + auto time_unit = time_type->unit(); + + if (type == arrow::Type::TIME32) { + switch (time_unit) { + case TimeUnit::SECOND: + return new TimeArrayFlightSqlAccessor(array); + case TimeUnit::MILLI: + return new TimeArrayFlightSqlAccessor(array); + case TimeUnit::MICRO: + return new TimeArrayFlightSqlAccessor(array); + case TimeUnit::NANO: + return new TimeArrayFlightSqlAccessor(array); + } + } else if (type == arrow::Type::TIME64) { + switch (time_unit) { + case TimeUnit::SECOND: + return new TimeArrayFlightSqlAccessor(array); + case TimeUnit::MILLI: + return new TimeArrayFlightSqlAccessor(array); + case TimeUnit::MICRO: + return new TimeArrayFlightSqlAccessor(array); + case TimeUnit::NANO: + return new TimeArrayFlightSqlAccessor(array); + } + } + assert(false); + throw DriverException("Unsupported input supplied to CreateTimeAccessor"); +} + +namespace { +template +int64_t ConvertTimeValue(typename T::value_type value, TimeUnit::type 
unit) { + return value; +} + +template <> +int64_t ConvertTimeValue(int32_t value, TimeUnit::type unit) { + return unit == TimeUnit::SECOND ? value + : value / odbcabstraction::MILLI_TO_SECONDS_DIVISOR; +} + +template <> +int64_t ConvertTimeValue(int64_t value, TimeUnit::type unit) { + return unit == TimeUnit::MICRO ? value / odbcabstraction::MICRO_TO_SECONDS_DIVISOR + : value / odbcabstraction::NANO_TO_SECONDS_DIVISOR; +} +} // namespace + +template +TimeArrayFlightSqlAccessor::TimeArrayFlightSqlAccessor( + Array* array) + : FlightSqlAccessor>( + array) {} + +template +RowStatus TimeArrayFlightSqlAccessor::MoveSingleCellImpl( + ColumnBinding* binding, int64_t arrow_row, int64_t cell_counter, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostic) { + auto* buffer = static_cast(binding->buffer); + + tm time{}; + + auto converted_value_seconds = + ConvertTimeValue(this->GetArray()->Value(arrow_row), UNIT); + + GetTimeForSecondsSinceEpoch(converted_value_seconds, time); + + buffer[cell_counter].hour = time.tm_hour; + buffer[cell_counter].minute = time.tm_min; + buffer[cell_counter].second = time.tm_sec; + + if (binding->str_len_buffer) { + binding->str_len_buffer[cell_counter] = + static_cast(GetCellLengthImpl(binding)); + } + return odbcabstraction::RowStatus_SUCCESS; +} + +template +size_t TimeArrayFlightSqlAccessor::GetCellLengthImpl( + ColumnBinding* binding) const { + return sizeof(TIME_STRUCT); +} + +template class TimeArrayFlightSqlAccessor; +template class TimeArrayFlightSqlAccessor; +template class TimeArrayFlightSqlAccessor; +template class TimeArrayFlightSqlAccessor; +template class TimeArrayFlightSqlAccessor; +template class TimeArrayFlightSqlAccessor; +template class TimeArrayFlightSqlAccessor; +template class TimeArrayFlightSqlAccessor; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.h 
b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.h new file mode 100644 index 00000000000..0d1997d7281 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/type_fwd.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using odbcabstraction::RowStatus; + +Accessor* CreateTimeAccessor(arrow::Array* array, arrow::Type::type type); + +template +class TimeArrayFlightSqlAccessor + : public FlightSqlAccessor< + ARROW_ARRAY, TARGET_TYPE, + TimeArrayFlightSqlAccessor> { + public: + explicit TimeArrayFlightSqlAccessor(Array* array); + + RowStatus MoveSingleCellImpl(ColumnBinding* binding, int64_t arrow_row, + int64_t cell_counter, int64_t& value_offset, + bool update_value_offset, + odbcabstraction::Diagnostics& diagnostic); + + size_t GetCellLengthImpl(ColumnBinding* binding) const; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor_test.cc new file mode 100644 index 00000000000..e9b5b95b2ca --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor_test.cc @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/time_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/testing/builder.h" +#include "gtest/gtest.h" +#include "odbcabstraction/calendar_utils.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using arrow::Time32Array; +using arrow::Time32Type; +using arrow::Time64Array; +using arrow::Time64Type; +using arrow::TimeUnit; + +using odbcabstraction::OdbcVersion; +using odbcabstraction::TIME_STRUCT; + +using arrow::ArrayFromVector; +using odbcabstraction::GetTimeForSecondsSinceEpoch; + +TEST(TEST_TIME32, TIME_WITH_SECONDS) { + auto value_field = field("f0", time32(TimeUnit::SECOND)); + + std::vector t32_values = {14896, 14897, 14892, 85400, 14893, 14895}; + + std::shared_ptr time32_array; + ArrayFromVector(value_field->type(), t32_values, &time32_array); + + TimeArrayFlightSqlAccessor + accessor(time32_array.get()); + + std::vector buffer(t32_values.size()); + std::vector str_len_buffer(t32_values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_TIME, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(t32_values.size(), + accessor.GetColumnarData(&binding, 0, t32_values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < t32_values.size(); ++i) { + ASSERT_EQ(sizeof(TIME_STRUCT), str_len_buffer[i]); + + tm time{}; + + GetTimeForSecondsSinceEpoch(t32_values[i], time); + ASSERT_EQ(buffer[i].hour, time.tm_hour); + ASSERT_EQ(buffer[i].minute, time.tm_min); + ASSERT_EQ(buffer[i].second, time.tm_sec); + } +} + +TEST(TEST_TIME32, TIME_WITH_MILLI) { + auto value_field = field("f0", time32(TimeUnit::MILLI)); + std::vector t32_values = {14896000, 14897000, 14892000, + 85400000, 14893000, 14895000}; + + 
std::shared_ptr time32_array; + ArrayFromVector(value_field->type(), t32_values, &time32_array); + + TimeArrayFlightSqlAccessor + accessor(time32_array.get()); + + std::vector buffer(t32_values.size()); + std::vector str_len_buffer(t32_values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_TIME, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(t32_values.size(), + accessor.GetColumnarData(&binding, 0, t32_values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < t32_values.size(); ++i) { + ASSERT_EQ(sizeof(TIME_STRUCT), str_len_buffer[i]); + + tm time{}; + + auto converted_value = t32_values[i] / odbcabstraction::MILLI_TO_SECONDS_DIVISOR; + GetTimeForSecondsSinceEpoch(converted_value, time); + + ASSERT_EQ(buffer[i].hour, time.tm_hour); + ASSERT_EQ(buffer[i].minute, time.tm_min); + ASSERT_EQ(buffer[i].second, time.tm_sec); + } +} + +TEST(TEST_TIME64, TIME_WITH_MICRO) { + auto value_field = field("f0", time64(TimeUnit::MICRO)); + + std::vector t64_values = {14896000, 14897000, 14892000, + 85400000, 14893000, 14895000}; + + std::shared_ptr time64_array; + ArrayFromVector(value_field->type(), t64_values, &time64_array); + + TimeArrayFlightSqlAccessor + accessor(time64_array.get()); + + std::vector buffer(t64_values.size()); + std::vector str_len_buffer(t64_values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_TIME, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(t64_values.size(), + accessor.GetColumnarData(&binding, 0, t64_values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < t64_values.size(); ++i) { + ASSERT_EQ(sizeof(TIME_STRUCT), str_len_buffer[i]); + + tm time{}; + + const auto convertedValue = t64_values[i] / 
odbcabstraction::MICRO_TO_SECONDS_DIVISOR; + GetTimeForSecondsSinceEpoch(convertedValue, time); + + ASSERT_EQ(buffer[i].hour, time.tm_hour); + ASSERT_EQ(buffer[i].minute, time.tm_min); + ASSERT_EQ(buffer[i].second, time.tm_sec); + } +} + +TEST(TEST_TIME64, TIME_WITH_NANO) { + auto value_field = field("f0", time64(TimeUnit::NANO)); + std::vector t64_values = {14896000000, 14897000000, 14892000000, + 85400000000, 14893000000, 14895000000}; + + std::shared_ptr time64_array; + ArrayFromVector(value_field->type(), t64_values, &time64_array); + + TimeArrayFlightSqlAccessor + accessor(time64_array.get()); + + std::vector buffer(t64_values.size()); + std::vector str_len_buffer(t64_values.size()); + + ColumnBinding binding(odbcabstraction::CDataType_TIME, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + + int64_t value_offset = 0; + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(t64_values.size(), + accessor.GetColumnarData(&binding, 0, t64_values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < t64_values.size(); ++i) { + ASSERT_EQ(sizeof(TIME_STRUCT), str_len_buffer[i]); + + tm time{}; + + const auto converted_value = t64_values[i] / odbcabstraction::NANO_TO_SECONDS_DIVISOR; + GetTimeForSecondsSinceEpoch(converted_value, time); + + ASSERT_EQ(buffer[i].hour, time.tm_hour); + ASSERT_EQ(buffer[i].minute, time.tm_min); + ASSERT_EQ(buffer[i].second, time.tm_sec); + } +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.cc new file mode 100644 index 00000000000..3eb05f96a6e --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.cc @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/calendar_utils.h" + +using arrow::TimeUnit; + +namespace { +int64_t GetConversionToSecondsDivisor(TimeUnit::type unit) { + int64_t divisor = 1; + switch (unit) { + case TimeUnit::SECOND: + divisor = 1; + break; + case TimeUnit::MILLI: + divisor = driver::odbcabstraction::MILLI_TO_SECONDS_DIVISOR; + break; + case TimeUnit::MICRO: + divisor = driver::odbcabstraction::MICRO_TO_SECONDS_DIVISOR; + break; + case TimeUnit::NANO: + divisor = driver::odbcabstraction::NANO_TO_SECONDS_DIVISOR; + break; + default: + assert(false); + throw driver::odbcabstraction::DriverException("Unrecognized time unit value: " + + std::to_string(unit)); + } + return divisor; +} + +uint32_t CalculateFraction(TimeUnit::type unit, int64_t units_since_epoch) { + /** + * Convert the given remainder and time unit to nanoseconds + * since the fraction field on TIMESTAMP_STRUCT is in nanoseconds. 
+ */ + if (unit == TimeUnit::SECOND) { + return 0; + } + + const int64_t divisor = GetConversionToSecondsDivisor(unit); + const int64_t nano_divisor = GetConversionToSecondsDivisor(TimeUnit::NANO); + + // Safe remainder calculation that always gives a non-negative result + int64_t remainder = units_since_epoch % divisor; + if (remainder < 0) { + remainder += divisor; + } + + // Scale to nanoseconds + return static_cast(remainder * (nano_divisor / divisor)); +} +} // namespace + +namespace driver { +namespace flight_sql { + +using odbcabstraction::TIMESTAMP_STRUCT; + +using odbcabstraction::GetTimeForSecondsSinceEpoch; + +template +TimestampArrayFlightSqlAccessor::TimestampArrayFlightSqlAccessor( + Array* array) + : FlightSqlAccessor>(array) {} + +template +RowStatus TimestampArrayFlightSqlAccessor::MoveSingleCellImpl( + ColumnBinding* binding, int64_t arrow_row, int64_t cell_counter, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics) { + auto* buffer = static_cast(binding->buffer); + + int64_t value = this->GetArray()->Value(arrow_row); + const auto divisor = GetConversionToSecondsDivisor(UNIT); + const auto converted_result_seconds = + // We want floor division here; C++ will round towards zero + (value < 0) + /** + * Floor division: Shift all "fractional" (not a multiple of divisor) values so + * they round towards zero (and to the same value) along with the "floor" less + * than them, then add 1 to get back to the floor. Alternative we could shift + * negatively by (divisor - 1) but this breaks near INT64_MIN causing underflow. + */ + ? 
((value + 1) / divisor) - 1 + // Towards zero is already floor + : value / divisor; + tm timestamp = {0}; + + GetTimeForSecondsSinceEpoch(converted_result_seconds, timestamp); + + buffer[cell_counter].year = 1900 + (timestamp.tm_year); + buffer[cell_counter].month = timestamp.tm_mon + 1; + buffer[cell_counter].day = timestamp.tm_mday; + buffer[cell_counter].hour = timestamp.tm_hour; + buffer[cell_counter].minute = timestamp.tm_min; + buffer[cell_counter].second = timestamp.tm_sec; + buffer[cell_counter].fraction = CalculateFraction(UNIT, value); + + if (binding->str_len_buffer) { + binding->str_len_buffer[cell_counter] = + static_cast(GetCellLengthImpl(binding)); + } + + return odbcabstraction::RowStatus_SUCCESS; +} + +template +size_t TimestampArrayFlightSqlAccessor::GetCellLengthImpl( + ColumnBinding* binding) const { + return sizeof(TIMESTAMP_STRUCT); +} + +template class TimestampArrayFlightSqlAccessor; +template class TimestampArrayFlightSqlAccessor; +template class TimestampArrayFlightSqlAccessor; +template class TimestampArrayFlightSqlAccessor; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.h new file mode 100644 index 00000000000..ad449a6f828 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/type_fwd.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using arrow::TimestampArray; +using arrow::TimeUnit; +using odbcabstraction::RowStatus; + +template +class TimestampArrayFlightSqlAccessor + : public FlightSqlAccessor> { + public: + explicit TimestampArrayFlightSqlAccessor(Array* array); + + RowStatus MoveSingleCellImpl(ColumnBinding* binding, int64_t arrow_row, + int64_t cell_counter, int64_t& value_offset, + bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics); + + size_t GetCellLengthImpl(ColumnBinding* binding) const; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor_test.cc new file mode 100644 index 00000000000..b99d954c870 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor_test.cc @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/accessors/timestamp_array_accessor.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/testing/builder.h" +#include "gtest/gtest.h" +#include "odbcabstraction/calendar_utils.h" + +namespace driver { +namespace flight_sql { + +using arrow::ArrayFromVector; +using arrow::TimestampType; +using arrow::TimeUnit; + +using odbcabstraction::OdbcVersion; +using odbcabstraction::TIMESTAMP_STRUCT; + +using odbcabstraction::GetTimeForSecondsSinceEpoch; + +TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_MILLI) { + std::vector values = {86400370, 172800000, 259200000, + 1649793238110LL, 345600000, 432000000, + 518400000, -86399000, 0, + -86399999, -86399001, 86400001, + 86400999, -3786912000000, -5364662400000, + -1500, -24268068949000}; + std::vector expected = { + /* year(16), month(u16), day(u16), hour(u16), minute(u16), second(u16), + fraction(u32) */ + {1970, 1, 2, 0, 0, 0, 370000000}, + {1970, 1, 3, 0, 0, 0, 0}, + {1970, 1, 4, 0, 0, 0, 0}, + {2022, 4, 12, 19, 53, 58, 110000000}, + {1970, 1, 5, 0, 0, 0, 0}, + {1970, 1, 6, 0, 0, 0, 0}, + {1970, 1, 7, 0, 0, 0, 0}, + {1969, 12, 31, 0, 0, 1, 0}, + {1970, 1, 1, 0, 0, 0, 0}, + /* Tests both ends of the fraction rounding range to ensure we don't tip the wrong + way */ + {1969, 12, 31, 0, 0, 0, 1000000}, + {1969, 12, 31, 0, 0, 0, 999000000}, + {1970, 1, 2, 0, 0, 0, 1000000}, + {1970, 1, 2, 0, 0, 0, 999000000}, + {1849, 12, 31, 0, 0, 0, 0U}, + {1800, 1, 1, 0, 0, 0, 0U}, + {1969, 12, 31, 23, 59, 58, 500000000U}, + {1200, 12, 22, 13, 44, 11, 0U}, + }; + + 
std::shared_ptr timestamp_array; + + auto timestamp_field = field("timestamp_field", timestamp(TimeUnit::MILLI)); + ArrayFromVector(timestamp_field->type(), values, + ×tamp_array); + + TimestampArrayFlightSqlAccessor + accessor(timestamp_array.get()); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + int64_t value_offset = 0; + ColumnBinding binding(odbcabstraction::CDataType_TIMESTAMP, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(sizeof(TIMESTAMP_STRUCT), str_len_buffer[i]); + + ASSERT_EQ(buffer[i].year, expected[i].year); + ASSERT_EQ(buffer[i].month, expected[i].month); + ASSERT_EQ(buffer[i].day, expected[i].day); + ASSERT_EQ(buffer[i].hour, expected[i].hour); + ASSERT_EQ(buffer[i].minute, expected[i].minute); + ASSERT_EQ(buffer[i].second, expected[i].second); + ASSERT_EQ(buffer[i].fraction, expected[i].fraction); + } +} + +TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_SECONDS) { + std::vector values = {86400, 172800, 259200, 1649793238, + 345600, 432000, 518400}; + + std::shared_ptr timestamp_array; + + auto timestamp_field = field("timestamp_field", timestamp(TimeUnit::SECOND)); + ArrayFromVector(timestamp_field->type(), values, + ×tamp_array); + + TimestampArrayFlightSqlAccessor + accessor(timestamp_array.get()); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + int64_t value_offset = 0; + ColumnBinding binding(odbcabstraction::CDataType_TIMESTAMP, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < 
values.size(); ++i) { + ASSERT_EQ(sizeof(TIMESTAMP_STRUCT), str_len_buffer[i]); + tm date{}; + + auto converted_time = values[i]; + GetTimeForSecondsSinceEpoch(converted_time, date); + + ASSERT_EQ(buffer[i].year, 1900 + (date.tm_year)); + ASSERT_EQ(buffer[i].month, date.tm_mon + 1); + ASSERT_EQ(buffer[i].day, date.tm_mday); + ASSERT_EQ(buffer[i].hour, date.tm_hour); + ASSERT_EQ(buffer[i].minute, date.tm_min); + ASSERT_EQ(buffer[i].second, date.tm_sec); + ASSERT_EQ(buffer[i].fraction, 0); + } +} + +TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_MICRO) { + std::vector values = {86400000000, 1649793238000000}; + + std::shared_ptr timestamp_array; + + auto timestamp_field = field("timestamp_field", timestamp(TimeUnit::MICRO)); + ArrayFromVector(timestamp_field->type(), values, + ×tamp_array); + + TimestampArrayFlightSqlAccessor + accessor(timestamp_array.get()); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + int64_t value_offset = 0; + ColumnBinding binding(odbcabstraction::CDataType_TIMESTAMP, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(sizeof(TIMESTAMP_STRUCT), str_len_buffer[i]); + + tm date{}; + + auto converted_time = values[i] / odbcabstraction::MICRO_TO_SECONDS_DIVISOR; + GetTimeForSecondsSinceEpoch(converted_time, date); + + ASSERT_EQ(buffer[i].year, 1900 + (date.tm_year)); + ASSERT_EQ(buffer[i].month, date.tm_mon + 1); + ASSERT_EQ(buffer[i].day, date.tm_mday); + ASSERT_EQ(buffer[i].hour, date.tm_hour); + ASSERT_EQ(buffer[i].minute, date.tm_min); + ASSERT_EQ(buffer[i].second, date.tm_sec); + constexpr uint32_t MICROS_PER_NANO = 1000; + ASSERT_EQ(buffer[i].fraction, + (values[i] % odbcabstraction::MICRO_TO_SECONDS_DIVISOR) * MICROS_PER_NANO); + } +} + 
+TEST(TEST_TIMESTAMP, TIMESTAMP_WITH_NANO) { + std::vector values = {86400000010000, 1649793238000000000}; + + std::shared_ptr timestamp_array; + + auto timestamp_field = field("timestamp_field", timestamp(TimeUnit::NANO)); + ArrayFromVector(timestamp_field->type(), values, + ×tamp_array); + + TimestampArrayFlightSqlAccessor + accessor(timestamp_array.get()); + + std::vector buffer(values.size()); + std::vector str_len_buffer(values.size()); + + int64_t value_offset = 0; + ColumnBinding binding(odbcabstraction::CDataType_TIMESTAMP, 0, 0, buffer.data(), 0, + str_len_buffer.data()); + + odbcabstraction::Diagnostics diagnostics("Foo", "Foo", OdbcVersion::V_3); + ASSERT_EQ(values.size(), + accessor.GetColumnarData(&binding, 0, values.size(), value_offset, false, + diagnostics, nullptr)); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(sizeof(TIMESTAMP_STRUCT), str_len_buffer[i]); + tm date{}; + + auto converted_time = values[i] / odbcabstraction::NANO_TO_SECONDS_DIVISOR; + GetTimeForSecondsSinceEpoch(converted_time, date); + + ASSERT_EQ(buffer[i].year, 1900 + (date.tm_year)); + ASSERT_EQ(buffer[i].month, date.tm_mon + 1); + ASSERT_EQ(buffer[i].day, date.tm_mday); + ASSERT_EQ(buffer[i].hour, date.tm_hour); + ASSERT_EQ(buffer[i].minute, date.tm_min); + ASSERT_EQ(buffer[i].second, date.tm_sec); + ASSERT_EQ(buffer[i].fraction, (values[i] % odbcabstraction::NANO_TO_SECONDS_DIVISOR)); + } +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/types.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/types.h new file mode 100644 index 00000000000..4e96b53aff9 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/accessors/types.h @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using odbcabstraction::CDataType; + +class FlightSqlResultSet; + +struct ColumnBinding { + void* buffer; + ssize_t* str_len_buffer; + size_t buffer_length; + CDataType target_type; + int precision; + int scale; + + ColumnBinding() = default; + + ColumnBinding(CDataType target_type, int precision, int scale, void* buffer, + size_t buffer_length, ssize_t* str_len_buffer) + : target_type(target_type), + precision(precision), + scale(scale), + buffer(buffer), + buffer_length(buffer_length), + str_len_buffer(str_len_buffer) {} +}; + +/// \brief Accessor interface meant to provide a way of populating data of a +/// single column to buffers bound by `ColumnarResultSet::BindColumn`. 
+class Accessor { + public: + const CDataType target_type_; + + explicit Accessor(CDataType target_type) : target_type_(target_type) {} + + virtual ~Accessor() = default; + + /// \brief Populates next cells + virtual size_t GetColumnarData(ColumnBinding* binding, int64_t starting_row, + size_t cells, int64_t& value_offset, + bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics, + uint16_t* row_status_array) = 0; + + virtual size_t GetCellLength(ColumnBinding* binding) const = 0; +}; + +template +class FlightSqlAccessor : public Accessor { + public: + explicit FlightSqlAccessor(Array* array) + : Accessor(TARGET_TYPE), + array_(arrow::internal::checked_cast(array)) {} + + size_t GetColumnarData(ColumnBinding* binding, int64_t starting_row, size_t cells, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics, + uint16_t* row_status_array) override { + return static_cast(this)->GetColumnarDataImpl( + binding, starting_row, cells, value_offset, update_value_offset, diagnostics, + row_status_array); + } + + size_t GetCellLength(ColumnBinding* binding) const override { + return static_cast(this)->GetCellLengthImpl(binding); + } + + protected: + size_t GetColumnarDataImpl(ColumnBinding* binding, int64_t starting_row, int64_t cells, + int64_t& value_offset, bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics, + uint16_t* row_status_array) { + for (int64_t i = 0; i < cells; ++i) { + int64_t current_arrow_row = starting_row + i; + if (array_->IsNull(current_arrow_row)) { + if (binding->str_len_buffer) { + binding->str_len_buffer[i] = odbcabstraction::NULL_DATA; + } else { + throw odbcabstraction::NullWithoutIndicatorException(); + } + } else { + // TODO: Optimize this by creating different versions of MoveSingleCell + // depending on if str_len_buffer is null. 
+ auto row_status = MoveSingleCell(binding, current_arrow_row, i, value_offset, + update_value_offset, diagnostics); + if (row_status_array) { + row_status_array[i] = row_status; + } + } + } + + return static_cast(cells); + } + + inline ARROW_ARRAY* GetArray() { return array_; } + + private: + ARROW_ARRAY* array_; + + odbcabstraction::RowStatus MoveSingleCell(ColumnBinding* binding, int64_t arrow_row, + int64_t i, int64_t& value_offset, + bool update_value_offset, + odbcabstraction::Diagnostics& diagnostics) { + return static_cast(this)->MoveSingleCellImpl( + binding, arrow_row, i, value_offset, update_value_offset, diagnostics); + } + + odbcabstraction::RowStatus MoveSingleCellImpl( + ColumnBinding* binding, int64_t arrow_row, int64_t i, int64_t& value_offset, + bool update_value_offset, odbcabstraction::Diagnostics& diagnostics) { + std::stringstream ss; + ss << "Unknown type conversion from StringArray to target C type " << TARGET_TYPE; + throw odbcabstraction::DriverException(ss.str()); + } +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/address_info.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/address_info.cc new file mode 100644 index 00000000000..9d782c57e96 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/address_info.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/address_info.h" + +namespace driver { + +bool AddressInfo::GetAddressInfo(const std::string& host, char* host_name_info, + int64_t max_host) { + if (addrinfo_result_) { + freeaddrinfo(addrinfo_result_); + addrinfo_result_ = nullptr; + } + + int error; + error = getaddrinfo(host.c_str(), NULL, NULL, &addrinfo_result_); + + if (error != 0) { + return false; + } + + error = getnameinfo(addrinfo_result_->ai_addr, addrinfo_result_->ai_addrlen, + host_name_info, static_cast(max_host), NULL, 0, 0); + return error == 0; +} + +AddressInfo::~AddressInfo() { + if (addrinfo_result_) { + freeaddrinfo(addrinfo_result_); + addrinfo_result_ = nullptr; + } +} + +AddressInfo::AddressInfo() : addrinfo_result_(nullptr) {} +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/address_info.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/address_info.h new file mode 100644 index 00000000000..312d5689a98 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/address_info.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#if !_WIN32 +# include +#endif + +namespace driver { + +class AddressInfo { + private: + struct addrinfo* addrinfo_result_; + + public: + AddressInfo(); + + ~AddressInfo(); + + bool GetAddressInfo(const std::string& host, char* host_name_info, int64_t max_host); +}; +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/config/configuration.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/config/configuration.cc new file mode 100644 index 00000000000..2dee551915e --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/config/configuration.cc @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/configuration.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" + +#include +#include +#include +#include +#include + +namespace driver { +namespace flight_sql { +namespace config { + +static const char DEFAULT_DSN[] = "Apache Arrow Flight SQL"; +static const char DEFAULT_ENABLE_ENCRYPTION[] = TRUE_STR; +static const char DEFAULT_USE_CERT_STORE[] = TRUE_STR; +static const char DEFAULT_DISABLE_CERT_VERIFICATION[] = FALSE_STR; + +namespace { +std::string ReadDsnString(const std::string& dsn, const std::string_view& key, + const std::string& dflt = "") { +#define BUFFER_SIZE (1024) + std::vector buf(BUFFER_SIZE); + + std::string key_str = std::string(key); + int ret = + SQLGetPrivateProfileString(dsn.c_str(), key_str.c_str(), dflt.c_str(), buf.data(), + static_cast(buf.size()), "ODBC.INI"); + + if (ret > BUFFER_SIZE) { + // If there wasn't enough space, try again with the right size buffer. + buf.resize(ret + 1); + ret = + SQLGetPrivateProfileString(dsn.c_str(), key_str.c_str(), dflt.c_str(), buf.data(), + static_cast(buf.size()), "ODBC.INI"); + } + + return std::string(buf.data(), ret); +} + +void RemoveAllKnownKeys(std::vector& keys) { + // Remove all known DSN keys from the passed in set of keys, case insensitively. + keys.erase(std::remove_if(keys.begin(), keys.end(), + [&](auto& x) { + return std::find_if( + FlightSqlConnection::ALL_KEYS.begin(), + FlightSqlConnection::ALL_KEYS.end(), + [&](auto& s) { return boost::iequals(x, s); }) != + FlightSqlConnection::ALL_KEYS.end(); + }), + keys.end()); +} + +std::vector ReadAllKeys(const std::string& dsn) { + std::vector buf(BUFFER_SIZE); + + int ret = SQLGetPrivateProfileString(dsn.c_str(), NULL, "", buf.data(), + static_cast(buf.size()), "ODBC.INI"); + + if (ret > BUFFER_SIZE) { + // If there wasn't enough space, try again with the right size buffer. 
+ buf.resize(ret + 1); + ret = SQLGetPrivateProfileString(dsn.c_str(), NULL, "", buf.data(), + static_cast(buf.size()), "ODBC.INI"); + } + + // When you pass NULL to SQLGetPrivateProfileString it gives back a \0 delimited list of + // all the keys. The below loop simply tokenizes all the keys and places them into a + // vector. + std::vector keys; + char* begin = buf.data(); + while (begin && *begin != '\0') { + char* cur; + for (cur = begin; *cur != '\0'; ++cur) { + } + keys.emplace_back(begin, cur); + begin = ++cur; + } + return keys; +} +} // namespace + +Configuration::Configuration() { + // No-op. +} + +Configuration::~Configuration() { + // No-op. +} + +void Configuration::LoadDefaults() { + Set(FlightSqlConnection::DSN, DEFAULT_DSN); + Set(FlightSqlConnection::USE_ENCRYPTION, DEFAULT_ENABLE_ENCRYPTION); + Set(FlightSqlConnection::USE_SYSTEM_TRUST_STORE, DEFAULT_USE_CERT_STORE); + Set(FlightSqlConnection::DISABLE_CERTIFICATE_VERIFICATION, + DEFAULT_DISABLE_CERT_VERIFICATION); +} + +void Configuration::LoadDsn(const std::string& dsn) { + Set(FlightSqlConnection::DSN, dsn); + Set(FlightSqlConnection::HOST, ReadDsnString(dsn, FlightSqlConnection::HOST)); + Set(FlightSqlConnection::PORT, ReadDsnString(dsn, FlightSqlConnection::PORT)); + Set(FlightSqlConnection::TOKEN, ReadDsnString(dsn, FlightSqlConnection::TOKEN)); + Set(FlightSqlConnection::UID, ReadDsnString(dsn, FlightSqlConnection::UID)); + Set(FlightSqlConnection::PWD, ReadDsnString(dsn, FlightSqlConnection::PWD)); + Set(FlightSqlConnection::USE_ENCRYPTION, + ReadDsnString(dsn, FlightSqlConnection::USE_ENCRYPTION, DEFAULT_ENABLE_ENCRYPTION)); + Set(FlightSqlConnection::TRUSTED_CERTS, + ReadDsnString(dsn, FlightSqlConnection::TRUSTED_CERTS)); + Set(FlightSqlConnection::USE_SYSTEM_TRUST_STORE, + ReadDsnString(dsn, FlightSqlConnection::USE_SYSTEM_TRUST_STORE, + DEFAULT_USE_CERT_STORE)); + Set(FlightSqlConnection::DISABLE_CERTIFICATE_VERIFICATION, + ReadDsnString(dsn, 
FlightSqlConnection::DISABLE_CERTIFICATE_VERIFICATION, + DEFAULT_DISABLE_CERT_VERIFICATION)); + + auto customKeys = ReadAllKeys(dsn); + RemoveAllKnownKeys(customKeys); + for (auto key : customKeys) { + std::string_view key_sv(key); + Set(key, ReadDsnString(dsn, key_sv)); + } +} + +void Configuration::Clear() { this->properties_.clear(); } + +bool Configuration::IsSet(const std::string_view& key) const { + return 0 != this->properties_.count(key); +} + +const std::string& Configuration::Get(const std::string_view& key) const { + const auto itr = this->properties_.find(key); + if (itr == this->properties_.cend()) { + static const std::string empty(""); + return empty; + } + return itr->second; +} + +void Configuration::Set(const std::string_view& key, const std::string& value) { + const std::string copy = boost::trim_copy(value); + if (!copy.empty()) { + this->properties_[key] = value; + } +} + +const driver::odbcabstraction::Connection::ConnPropertyMap& Configuration::GetProperties() + const { + return this->properties_; +} + +std::vector Configuration::GetCustomKeys() const { + driver::odbcabstraction::Connection::ConnPropertyMap copy_props(properties_); + for (auto& key : FlightSqlConnection::ALL_KEYS) { + copy_props.erase(key); + } + std::vector keys; + boost::copy(copy_props | boost::adaptors::map_keys, std::back_inserter(keys)); + return keys; +} + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/config/connection_string_parser.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/config/connection_string_parser.cc new file mode 100644 index 00000000000..df218bd021b --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/config/connection_string_parser.cc @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/connection_string_parser.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace driver { +namespace flight_sql { +namespace config { + +ConnectionStringParser::ConnectionStringParser(Configuration& cfg) : cfg_(cfg) { + // No-op. +} + +ConnectionStringParser::~ConnectionStringParser() { + // No-op. 
+} + +void ConnectionStringParser::ParseConnectionString(const char* str, size_t len, + char delimiter) { + std::string connect_str(str, len); + + while (connect_str.rbegin() != connect_str.rend() && *connect_str.rbegin() == 0) + connect_str.erase(connect_str.size() - 1); + + while (!connect_str.empty()) { + size_t attr_begin = connect_str.rfind(delimiter); + + if (attr_begin == std::string::npos) + attr_begin = 0; + else + ++attr_begin; + + size_t attr_eq_pos = connect_str.rfind('='); + + if (attr_eq_pos == std::string::npos) attr_eq_pos = 0; + + if (attr_begin < attr_eq_pos) { + const char* key_begin = connect_str.data() + attr_begin; + const char* key_end = connect_str.data() + attr_eq_pos; + std::string key(key_begin, key_end); + boost::algorithm::trim(key); + + const char* value_begin = connect_str.data() + attr_eq_pos + 1; + const char* value_end = connect_str.data() + connect_str.size(); + std::string value(value_begin, value_end); + boost::algorithm::trim(value); + + if (value[0] == '{' && value[value.size() - 1] == '}') { + value = value.substr(1, value.size() - 2); + } + + cfg_.Set(key, value); + } + + if (!attr_begin) break; + + connect_str.erase(attr_begin - 1); + } +} + +void ConnectionStringParser::ParseConnectionString(const std::string& str) { + ParseConnectionString(str.data(), str.size(), ';'); +} + +void ConnectionStringParser::ParseConfigAttributes(const char* str) { + size_t len = 0; + + // Getting list length. List is terminated by two '\0'. 
+ while (str[len] || str[len + 1]) ++len; + + ++len; + + ParseConnectionString(str, len, '\0'); +} + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_auth_method.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_auth_method.cc new file mode 100644 index 00000000000..4c9e69d07df --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_auth_method.cc @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_auth_method.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" + +#include "arrow/flight/client.h" +#include "arrow/result.h" +#include "arrow/status.h" + +#include + +namespace driver { +namespace flight_sql { + +using arrow::Result; +using arrow::flight::FlightCallOptions; +using arrow::flight::FlightClient; +using arrow::flight::TimeoutDuration; +using driver::odbcabstraction::AuthenticationException; +using driver::odbcabstraction::CommunicationException; +using driver::odbcabstraction::Connection; + +namespace { +class NoOpAuthMethod : public FlightSqlAuthMethod { + public: + void Authenticate(FlightSqlConnection& connection, + FlightCallOptions& call_options) override { + // Do nothing + } +}; + +class NoOpClientAuthHandler : public arrow::flight::ClientAuthHandler { + public: + NoOpClientAuthHandler() {} + + arrow::Status Authenticate(arrow::flight::ClientAuthSender* outgoing, + arrow::flight::ClientAuthReader* incoming) override { + // Write a blank string. The server should ignore this and just accept any Handshake + // request. 
+ return outgoing->Write(std::string()); + } + + arrow::Status GetToken(std::string* token) override { + *token = std::string(); + return arrow::Status::OK(); + } +}; + +class UserPasswordAuthMethod : public FlightSqlAuthMethod { + public: + UserPasswordAuthMethod(FlightClient& client, std::string user, std::string password) + : client_(client), user_(std::move(user)), password_(std::move(password)) {} + + void Authenticate(FlightSqlConnection& connection, + FlightCallOptions& call_options) override { + FlightCallOptions auth_call_options; + const boost::optional& login_timeout = + connection.GetAttribute(Connection::LOGIN_TIMEOUT); + if (login_timeout && boost::get(*login_timeout) > 0) { + // ODBC's LOGIN_TIMEOUT attribute and FlightCallOptions.timeout use + // seconds as time unit. + double timeout_seconds = static_cast(boost::get(*login_timeout)); + if (timeout_seconds > 0) { + auth_call_options.timeout = TimeoutDuration{timeout_seconds}; + } + } + + Result> bearer_result = + client_.AuthenticateBasicToken(auth_call_options, user_, password_); + + if (!bearer_result.ok()) { + const auto& flight_status = + arrow::flight::FlightStatusDetail::UnwrapStatus(bearer_result.status()); + if (flight_status != nullptr) { + if (flight_status->code() == arrow::flight::FlightStatusCode::Unauthenticated) { + throw AuthenticationException( + "Failed to authenticate with user and password: " + + bearer_result.status().ToString()); + } else if (flight_status->code() == + arrow::flight::FlightStatusCode::Unavailable) { + throw CommunicationException(bearer_result.status().message()); + } + } + + throw odbcabstraction::DriverException(bearer_result.status().message()); + } + + call_options.headers.push_back(bearer_result.ValueOrDie()); + } + + std::string GetUser() override { return user_; } + + private: + FlightClient& client_; + std::string user_; + std::string password_; +}; + +class TokenAuthMethod : public FlightSqlAuthMethod { + private: + FlightClient& client_; + std::string 
token_; // this is the token the user provides + + public: + TokenAuthMethod(FlightClient& client, std::string token) + : client_{client}, token_{std::move(token)} {} + + void Authenticate(FlightSqlConnection& connection, + FlightCallOptions& call_options) override { + // add the token to the headers + const std::pair token_header("authorization", + "Bearer " + token_); + call_options.headers.push_back(token_header); + + const arrow::Status status = client_.Authenticate( + call_options, + std::unique_ptr(new NoOpClientAuthHandler())); + if (!status.ok()) { + const auto& flight_status = arrow::flight::FlightStatusDetail::UnwrapStatus(status); + if (flight_status != nullptr) { + if (flight_status->code() == arrow::flight::FlightStatusCode::Unauthenticated) { + throw AuthenticationException("Failed to authenticate with token: " + token_ + + " Message: " + status.message()); + } else if (flight_status->code() == + arrow::flight::FlightStatusCode::Unavailable) { + throw CommunicationException(status.message()); + } + } + throw odbcabstraction::DriverException(status.message()); + } + } +}; +} // namespace + +std::unique_ptr FlightSqlAuthMethod::FromProperties( + const std::unique_ptr& client, + const Connection::ConnPropertyMap& properties) { + // Check if should use user-password authentication + auto it_user = properties.find(FlightSqlConnection::USER); + if (it_user == properties.end()) { + // The Microsoft OLE DB to ODBC bridge provider (MSDASQL) will write + // "User ID" and "Password" properties instead of mapping + // to ODBC compliant UID/PWD keys. + it_user = properties.find(FlightSqlConnection::USER_ID); + } + + auto it_password = properties.find(FlightSqlConnection::PASSWORD); + auto it_token = properties.find(FlightSqlConnection::TOKEN); + + if (it_user == properties.end() || it_password == properties.end()) { + // Accept UID/PWD as aliases for User/Password. These are suggested as + // standard properties in the documentation for SQLDriverConnect. 
+ it_user = properties.find(FlightSqlConnection::UID); + it_password = properties.find(FlightSqlConnection::PWD); + } + if (it_user != properties.end() || it_password != properties.end()) { + const std::string& user = it_user != properties.end() ? it_user->second : ""; + const std::string& password = + it_password != properties.end() ? it_password->second : ""; + + return std::unique_ptr( + new UserPasswordAuthMethod(*client, user, password)); + } else if (it_token != properties.end()) { + const auto& token = it_token->second; + return std::unique_ptr(new TokenAuthMethod(*client, token)); + } + + return std::unique_ptr(new NoOpAuthMethod); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_auth_method.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_auth_method.h new file mode 100644 index 00000000000..b13caaeb5ab --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_auth_method.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include "arrow/flight/client.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h" + +namespace driver { +namespace flight_sql { + +class FlightSqlAuthMethod { + public: + virtual ~FlightSqlAuthMethod() = default; + + virtual void Authenticate(FlightSqlConnection& connection, + arrow::flight::FlightCallOptions& call_options) = 0; + + virtual std::string GetUser() { return std::string(); } + + static std::unique_ptr FromProperties( + const std::unique_ptr& client, + const odbcabstraction::Connection::ConnPropertyMap& properties); + + protected: + FlightSqlAuthMethod() = default; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection.cc new file mode 100644 index 00000000000..4faa20e3d5c --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection.cc @@ -0,0 +1,440 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/utils.h" + +#include "arrow/flight/client_cookie_middleware.h" +#include "arrow/flight/sql/odbc/flight_sql/address_info.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_auth_method.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_ssl_config.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/types.h" + +#include +#include +#include +#include +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" + +#include +#include + +#include "arrow/flight/sql/odbc/flight_sql/system_trust_store.h" + +#ifndef NI_MAXHOST +# define NI_MAXHOST 1025 +#endif + +namespace driver { +namespace flight_sql { + +using arrow::Result; +using arrow::Status; +using arrow::flight::FlightCallOptions; +using arrow::flight::FlightClient; +using arrow::flight::FlightClientOptions; +using arrow::flight::Location; +using arrow::flight::TimeoutDuration; +using arrow::flight::sql::FlightSqlClient; +using driver::odbcabstraction::AsBool; +using driver::odbcabstraction::CommunicationException; +using driver::odbcabstraction::Connection; +using driver::odbcabstraction::DriverException; +using driver::odbcabstraction::OdbcVersion; +using driver::odbcabstraction::Statement; + +const std::vector FlightSqlConnection::ALL_KEYS = { + FlightSqlConnection::DSN, + FlightSqlConnection::DRIVER, + FlightSqlConnection::HOST, + FlightSqlConnection::PORT, + FlightSqlConnection::TOKEN, + FlightSqlConnection::UID, + FlightSqlConnection::USER_ID, + FlightSqlConnection::PWD, + FlightSqlConnection::USE_ENCRYPTION, + FlightSqlConnection::TRUSTED_CERTS, + FlightSqlConnection::USE_SYSTEM_TRUST_STORE, + 
FlightSqlConnection::DISABLE_CERTIFICATE_VERIFICATION, + FlightSqlConnection::STRING_COLUMN_LENGTH, + FlightSqlConnection::USE_WIDE_CHAR, + FlightSqlConnection::CHUNK_BUFFER_CAPACITY}; + +namespace { + +#if _WIN32 || _WIN64 +constexpr auto SYSTEM_TRUST_STORE_DEFAULT = true; +constexpr auto STORES = {"CA", "MY", "ROOT", "SPC"}; + +inline std::string GetCerts() { + std::string certs; + + for (auto store : STORES) { + std::shared_ptr cert_iterator = + std::make_shared(store); + + if (!cert_iterator->SystemHasStore()) { + // If the system does not have the specific store, we skip it using the continue. + continue; + } + while (cert_iterator->HasNext()) { + certs += cert_iterator->GetNext(); + } + } + + return certs; +} + +#else + +constexpr auto SYSTEM_TRUST_STORE_DEFAULT = false; +inline std::string GetCerts() { return ""; } + +#endif + +const std::set + BUILT_IN_PROPERTIES = {FlightSqlConnection::HOST, + FlightSqlConnection::PORT, + FlightSqlConnection::USER, + FlightSqlConnection::USER_ID, + FlightSqlConnection::UID, + FlightSqlConnection::PASSWORD, + FlightSqlConnection::PWD, + FlightSqlConnection::TOKEN, + FlightSqlConnection::USE_ENCRYPTION, + FlightSqlConnection::DISABLE_CERTIFICATE_VERIFICATION, + FlightSqlConnection::TRUSTED_CERTS, + FlightSqlConnection::USE_SYSTEM_TRUST_STORE, + FlightSqlConnection::STRING_COLUMN_LENGTH, + FlightSqlConnection::USE_WIDE_CHAR}; + +Connection::ConnPropertyMap::const_iterator TrackMissingRequiredProperty( + const std::string_view& property, const Connection::ConnPropertyMap& properties, + std::vector& missing_attr) { + auto prop_iter = properties.find(property); + if (properties.end() == prop_iter) { + missing_attr.push_back(property); + } + return prop_iter; +} +} // namespace + +std::shared_ptr LoadFlightSslConfigs( + const Connection::ConnPropertyMap& conn_property_map) { + bool use_encryption = + AsBool(conn_property_map, FlightSqlConnection::USE_ENCRYPTION).value_or(true); + bool disable_cert = + AsBool(conn_property_map, 
FlightSqlConnection::DISABLE_CERTIFICATE_VERIFICATION) + .value_or(false); + bool use_system_trusted = + AsBool(conn_property_map, FlightSqlConnection::USE_SYSTEM_TRUST_STORE) + .value_or(SYSTEM_TRUST_STORE_DEFAULT); + + auto trusted_certs_iterator = + conn_property_map.find(FlightSqlConnection::TRUSTED_CERTS); + auto trusted_certs = trusted_certs_iterator != conn_property_map.end() + ? trusted_certs_iterator->second + : ""; + + return std::make_shared(disable_cert, trusted_certs, + use_system_trusted, use_encryption); +} + +void FlightSqlConnection::Connect(const ConnPropertyMap& properties, + std::vector& missing_attr) { + try { + auto flight_ssl_configs = LoadFlightSslConfigs(properties); + + Location location = BuildLocation(properties, missing_attr, flight_ssl_configs); + FlightClientOptions client_options = + BuildFlightClientOptions(properties, missing_attr, flight_ssl_configs); + + const std::shared_ptr& cookie_factory = + arrow::flight::GetCookieFactory(); + client_options.middleware.push_back(cookie_factory); + + std::unique_ptr flight_client; + ThrowIfNotOK(FlightClient::Connect(location, client_options).Value(&flight_client)); + + std::unique_ptr auth_method = + FlightSqlAuthMethod::FromProperties(flight_client, properties); + auth_method->Authenticate(*this, call_options_); + + sql_client_.reset(new FlightSqlClient(std::move(flight_client))); + closed_ = false; + + // Note: This should likely come from Flight instead of being from the + // connection properties to allow reporting a user for other auth mechanisms + // and also decouple the database user from user credentials. + + info_.SetProperty(SQL_USER_NAME, auth_method->GetUser()); + attribute_[CONNECTION_DEAD] = static_cast(SQL_FALSE); + + PopulateMetadataSettings(properties); + PopulateCallOptions(properties); + } catch (...) 
{ + attribute_[CONNECTION_DEAD] = static_cast(SQL_TRUE); + sql_client_.reset(); + + throw; + } +} + +void FlightSqlConnection::PopulateMetadataSettings( + const Connection::ConnPropertyMap& conn_property_map) { + metadata_settings_.string_column_length = GetStringColumnLength(conn_property_map); + metadata_settings_.use_wide_char = GetUseWideChar(conn_property_map); + metadata_settings_.chunk_buffer_capacity = GetChunkBufferCapacity(conn_property_map); +} + +boost::optional FlightSqlConnection::GetStringColumnLength( + const Connection::ConnPropertyMap& conn_property_map) { + const int32_t min_string_column_length = 1; + + try { + return AsInt32(min_string_column_length, conn_property_map, + FlightSqlConnection::STRING_COLUMN_LENGTH); + } catch (const std::exception& e) { + diagnostics_.AddWarning( + std::string("Invalid value for connection property " + + std::string(FlightSqlConnection::STRING_COLUMN_LENGTH) + + ". Please ensure it has a valid numeric value. Message: " + e.what()), + "01000", odbcabstraction::ODBCErrorCodes_GENERAL_WARNING); + } + + return boost::none; +} + +bool FlightSqlConnection::GetUseWideChar(const ConnPropertyMap& conn_property_map) { +#if defined _WIN32 || defined _WIN64 + // Windows should use wide chars by default + bool default_value = true; +#else + // Mac and Linux should not use wide chars by default + bool default_value = false; +#endif + return AsBool(conn_property_map, FlightSqlConnection::USE_WIDE_CHAR) + .value_or(default_value); +} + +size_t FlightSqlConnection::GetChunkBufferCapacity( + const ConnPropertyMap& conn_property_map) { + size_t default_value = 5; + try { + return AsInt32(1, conn_property_map, FlightSqlConnection::CHUNK_BUFFER_CAPACITY) + .value_or(default_value); + } catch (const std::exception& e) { + diagnostics_.AddWarning( + std::string("Invalid value for connection property " + + std::string(FlightSqlConnection::CHUNK_BUFFER_CAPACITY) + + ". Please ensure it has a valid numeric value. 
Message: " + e.what()), + "01000", odbcabstraction::ODBCErrorCodes_GENERAL_WARNING); + } + + return default_value; +} + +const FlightCallOptions& FlightSqlConnection::PopulateCallOptions( + const ConnPropertyMap& props) { + // Set CONNECTION_TIMEOUT attribute or LOGIN_TIMEOUT depending on if this + // is the first request. + const boost::optional& connection_timeout = + closed_ ? GetAttribute(LOGIN_TIMEOUT) : GetAttribute(CONNECTION_TIMEOUT); + if (connection_timeout && boost::get(*connection_timeout) > 0) { + call_options_.timeout = + TimeoutDuration{static_cast(boost::get(*connection_timeout))}; + } + + for (auto prop : props) { + if (BUILT_IN_PROPERTIES.count(prop.first) != 0) { + continue; + } + + if (prop.first.find(' ') != std::string::npos) { + // Connection properties containing spaces will crash gRPC, but some tools + // such as the OLE DB to ODBC bridge generate unused properties containing spaces. + diagnostics_.AddWarning( + std::string("Ignoring connection option " + std::string(prop.first)) + + ". Server-specific options must be valid HTTP header names and " + + "cannot contain spaces.", + "01000", odbcabstraction::ODBCErrorCodes_GENERAL_WARNING); + continue; + } + + // Note: header names must be lower case for gRPC. + // gRPC will crash if they are not lower-case. + std::string key_lc = boost::algorithm::to_lower_copy(std::string(prop.first)); + call_options_.headers.emplace_back(std::make_pair(key_lc, prop.second)); + } + + return call_options_; +} + +FlightClientOptions FlightSqlConnection::BuildFlightClientOptions( + const ConnPropertyMap& properties, std::vector& missing_attr, + const std::shared_ptr& ssl_config) { + FlightClientOptions options; + // Persist state information using cookies if the FlightProducer supports it. 
+ options.middleware.push_back(arrow::flight::GetCookieFactory()); + + if (ssl_config->UseEncryption()) { + if (ssl_config->ShouldDisableCertificateVerification()) { + options.disable_server_verification = + ssl_config->ShouldDisableCertificateVerification(); + } else { + if (ssl_config->UseSystemTrustStore()) { + const std::string certs = GetCerts(); + + options.tls_root_certs = certs; + } else if (!ssl_config->GetTrustedCerts().empty()) { + arrow::flight::CertKeyPair cert_key_pair; + ssl_config->PopulateOptionsWithCerts(&cert_key_pair); + options.tls_root_certs = cert_key_pair.pem_cert; + } + } + } + + return std::move(options); +} + +Location FlightSqlConnection::BuildLocation( + const ConnPropertyMap& properties, std::vector& missing_attr, + const std::shared_ptr& ssl_config) { + const auto& host_iter = TrackMissingRequiredProperty(HOST, properties, missing_attr); + + const auto& port_iter = TrackMissingRequiredProperty(PORT, properties, missing_attr); + + if (!missing_attr.empty()) { + std::vector missing_attr_string_vec(missing_attr.begin(), + missing_attr.end()); + std::string missing_attr_str = std::string("Missing required properties: ") + + boost::algorithm::join(missing_attr_string_vec, ", "); + throw DriverException(missing_attr_str); + } + + const std::string& host = host_iter->second; + const int& port = boost::lexical_cast(port_iter->second); + + Location location; + if (ssl_config->UseEncryption()) { + AddressInfo address_info; + char host_name_info[NI_MAXHOST] = ""; + bool operation_result = false; + + try { + auto ip_address = boost::asio::ip::make_address(host); + // We should only attempt to resolve the hostname from the IP if the given + // HOST input is an IP address. 
+ if (ip_address.is_v4() || ip_address.is_v6()) { + operation_result = address_info.GetAddressInfo(host, host_name_info, NI_MAXHOST); + if (operation_result) { + ThrowIfNotOK(Location::ForGrpcTls(host_name_info, port).Value(&location)); + return location; + } + // TODO: We should log that we could not convert an IP to hostname here. + } + } catch (...) { + // This is expected. The Host attribute can be an IP or name, but make_address will + // throw if it is not an IP. + } + + ThrowIfNotOK(Location::ForGrpcTls(host, port).Value(&location)); + return location; + } + + ThrowIfNotOK(Location::ForGrpcTcp(host, port).Value(&location)); + return location; +} + +void FlightSqlConnection::Close() { + if (closed_) { + throw DriverException("Connection already closed."); + } + + sql_client_.reset(); + closed_ = true; + attribute_[CONNECTION_DEAD] = static_cast(SQL_TRUE); +} + +std::shared_ptr FlightSqlConnection::CreateStatement() { + return std::shared_ptr(new FlightSqlStatement( + diagnostics_, *sql_client_, call_options_, metadata_settings_)); +} + +bool FlightSqlConnection::SetAttribute(Connection::AttributeId attribute, + const Connection::Attribute& value) { + switch (attribute) { + case ACCESS_MODE: + // We will always return read-write. + return CheckIfSetToOnlyValidValue(value, + static_cast(SQL_MODE_READ_WRITE)); + case PACKET_SIZE: + return CheckIfSetToOnlyValidValue(value, static_cast(0)); + default: + attribute_[attribute] = value; + return true; + } +} + +boost::optional FlightSqlConnection::GetAttribute( + Connection::AttributeId attribute) { + switch (attribute) { + case ACCESS_MODE: + // FlightSQL does not provide this metadata. 
+ return boost::make_optional(Attribute(static_cast(SQL_MODE_READ_WRITE))); + case PACKET_SIZE: + return boost::make_optional(Attribute(static_cast(0))); + default: + const auto& it = attribute_.find(attribute); + return boost::make_optional(it != attribute_.end(), it->second); + } +} + +Connection::Info FlightSqlConnection::GetInfo(uint16_t info_type) { + auto result = info_.GetInfo(info_type); + if (info_type == SQL_DBMS_NAME || info_type == SQL_SERVER_NAME) { + // Update the database component reported in error messages. + // We do this lazily for performance reasons. + diagnostics_.SetDataSourceComponent(boost::get(result)); + } + return result; +} + +FlightSqlConnection::FlightSqlConnection(OdbcVersion odbc_version, + const std::string& driver_version) + : diagnostics_("Apache Arrow", "Flight SQL", odbc_version), + odbc_version_(odbc_version), + info_(call_options_, sql_client_, driver_version), + closed_(true) { + attribute_[CONNECTION_DEAD] = static_cast(SQL_TRUE); + attribute_[LOGIN_TIMEOUT] = static_cast(0); + attribute_[CONNECTION_TIMEOUT] = static_cast(0); + attribute_[CURRENT_CATALOG] = ""; +} +odbcabstraction::Diagnostics& FlightSqlConnection::GetDiagnostics() { + return diagnostics_; +} + +void FlightSqlConnection::SetClosed(bool is_closed) { closed_ = is_closed; } + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h new file mode 100644 index 00000000000..ad812ece569 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h" + +#include +#include "arrow/flight/api.h" +#include "arrow/flight/sql/api.h" + +#include "arrow/flight/sql/odbc/flight_sql/get_info_cache.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" + +namespace driver { +namespace flight_sql { + +class FlightSqlSslConfig; + +/// \brief Create an instance of the FlightSqlSslConfig class, from the properties passed +/// into the map. +/// \param conn_property_map the map with the Connection properties. +/// \return An instance of the FlightSqlSslConfig. 
+std::shared_ptr LoadFlightSslConfigs( + const odbcabstraction::Connection::ConnPropertyMap& conn_property_map); + +class FlightSqlConnection : public odbcabstraction::Connection { + private: + odbcabstraction::MetadataSettings metadata_settings_; + std::map attribute_; + arrow::flight::FlightClientOptions client_options_; + arrow::flight::FlightCallOptions call_options_; + std::unique_ptr sql_client_; + GetInfoCache info_; + odbcabstraction::Diagnostics diagnostics_; + odbcabstraction::OdbcVersion odbc_version_; + bool closed_; + + void PopulateMetadataSettings(const Connection::ConnPropertyMap& conn_property_map); + + public: + static const std::vector ALL_KEYS; + static constexpr std::string_view DSN = "dsn"; + static constexpr std::string_view DRIVER = "driver"; + static constexpr std::string_view HOST = "host"; + static constexpr std::string_view PORT = "port"; + static constexpr std::string_view USER = "user"; + static constexpr std::string_view USER_ID = "user id"; + static constexpr std::string_view UID = "uid"; + static constexpr std::string_view PASSWORD = "password"; + static constexpr std::string_view PWD = "pwd"; + static constexpr std::string_view TOKEN = "token"; + static constexpr std::string_view USE_ENCRYPTION = "useEncryption"; + static constexpr std::string_view DISABLE_CERTIFICATE_VERIFICATION = + "disableCertificateVerification"; + static constexpr std::string_view TRUSTED_CERTS = "trustedCerts"; + static constexpr std::string_view USE_SYSTEM_TRUST_STORE = "useSystemTrustStore"; + static constexpr std::string_view STRING_COLUMN_LENGTH = "StringColumnLength"; + static constexpr std::string_view USE_WIDE_CHAR = "UseWideChar"; + static constexpr std::string_view CHUNK_BUFFER_CAPACITY = "ChunkBufferCapacity"; + + explicit FlightSqlConnection(odbcabstraction::OdbcVersion odbc_version, + const std::string& driver_version = "0.9.0.0"); + + void Connect(const ConnPropertyMap& properties, + std::vector& missing_attr) override; + + void Close() 
override; + + std::shared_ptr CreateStatement() override; + + bool SetAttribute(AttributeId attribute, const Attribute& value) override; + + boost::optional GetAttribute( + Connection::AttributeId attribute) override; + + Info GetInfo(uint16_t info_type) override; + + /// \brief Builds a Location used for FlightClient connection. + /// \note Visible for testing + static arrow::flight::Location BuildLocation( + const ConnPropertyMap& properties, std::vector& missing_attr, + const std::shared_ptr& ssl_config); + + /// \brief Builds a FlightClientOptions used for FlightClient connection. + /// \note Visible for testing + static arrow::flight::FlightClientOptions BuildFlightClientOptions( + const ConnPropertyMap& properties, std::vector& missing_attr, + const std::shared_ptr& ssl_config); + + /// \brief Builds a FlightCallOptions used on gRPC calls. + /// \note Visible for testing + const arrow::flight::FlightCallOptions& PopulateCallOptions( + const ConnPropertyMap& properties); + + odbcabstraction::Diagnostics& GetDiagnostics() override; + + /// \brief A setter to the field closed_. + /// \note Visible for testing + void SetClosed(bool is_closed); + + boost::optional GetStringColumnLength( + const ConnPropertyMap& conn_property_map); + + bool GetUseWideChar(const ConnPropertyMap& conn_property_map); + + size_t GetChunkBufferCapacity(const ConnPropertyMap& conn_property_map); +}; +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection_test.cc new file mode 100644 index 00000000000..11caa9aa61f --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_connection_test.cc @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/types.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { + +using arrow::flight::Location; +using arrow::flight::TimeoutDuration; +using odbcabstraction::Connection; + +TEST(AttributeTests, SetAndGetAttribute) { + FlightSqlConnection connection(odbcabstraction::V_3); + connection.SetClosed(false); + + connection.SetAttribute(Connection::CONNECTION_TIMEOUT, static_cast(200)); + const boost::optional first_value = + connection.GetAttribute(Connection::CONNECTION_TIMEOUT); + + EXPECT_TRUE(first_value); + + EXPECT_EQ(boost::get(*first_value), static_cast(200)); + + connection.SetAttribute(Connection::CONNECTION_TIMEOUT, static_cast(300)); + + const boost::optional change_value = + connection.GetAttribute(Connection::CONNECTION_TIMEOUT); + + EXPECT_TRUE(change_value); + EXPECT_EQ(boost::get(*change_value), static_cast(300)); + + connection.Close(); +} + +TEST(AttributeTests, GetAttributeWithoutSetting) { + FlightSqlConnection connection(odbcabstraction::V_3); + + const boost::optional optional = + connection.GetAttribute(Connection::CONNECTION_TIMEOUT); + connection.SetClosed(false); + + EXPECT_EQ(0, boost::get(*optional)); + + connection.Close(); +} + +TEST(MetadataSettingsTest, 
StringColumnLengthTest) { + FlightSqlConnection connection(odbcabstraction::V_3); + connection.SetClosed(false); + + const int32_t expected_string_column_length = 100000; + + const Connection::ConnPropertyMap properties = { + {FlightSqlConnection::HOST, std::string("localhost")}, // expect not used + {FlightSqlConnection::PORT, std::string("32010")}, // expect not used + {FlightSqlConnection::USE_ENCRYPTION, std::string("false")}, // expect not used + {FlightSqlConnection::STRING_COLUMN_LENGTH, + std::to_string(expected_string_column_length)}, + }; + + const boost::optional actual_string_column_length = + connection.GetStringColumnLength(properties); + + EXPECT_TRUE(actual_string_column_length); + EXPECT_EQ(expected_string_column_length, *actual_string_column_length); + + connection.Close(); +} + +TEST(MetadataSettingsTest, UseWideCharTest) { + FlightSqlConnection connection(odbcabstraction::V_3); + connection.SetClosed(false); + + const Connection::ConnPropertyMap properties1 = { + {FlightSqlConnection::USE_WIDE_CHAR, std::string("true")}, + }; + const Connection::ConnPropertyMap properties2 = { + {FlightSqlConnection::USE_WIDE_CHAR, std::string("false")}, + }; + + EXPECT_EQ(true, connection.GetUseWideChar(properties1)); + EXPECT_EQ(false, connection.GetUseWideChar(properties2)); + + connection.Close(); +} + +TEST(BuildLocationTests, ForTcp) { + std::vector missing_attr; + Connection::ConnPropertyMap properties = { + {FlightSqlConnection::HOST, std::string("localhost")}, + {FlightSqlConnection::PORT, std::string("32010")}, + {FlightSqlConnection::USE_ENCRYPTION, std::string("false")}, + }; + + const std::shared_ptr& ssl_config = + LoadFlightSslConfigs(properties); + + const Location& actual_location1 = + FlightSqlConnection::BuildLocation(properties, missing_attr, ssl_config); + const Location& actual_location2 = FlightSqlConnection::BuildLocation( + { + {FlightSqlConnection::HOST, std::string("localhost")}, + {FlightSqlConnection::PORT, std::string("32011")}, + 
}, + missing_attr, ssl_config); + + Location expected_location; + ASSERT_TRUE(Location::ForGrpcTcp("localhost", 32010).Value(&expected_location).ok()); + ASSERT_EQ(expected_location, actual_location1); + ASSERT_NE(expected_location, actual_location2); +} + +TEST(BuildLocationTests, ForTls) { + std::vector missing_attr; + Connection::ConnPropertyMap properties = { + {FlightSqlConnection::HOST, std::string("localhost")}, + {FlightSqlConnection::PORT, std::string("32010")}, + {FlightSqlConnection::USE_ENCRYPTION, std::string("1")}, + }; + + const std::shared_ptr& ssl_config = + LoadFlightSslConfigs(properties); + + const Location& actual_location1 = + FlightSqlConnection::BuildLocation(properties, missing_attr, ssl_config); + + Connection::ConnPropertyMap second_properties = { + {FlightSqlConnection::HOST, std::string("localhost")}, + {FlightSqlConnection::PORT, std::string("32011")}, + {FlightSqlConnection::USE_ENCRYPTION, std::string("1")}, + }; + + const std::shared_ptr& second_ssl_config = + LoadFlightSslConfigs(properties); + + const Location& actual_location2 = + FlightSqlConnection::BuildLocation(second_properties, missing_attr, ssl_config); + + Location expected_location; + ASSERT_TRUE(Location::ForGrpcTls("localhost", 32010).Value(&expected_location).ok()); + ASSERT_EQ(expected_location, actual_location1); + ASSERT_NE(expected_location, actual_location2); +} + +TEST(PopulateCallOptionsTest, ConnectionTimeout) { + FlightSqlConnection connection(odbcabstraction::V_3); + connection.SetClosed(false); + + // Expect default timeout to be -1 + ASSERT_EQ(TimeoutDuration{-1.0}, + connection.PopulateCallOptions(Connection::ConnPropertyMap()).timeout); + + connection.SetAttribute(Connection::CONNECTION_TIMEOUT, static_cast(10)); + ASSERT_EQ(TimeoutDuration{10.0}, + connection.PopulateCallOptions(Connection::ConnPropertyMap()).timeout); +} + +TEST(PopulateCallOptionsTest, GenericOption) { + FlightSqlConnection connection(odbcabstraction::V_3); + 
connection.SetClosed(false); + + Connection::ConnPropertyMap properties; + properties["Foo"] = "Bar"; + auto options = connection.PopulateCallOptions(properties); + auto headers = options.headers; + ASSERT_EQ(1, headers.size()); + + // Header name must be lower-case because gRPC will crash if it is not lower-case. + ASSERT_EQ("foo", headers[0].first); + + // Header value should preserve case. + ASSERT_EQ("Bar", headers[0].second); +} + +TEST(PopulateCallOptionsTest, GenericOptionWithSpaces) { + FlightSqlConnection connection(odbcabstraction::V_3); + connection.SetClosed(false); + + Connection::ConnPropertyMap properties; + properties["Persist Security Info"] = "False"; + auto options = connection.PopulateCallOptions(properties); + auto headers = options.headers; + // Header names with spaces must be omitted or gRPC will crash. + ASSERT_TRUE(headers.empty()); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_driver.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_driver.cc new file mode 100644 index 00000000000..e8157755165 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_driver.cc @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/flight_sql_driver.h" +#include "arrow/compute/api.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" +#include "arrow/util/string.h" + +using arrow::util::ArrowLogLevel; + +namespace driver { +namespace flight_sql { +static constexpr const char* kODBCLogLevel = "ARROW_ODBC_LOG_LEVEL"; + +using odbcabstraction::Connection; +using odbcabstraction::OdbcVersion; + +FlightSqlDriver::FlightSqlDriver() + : diagnostics_("Apache Arrow", "Flight SQL", OdbcVersion::V_3), version_("0.9.0.0") { + RegisterComputeKernels(); + // Register log after compute kernels check to avoid segfaults + RegisterLog(); +} + +FlightSqlDriver::~FlightSqlDriver() { + // Unregister log if logging is enabled + if (arrow::internal::GetEnvVar(kODBCLogLevel).ValueOr("").empty()) { + return; + } + arrow::util::ArrowLog::ShutDownArrowLog(); +} + +std::shared_ptr FlightSqlDriver::CreateConnection(OdbcVersion odbc_version) { + return std::make_shared(odbc_version, version_); +} + +odbcabstraction::Diagnostics& FlightSqlDriver::GetDiagnostics() { return diagnostics_; } + +void FlightSqlDriver::SetVersion(std::string version) { version_ = std::move(version); } + +void FlightSqlDriver::RegisterComputeKernels() { + auto registry = arrow::compute::GetFunctionRegistry(); + + // strptime is one of the required compute functions + auto strptime_func = registry->GetFunction("strptime"); + if (!strptime_func.ok()) { + // Register Kernel functions to library + ThrowIfNotOK(arrow::compute::Initialize()); + } +} + +void FlightSqlDriver::RegisterLog() { + std::string log_level_str = arrow::internal::GetEnvVar(kODBCLogLevel) + 
.Map(arrow::internal::AsciiToLower) + .Map(arrow::internal::TrimString) + .ValueOr(""); + if (log_level_str.empty()) { + return; + } + + auto log_level = ArrowLogLevel::ARROW_DEBUG; + + if (log_level_str == "fatal") { + log_level = ArrowLogLevel::ARROW_FATAL; + } else if (log_level_str == "error") { + log_level = ArrowLogLevel::ARROW_ERROR; + } else if (log_level_str == "warning") { + log_level = ArrowLogLevel::ARROW_WARNING; + } else if (log_level_str == "info") { + log_level = ArrowLogLevel::ARROW_INFO; + } else if (log_level_str == "debug") { + log_level = ArrowLogLevel::ARROW_DEBUG; + } else if (log_level_str == "trace") { + log_level = ArrowLogLevel::ARROW_TRACE; + } + + // Enable driver logging. Log files are not supported on Windows yet, since GLOG is not + // tested fully on Windows. + // Info log level is enabled by default. + if (log_level != ArrowLogLevel::ARROW_INFO) { + arrow::util::ArrowLog::StartArrowLog("arrow-flight-sql-odbc", log_level); + } +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_tables_reader.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_tables_reader.cc new file mode 100644 index 00000000000..ccd6058f8cd --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_tables_reader.cc @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_get_tables_reader.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/io/memory.h" +#include "arrow/status.h" + +#include + +namespace driver { +namespace flight_sql { + +using arrow::BinaryArray; +using arrow::StringArray; + +using arrow::internal::checked_pointer_cast; +using std::nullopt; + +GetTablesReader::GetTablesReader(std::shared_ptr record_batch) + : record_batch_(std::move(record_batch)), current_row_(-1) {} + +bool GetTablesReader::Next() { return ++current_row_ < record_batch_->num_rows(); } + +optional GetTablesReader::GetCatalogName() { + const auto& array = checked_pointer_cast(record_batch_->column(0)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetString(current_row_); +} + +optional GetTablesReader::GetDbSchemaName() { + const auto& array = checked_pointer_cast(record_batch_->column(1)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetString(current_row_); +} + +std::string GetTablesReader::GetTableName() { + const auto& array = checked_pointer_cast(record_batch_->column(2)); + + return array->GetString(current_row_); +} + +std::string GetTablesReader::GetTableType() { + const auto& array = checked_pointer_cast(record_batch_->column(3)); + + return array->GetString(current_row_); +} + +std::shared_ptr GetTablesReader::GetSchema() { + 
const auto& array = checked_pointer_cast(record_batch_->column(4)); + if (array == nullptr) { + return nullptr; + } + + // Create a non-owned Buffer to avoid copying + arrow::io::BufferReader dataset_schema_reader( + std::make_shared(array->GetView(current_row_))); + arrow::ipc::DictionaryMemo in_memo; + const arrow::Result>& result = + arrow::ipc::ReadSchema(&dataset_schema_reader, &in_memo); + if (!result.ok()) { + // TODO: Ignoring this error until we fix the problem on Dremio server + // The problem is that complex types columns are being returned without the children + // types. + return nullptr; + } + + return result.ValueOrDie(); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_tables_reader.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_tables_reader.h new file mode 100644 index 00000000000..adfd204b179 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_tables_reader.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" + +namespace driver { +namespace flight_sql { + +using arrow::RecordBatch; + +using std::optional; + +class GetTablesReader { + private: + std::shared_ptr record_batch_; + int64_t current_row_; + + public: + explicit GetTablesReader(std::shared_ptr record_batch); + + bool Next(); + + optional GetCatalogName(); + + optional GetDbSchemaName(); + + std::string GetTableName(); + + std::string GetTableType(); + + std::shared_ptr GetSchema(); +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_type_info_reader.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_type_info_reader.cc new file mode 100644 index 00000000000..25add2d1b05 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_type_info_reader.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_get_type_info_reader.h" +#include "arrow/array.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/io/memory.h" + +#include + +namespace driver { +namespace flight_sql { + +using arrow::BooleanArray; +using arrow::Int32Array; +using arrow::ListArray; +using arrow::StringArray; + +using arrow::internal::checked_pointer_cast; +using std::nullopt; + +GetTypeInfoReader::GetTypeInfoReader(std::shared_ptr record_batch) + : record_batch_(std::move(record_batch)), current_row_(-1) {} + +bool GetTypeInfoReader::Next() { return ++current_row_ < record_batch_->num_rows(); } + +std::string GetTypeInfoReader::GetTypeName() { + const auto& array = checked_pointer_cast(record_batch_->column(0)); + + return array->GetString(current_row_); +} + +int32_t GetTypeInfoReader::GetDataType() { + const auto& array = checked_pointer_cast(record_batch_->column(1)); + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetColumnSize() { + const auto& array = checked_pointer_cast(record_batch_->column(2)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetLiteralPrefix() { + const auto& array = checked_pointer_cast(record_batch_->column(3)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetString(current_row_); +} + +optional GetTypeInfoReader::GetLiteralSuffix() { + const auto& array = checked_pointer_cast(record_batch_->column(4)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetString(current_row_); +} + +optional> GetTypeInfoReader::GetCreateParams() { + const auto& array = checked_pointer_cast(record_batch_->column(5)); + + if (array->IsNull(current_row_)) return nullopt; + + int values_length = array->value_length(current_row_); + int start_offset = 
array->value_offset(current_row_); + const auto& values_array = checked_pointer_cast(array->values()); + + std::vector result(values_length); + for (int i = 0; i < values_length; ++i) { + result[i] = values_array->GetString(start_offset + i); + } + + return result; +} + +int32_t GetTypeInfoReader::GetNullable() { + const auto& array = checked_pointer_cast(record_batch_->column(6)); + + return array->GetView(current_row_); +} + +bool GetTypeInfoReader::GetCaseSensitive() { + const auto& array = checked_pointer_cast(record_batch_->column(7)); + + return array->GetView(current_row_); +} + +int32_t GetTypeInfoReader::GetSearchable() { + const auto& array = checked_pointer_cast(record_batch_->column(8)); + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetUnsignedAttribute() { + const auto& array = checked_pointer_cast(record_batch_->column(9)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetView(current_row_); +} + +bool GetTypeInfoReader::GetFixedPrecScale() { + const auto& array = checked_pointer_cast(record_batch_->column(10)); + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetAutoIncrement() { + const auto& array = checked_pointer_cast(record_batch_->column(11)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetLocalTypeName() { + const auto& array = checked_pointer_cast(record_batch_->column(12)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetString(current_row_); +} + +optional GetTypeInfoReader::GetMinimumScale() { + const auto& array = checked_pointer_cast(record_batch_->column(13)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetMaximumScale() { + const auto& array = checked_pointer_cast(record_batch_->column(14)); + + if (array->IsNull(current_row_)) return nullopt; + + return 
array->GetView(current_row_); +} + +int32_t GetTypeInfoReader::GetSqlDataType() { + const auto& array = checked_pointer_cast(record_batch_->column(15)); + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetDatetimeSubcode() { + const auto& array = checked_pointer_cast(record_batch_->column(16)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetNumPrecRadix() { + const auto& array = checked_pointer_cast(record_batch_->column(17)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetView(current_row_); +} + +optional GetTypeInfoReader::GetIntervalPrecision() { + const auto& array = checked_pointer_cast(record_batch_->column(18)); + + if (array->IsNull(current_row_)) return nullopt; + + return array->GetView(current_row_); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_type_info_reader.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_type_info_reader.h new file mode 100644 index 00000000000..896ebfbdea5 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_get_type_info_reader.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" + +namespace driver { +namespace flight_sql { + +using arrow::RecordBatch; + +using std::optional; + +class GetTypeInfoReader { + private: + std::shared_ptr record_batch_; + int64_t current_row_; + + public: + explicit GetTypeInfoReader(std::shared_ptr record_batch); + + bool Next(); + + std::string GetTypeName(); + + int32_t GetDataType(); + + optional GetColumnSize(); + + optional GetLiteralPrefix(); + + optional GetLiteralSuffix(); + + optional> GetCreateParams(); + + int32_t GetNullable(); + + bool GetCaseSensitive(); + + int32_t GetSearchable(); + + optional GetUnsignedAttribute(); + + bool GetFixedPrecScale(); + + optional GetAutoIncrement(); + + optional GetLocalTypeName(); + + optional GetMinimumScale(); + + optional GetMaximumScale(); + + int32_t GetSqlDataType(); + + optional GetDatetimeSubcode(); + + optional GetNumPrecRadix(); + + optional GetIntervalPrecision(); +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.cc new file mode 100644 index 00000000000..971d73b3a90 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.cc @@ -0,0 +1,281 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include "arrow/flight/types.h" +#include "arrow/scalar.h" + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_column.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using arrow::RecordBatch; +using arrow::Scalar; +using arrow::Status; +using arrow::flight::FlightEndpoint; +using arrow::flight::FlightStreamChunk; +using arrow::flight::FlightStreamReader; +using odbcabstraction::CDataType; +using odbcabstraction::DriverException; + +FlightSqlResultSet::FlightSqlResultSet( + FlightSqlClient& flight_sql_client, + const arrow::flight::FlightCallOptions& call_options, + const std::shared_ptr& flight_info, + const std::shared_ptr& transformer, + odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings) + : metadata_settings_(metadata_settings), + chunk_buffer_(flight_sql_client, call_options, flight_info, + metadata_settings_.chunk_buffer_capacity), + transformer_(transformer), + metadata_(transformer + ? 
new FlightSqlResultSetMetadata(transformer->GetTransformedSchema(), + metadata_settings_) + : new FlightSqlResultSetMetadata(flight_info, metadata_settings_)), + columns_(metadata_->GetColumnCount()), + get_data_offsets_(metadata_->GetColumnCount(), 0), + diagnostics_(diagnostics), + current_row_(0), + num_binding_(0), + reset_get_data_(false) { + current_chunk_.data = nullptr; + if (transformer_) { + schema_ = transformer_->GetTransformedSchema(); + } else { + ThrowIfNotOK(flight_info->GetSchema(nullptr).Value(&schema_)); + } + + for (size_t i = 0; i < columns_.size(); ++i) { + columns_[i] = FlightSqlResultSetColumn(metadata_settings.use_wide_char); + } +} + +size_t FlightSqlResultSet::Move(size_t rows, size_t bind_offset, size_t bind_type, + uint16_t* row_status_array) { + // Consider it might be the first call to Move() and current_chunk is not + // populated yet + assert(rows > 0); + if (current_chunk_.data == nullptr) { + if (!chunk_buffer_.GetNext(¤t_chunk_)) { + return 0; + } + + if (transformer_) { + current_chunk_.data = transformer_->Transform(current_chunk_.data); + } + + for (size_t column_num = 0; column_num < columns_.size(); ++column_num) { + columns_[column_num].ResetAccessor(current_chunk_.data->column(column_num)); + } + } + + // Reset GetData value offsets. 
+ if (num_binding_ != get_data_offsets_.size() && reset_get_data_) { + std::fill(get_data_offsets_.begin(), get_data_offsets_.end(), 0); + } + + size_t fetched_rows = 0; + while (fetched_rows < rows) { + size_t batch_rows = current_chunk_.data->num_rows(); + size_t rows_to_fetch = std::min(static_cast(rows - fetched_rows), + static_cast(batch_rows - current_row_)); + + if (rows_to_fetch == 0) { + if (!chunk_buffer_.GetNext(¤t_chunk_)) { + break; + } + + if (transformer_) { + current_chunk_.data = transformer_->Transform(current_chunk_.data); + } + + for (size_t column_num = 0; column_num < columns_.size(); ++column_num) { + columns_[column_num].ResetAccessor(current_chunk_.data->column(column_num)); + } + current_row_ = 0; + continue; + } + + for (auto& column : columns_) { + // There can be unbound columns. + if (!column.is_bound) continue; + + auto* accessor = column.GetAccessorForBinding(); + ColumnBinding shifted_binding = column.binding; + uint16_t* shifted_row_status_array = + row_status_array ? &row_status_array[fetched_rows] : nullptr; + + if (shifted_row_status_array) { + std::fill(shifted_row_status_array, &shifted_row_status_array[rows_to_fetch], + odbcabstraction::RowStatus_SUCCESS); + } + + size_t accessor_rows = 0; + try { + if (!bind_type) { + // Columnar binding. Have the accessor convert multiple rows. + if (shifted_binding.buffer) { + shifted_binding.buffer = + static_cast(shifted_binding.buffer) + + accessor->GetCellLength(&shifted_binding) * fetched_rows + bind_offset; + } + + if (shifted_binding.str_len_buffer) { + shifted_binding.str_len_buffer = reinterpret_cast( + reinterpret_cast( + &shifted_binding.str_len_buffer[fetched_rows]) + + bind_offset); + } + + int64_t value_offset = 0; + accessor_rows = accessor->GetColumnarData( + &shifted_binding, current_row_, rows_to_fetch, value_offset, false, + diagnostics_, shifted_row_status_array); + } else { + // Row-wise binding. 
Identify the base position of the buffer and indicator + // based on the bind offset, the number of already-fetched rows, and the + // bind_type holding the size of an application-side row. + if (shifted_binding.buffer) { + shifted_binding.buffer = static_cast(shifted_binding.buffer) + + bind_offset + bind_type * fetched_rows; + } + + if (shifted_binding.str_len_buffer) { + shifted_binding.str_len_buffer = reinterpret_cast( + reinterpret_cast(shifted_binding.str_len_buffer) + bind_offset + + bind_type * fetched_rows); + } + + // Loop and run the accessor one-row-at-a-time. + for (size_t i = 0; i < rows_to_fetch; ++i) { + int64_t value_offset = 0; + + // Adjust offsets passed to the accessor as we fetch rows. + // Note that current_row_ is updated outside of this loop. + accessor_rows += accessor->GetColumnarData( + &shifted_binding, current_row_ + i, 1, value_offset, false, diagnostics_, + shifted_row_status_array); + if (shifted_binding.buffer) { + shifted_binding.buffer = + static_cast(shifted_binding.buffer) + bind_type; + } + + if (shifted_binding.str_len_buffer) { + shifted_binding.str_len_buffer = reinterpret_cast( + reinterpret_cast(shifted_binding.str_len_buffer) + bind_type); + } + + if (shifted_row_status_array) { + shifted_row_status_array++; + } + } + } + } catch (...) 
{ + if (shifted_row_status_array) { + std::fill(shifted_row_status_array, &shifted_row_status_array[rows_to_fetch], + odbcabstraction::RowStatus_ERROR); + } + throw; + } + + if (rows_to_fetch != accessor_rows) { + throw DriverException("Expected the same number of rows for all columns"); + } + } + + current_row_ += static_cast(rows_to_fetch); + fetched_rows += rows_to_fetch; + } + + if (rows > fetched_rows && row_status_array) { + std::fill(&row_status_array[fetched_rows], &row_status_array[rows], + odbcabstraction::RowStatus_NOROW); + } + return fetched_rows; +} + +void FlightSqlResultSet::Close() { + chunk_buffer_.Close(); + current_chunk_.data = nullptr; +} + +void FlightSqlResultSet::Cancel() { + chunk_buffer_.Close(); + current_chunk_.data = nullptr; +} + +bool FlightSqlResultSet::GetData(int column_n, int16_t target_type, int precision, + int scale, void* buffer, size_t buffer_length, + ssize_t* str_len_buffer) { + reset_get_data_ = true; + // Check if the offset is already at the end. + int64_t& value_offset = get_data_offsets_[column_n - 1]; + if (value_offset == -1) { + return false; + } + + ColumnBinding binding(ConvertCDataTypeFromV2ToV3(target_type), precision, scale, buffer, + buffer_length, str_len_buffer); + + auto& column = columns_[column_n - 1]; + Accessor* accessor = column.GetAccessorForGetData(binding.target_type); + + // Note: current_row_ is always positioned at the index _after_ the one we are + // on after calling Move(). So if we want to get data from the _last_ row + // fetched, we need to subtract one from the current row. + accessor->GetColumnarData(&binding, current_row_ - 1, 1, value_offset, true, + diagnostics_, nullptr); + + // If there was truncation, the converter would have reported it to the diagnostics. 
+ return diagnostics_.HasWarning(); +} + +std::shared_ptr FlightSqlResultSet::GetMetadata() { return metadata_; } + +void FlightSqlResultSet::BindColumn(int column_n, int16_t target_type, int precision, + int scale, void* buffer, size_t buffer_length, + ssize_t* str_len_buffer) { + auto& column = columns_[column_n - 1]; + if (buffer == nullptr) { + if (column.is_bound) { + num_binding_--; + } + column.ResetBinding(); + return; + } + + if (!column.is_bound) { + num_binding_++; + } + + ColumnBinding binding(ConvertCDataTypeFromV2ToV3(target_type), precision, scale, buffer, + buffer_length, str_len_buffer); + column.SetBinding(binding, schema_->field(column_n - 1)->type()->id()); +} + +FlightSqlResultSet::~FlightSqlResultSet() = default; +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.h new file mode 100644 index 00000000000..db86bd4c4b6 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/flight/sql/client.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.h" +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/flight/types.h" + +namespace driver { +namespace flight_sql { + +using arrow::Schema; +using arrow::flight::FlightEndpoint; +using arrow::flight::FlightInfo; +using arrow::flight::FlightStreamChunk; +using arrow::flight::FlightStreamReader; +using arrow::flight::sql::FlightSqlClient; +using odbcabstraction::CDataType; +using odbcabstraction::DriverException; +using odbcabstraction::ResultSet; +using odbcabstraction::ResultSetMetadata; + +class FlightSqlResultSetColumn; + +class FlightSqlResultSet : public ResultSet { + private: + const odbcabstraction::MetadataSettings& metadata_settings_; + FlightStreamChunkBuffer chunk_buffer_; + FlightStreamChunk current_chunk_; + std::shared_ptr schema_; + std::shared_ptr transformer_; + std::shared_ptr metadata_; + std::vector columns_; + std::vector get_data_offsets_; + odbcabstraction::Diagnostics& diagnostics_; + int64_t current_row_; + int num_binding_; + bool reset_get_data_; + + public: + ~FlightSqlResultSet() override; + + FlightSqlResultSet(FlightSqlClient& flight_sql_client, + const arrow::flight::FlightCallOptions& call_options, + const std::shared_ptr& flight_info, + const std::shared_ptr& transformer, + odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings); + + void 
Close() override; + + void Cancel() override; + + bool GetData(int column_n, int16_t target_type, int precision, int scale, void* buffer, + size_t buffer_length, ssize_t* str_len_buffer) override; + + size_t Move(size_t rows, size_t bind_offset, size_t bind_type, + uint16_t* row_status_array) override; + + std::shared_ptr GetMetadata() override; + + void BindColumn(int column_n, int16_t target_type, int precision, int scale, + void* buffer, size_t buffer_length, ssize_t* str_len_buffer) override; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_accessors.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_accessors.cc new file mode 100644 index 00000000000..324d36d9436 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_accessors.cc @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_accessors.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/main.h" + +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +namespace driver { +namespace flight_sql { + +using arrow::Date32Array; +using arrow::Date64Array; +using arrow::Decimal128Array; +using arrow::DoubleArray; +using arrow::FloatArray; +using arrow::Int16Array; +using arrow::Int32Array; +using arrow::Int64Array; +using arrow::Int8Array; +using arrow::TimestampType; +using arrow::UInt16Array; +using arrow::UInt32Array; +using arrow::UInt64Array; +using arrow::UInt8Array; + +using odbcabstraction::CDataType; + +typedef std::pair SourceAndTargetPair; +typedef std::function AccessorConstructor; + +namespace { + +const std::unordered_map> + ACCESSORS_CONSTRUCTORS = { + {SourceAndTargetPair(arrow::Type::type::STRING, odbcabstraction::CDataType_CHAR), + [](arrow::Array* array) { + return new StringArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::STRING, odbcabstraction::CDataType_WCHAR), + CreateWCharStringArrayAccessor}, + {SourceAndTargetPair(arrow::Type::type::DOUBLE, + odbcabstraction::CDataType_DOUBLE), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::FLOAT, odbcabstraction::CDataType_FLOAT), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::INT64, + odbcabstraction::CDataType_SBIGINT), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::UINT64, + odbcabstraction::CDataType_UBIGINT), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::INT32, odbcabstraction::CDataType_SLONG), + [](arrow::Array* array) { + return new 
PrimitiveArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::UINT32, odbcabstraction::CDataType_ULONG), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::INT16, odbcabstraction::CDataType_SSHORT), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::UINT16, + odbcabstraction::CDataType_USHORT), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::INT8, + odbcabstraction::CDataType_STINYINT), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor< + Int8Array, odbcabstraction::CDataType_STINYINT>(array); + }}, + {SourceAndTargetPair(arrow::Type::type::UINT8, + odbcabstraction::CDataType_UTINYINT), + [](arrow::Array* array) { + return new PrimitiveArrayFlightSqlAccessor< + UInt8Array, odbcabstraction::CDataType_UTINYINT>(array); + }}, + {SourceAndTargetPair(arrow::Type::type::BOOL, odbcabstraction::CDataType_BIT), + [](arrow::Array* array) { + return new BooleanArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::BINARY, + odbcabstraction::CDataType_BINARY), + [](arrow::Array* array) { + return new BinaryArrayFlightSqlAccessor( + array); + }}, + {SourceAndTargetPair(arrow::Type::type::DATE32, odbcabstraction::CDataType_DATE), + [](arrow::Array* array) { + return new DateArrayFlightSqlAccessor(array); + }}, + {SourceAndTargetPair(arrow::Type::type::DATE64, odbcabstraction::CDataType_DATE), + [](arrow::Array* array) { + return new DateArrayFlightSqlAccessor(array); + }}, + {SourceAndTargetPair(arrow::Type::type::TIMESTAMP, + odbcabstraction::CDataType_TIMESTAMP), + [](arrow::Array* array) { + auto time_type = + arrow::internal::checked_pointer_cast(array->type()); + auto time_unit = time_type->unit(); + Accessor* result; + switch (time_unit) { + case 
TimeUnit::SECOND: + result = new TimestampArrayFlightSqlAccessor< + odbcabstraction::CDataType_TIMESTAMP, TimeUnit::SECOND>(array); + break; + case TimeUnit::MILLI: + result = new TimestampArrayFlightSqlAccessor< + odbcabstraction::CDataType_TIMESTAMP, TimeUnit::MILLI>(array); + break; + case TimeUnit::MICRO: + result = new TimestampArrayFlightSqlAccessor< + odbcabstraction::CDataType_TIMESTAMP, TimeUnit::MICRO>(array); + break; + case TimeUnit::NANO: + result = new TimestampArrayFlightSqlAccessor< + odbcabstraction::CDataType_TIMESTAMP, TimeUnit::NANO>(array); + break; + default: + assert(false); + throw DriverException("Unrecognized time unit " + + std::to_string(time_unit)); + } + return result; + }}, + {SourceAndTargetPair(arrow::Type::type::TIME32, odbcabstraction::CDataType_TIME), + [](arrow::Array* array) { + return CreateTimeAccessor(array, arrow::Type::type::TIME32); + }}, + {SourceAndTargetPair(arrow::Type::type::TIME64, odbcabstraction::CDataType_TIME), + [](arrow::Array* array) { + return CreateTimeAccessor(array, arrow::Type::type::TIME64); + }}, + {SourceAndTargetPair(arrow::Type::type::DECIMAL128, + odbcabstraction::CDataType_NUMERIC), + [](arrow::Array* array) { + return new DecimalArrayFlightSqlAccessor( + array); + }}}; +} // namespace + +std::unique_ptr CreateAccessor(arrow::Array* source_array, + CDataType target_type) { + auto it = ACCESSORS_CONSTRUCTORS.find( + SourceAndTargetPair(source_array->type_id(), target_type)); + if (it != ACCESSORS_CONSTRUCTORS.end()) { + auto accessor = it->second(source_array); + return std::unique_ptr(accessor); + } + + std::stringstream ss; + ss << "Unsupported type conversion! 
Tried to convert '" + << source_array->type()->ToString() << "' to C type '" << target_type << "'"; + throw odbcabstraction::DriverException(ss.str()); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_accessors.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_accessors.h new file mode 100644 index 00000000000..3f7d6856083 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_accessors.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +namespace driver { +namespace flight_sql { + +class Accessor; +class FlightSqlResultSet; + +std::unique_ptr CreateAccessor(arrow::Array* source_array, + odbcabstraction::CDataType target_type); + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_column.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_column.cc new file mode 100644 index 00000000000..2a5d116b1e4 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_column.cc @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_column.h" +#include +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_accessors.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" + +namespace driver { +namespace flight_sql { + +namespace { +std::shared_ptr CastArray(const std::shared_ptr& original_array, + CDataType target_type) { + bool conversion = NeedArrayConversion(original_array->type()->id(), target_type); + + if (conversion) { + auto converter = GetConverter(original_array->type_id(), target_type); + return converter(original_array); + } else { + return original_array; + } +} +} // namespace + +std::unique_ptr FlightSqlResultSetColumn::CreateAccessor( + CDataType target_type) { + cached_casted_array_ = CastArray(original_array_, target_type); + + return flight_sql::CreateAccessor(cached_casted_array_.get(), target_type); +} + +Accessor* FlightSqlResultSetColumn::GetAccessorForTargetType(CDataType target_type) { + // Cast the original array to a type matching the target_type. 
+ if (target_type == odbcabstraction::CDataType_DEFAULT) { + target_type = ConvertArrowTypeToC(original_array_->type_id(), use_wide_char); + } + + cached_accessor_ = CreateAccessor(target_type); + return cached_accessor_.get(); +} + +FlightSqlResultSetColumn::FlightSqlResultSetColumn(bool use_wide_char) + : use_wide_char(use_wide_char), is_bound(false) {} + +void FlightSqlResultSetColumn::SetBinding(const ColumnBinding& new_binding, + arrow::Type::type arrow_type) { + binding = new_binding; + is_bound = true; + + if (binding.target_type == odbcabstraction::CDataType_DEFAULT) { + binding.target_type = ConvertArrowTypeToC(arrow_type, use_wide_char); + } + + // Overwrite the binding if the caller is using SQL_C_NUMERIC and has used zero + // precision if it is zero (this is precision unset and will always fail). + if (binding.precision == 0 && + binding.target_type == odbcabstraction::CDataType_NUMERIC) { + binding.precision = arrow::Decimal128Type::kMaxPrecision; + } + + // Rebuild the accessor and casted array if the target type changed. + if (original_array_ && + (!cached_casted_array_ || cached_accessor_->target_type_ != binding.target_type)) { + cached_accessor_ = CreateAccessor(binding.target_type); + } +} + +void FlightSqlResultSetColumn::ResetBinding() { + is_bound = false; + cached_casted_array_.reset(); + cached_accessor_.reset(); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_column.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_column.h new file mode 100644 index 00000000000..e530c17efba --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_column.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/array.h" +#include "arrow/flight/sql/odbc/flight_sql/accessors/types.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" + +namespace driver { +namespace flight_sql { + +using arrow::Array; + +class FlightSqlResultSetColumn { + private: + std::shared_ptr original_array_; + std::shared_ptr cached_casted_array_; + std::unique_ptr cached_accessor_; + + std::unique_ptr CreateAccessor(CDataType target_type); + + Accessor* GetAccessorForTargetType(CDataType target_type); + + public: + FlightSqlResultSetColumn() = default; + explicit FlightSqlResultSetColumn(bool use_wide_char); + + ColumnBinding binding; + bool use_wide_char; + bool is_bound; + + inline Accessor* GetAccessorForBinding() { return cached_accessor_.get(); } + + inline Accessor* GetAccessorForGetData(CDataType target_type) { + if (target_type == odbcabstraction::CDataType_DEFAULT) { + target_type = ConvertArrowTypeToC(original_array_->type_id(), use_wide_char); + } + + if (cached_accessor_ && cached_accessor_->target_type_ == target_type) { + return cached_accessor_.get(); + } + return GetAccessorForTargetType(target_type); + } + + void SetBinding(const ColumnBinding& new_binding, arrow::Type::type arrow_type); + + void ResetBinding(); + + inline void ResetAccessor(std::shared_ptr array) { + original_array_ = std::move(array); + if (cached_accessor_) { + cached_accessor_ = 
CreateAccessor(cached_accessor_->target_type_); + } else if (is_bound) { + cached_accessor_ = CreateAccessor(binding.target_type); + } else { + cached_casted_array_.reset(); + cached_accessor_.reset(); + } + } +}; +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.cc new file mode 100644 index 00000000000..38b1410b6ac --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.cc @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.h" +#include +#include +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" + +namespace driver { +namespace flight_sql { + +using arrow::DataType; +using arrow::Field; +using odbcabstraction::SqlDataType; + +using std::make_optional; +using std::nullopt; + +constexpr int32_t DefaultDecimalPrecision = 38; + +// This indicates the column length used when the both property StringColumnLength is not +// specified and the server does not provide a length on column metadata. +constexpr int32_t DefaultLengthForVariableLengthColumns = 1024; + +namespace { +std::shared_ptr empty_metadata_map( + new arrow::KeyValueMetadata); + +inline arrow::flight::sql::ColumnMetadata GetMetadata( + const std::shared_ptr& field) { + const auto& metadata_map = field->metadata(); + + arrow::flight::sql::ColumnMetadata metadata(metadata_map ? 
metadata_map + : empty_metadata_map); + return metadata; +} + +arrow::Result GetFieldPrecision(const std::shared_ptr& field) { + return GetMetadata(field).GetPrecision(); +} +} // namespace + +size_t FlightSqlResultSetMetadata::GetColumnCount() { return schema_->num_fields(); } + +std::string FlightSqlResultSetMetadata::GetColumnName(int column_position) { + return schema_->field(column_position - 1)->name(); +} + +std::string FlightSqlResultSetMetadata::GetName(int column_position) { + return schema_->field(column_position - 1)->name(); +} + +size_t FlightSqlResultSetMetadata::GetPrecision(int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + + int32_t column_size = GetFieldPrecision(field).ValueOrElse([] { return 0; }); + SqlDataType data_type_v3 = + GetDataTypeFromArrowFieldV3(field, metadata_settings_.use_wide_char); + + return GetColumnSize(data_type_v3, column_size).value_or(0); +} + +size_t FlightSqlResultSetMetadata::GetScale(int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + arrow::flight::sql::ColumnMetadata metadata = GetMetadata(field); + + int32_t type_scale = metadata.GetScale().ValueOrElse([] { return 0; }); + SqlDataType data_type_v3 = + GetDataTypeFromArrowFieldV3(field, metadata_settings_.use_wide_char); + + return GetTypeScale(data_type_v3, type_scale).value_or(0); +} + +uint16_t FlightSqlResultSetMetadata::GetDataType(int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + const SqlDataType concise_type = + GetDataTypeFromArrowFieldV3(field, metadata_settings_.use_wide_char); + return GetNonConciseDataType(concise_type); +} + +driver::odbcabstraction::Nullability FlightSqlResultSetMetadata::IsNullable( + int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + return field->nullable() ? 
odbcabstraction::NULLABILITY_NULLABLE + : odbcabstraction::NULLABILITY_NO_NULLS; +} + +std::string FlightSqlResultSetMetadata::GetSchemaName(int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + + return metadata.GetSchemaName().ValueOrElse([] { return ""; }); +} + +std::string FlightSqlResultSetMetadata::GetCatalogName(int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + + return metadata.GetCatalogName().ValueOrElse([] { return ""; }); +} + +std::string FlightSqlResultSetMetadata::GetTableName(int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + + return metadata.GetTableName().ValueOrElse([] { return ""; }); +} + +std::string FlightSqlResultSetMetadata::GetColumnLabel(int column_position) { + return schema_->field(column_position - 1)->name(); +} + +size_t FlightSqlResultSetMetadata::GetColumnDisplaySize(int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + + int32_t column_size = metadata_settings_.string_column_length.value_or( + GetFieldPrecision(field).ValueOr(DefaultLengthForVariableLengthColumns)); + SqlDataType data_type_v3 = + GetDataTypeFromArrowFieldV3(field, metadata_settings_.use_wide_char); + + return GetDisplaySize(data_type_v3, column_size).value_or(odbcabstraction::NO_TOTAL); +} + +std::string FlightSqlResultSetMetadata::GetBaseColumnName(int column_position) { + return schema_->field(column_position - 1)->name(); +} + +std::string FlightSqlResultSetMetadata::GetBaseTableName(int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + return metadata.GetTableName().ValueOrElse([] { return ""; }); +} + +uint16_t FlightSqlResultSetMetadata::GetConciseType(int column_position) { + const std::shared_ptr& field = 
schema_->field(column_position - 1); + + const SqlDataType sqlColumnType = + GetDataTypeFromArrowFieldV3(field, metadata_settings_.use_wide_char); + return sqlColumnType; +} + +size_t FlightSqlResultSetMetadata::GetLength(int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + + int32_t column_size = metadata_settings_.string_column_length.value_or( + GetFieldPrecision(field).ValueOr(DefaultLengthForVariableLengthColumns)); + SqlDataType data_type_v3 = + GetDataTypeFromArrowFieldV3(field, metadata_settings_.use_wide_char); + + return flight_sql::GetLength(data_type_v3, column_size) + .value_or(DefaultLengthForVariableLengthColumns); +} + +std::string FlightSqlResultSetMetadata::GetLiteralPrefix(int column_position) { + // TODO: Flight SQL column metadata does not have this, should we add to the spec? + return ""; +} + +std::string FlightSqlResultSetMetadata::GetLiteralSuffix(int column_position) { + // TODO: Flight SQL column metadata does not have this, should we add to the spec? + return ""; +} + +std::string FlightSqlResultSetMetadata::GetLocalTypeName(int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + + // TODO: Is local type name the same as type name? 
+ return metadata.GetTypeName().ValueOrElse([] { return ""; }); +} + +size_t FlightSqlResultSetMetadata::GetNumPrecRadix(int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + SqlDataType data_type_v3 = + GetDataTypeFromArrowFieldV3(field, metadata_settings_.use_wide_char); + + return GetRadixFromSqlDataType(data_type_v3).value_or(odbcabstraction::NO_TOTAL); +} + +size_t FlightSqlResultSetMetadata::GetOctetLength(int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + arrow::flight::sql::ColumnMetadata metadata = GetMetadata(field); + + int32_t column_size = metadata_settings_.string_column_length.value_or( + GetFieldPrecision(field).ValueOr(DefaultLengthForVariableLengthColumns)); + SqlDataType data_type_v3 = + GetDataTypeFromArrowFieldV3(field, metadata_settings_.use_wide_char); + + // Workaround to get the precision for Decimal and Numeric types, since server doesn't + // return it currently. + // TODO: Use the server precision when its fixed. 
+ std::shared_ptr arrow_type = field->type(); + if (arrow_type->id() == arrow::Type::DECIMAL128) { + int32_t precision = GetDecimalTypePrecision(arrow_type); + return GetCharOctetLength(data_type_v3, column_size, precision) + .value_or(DefaultDecimalPrecision + 2); + } + + return GetCharOctetLength(data_type_v3, column_size) + .value_or(DefaultLengthForVariableLengthColumns); +} + +std::string FlightSqlResultSetMetadata::GetTypeName(int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + + return metadata.GetTypeName().ValueOrElse([] { return ""; }); +} + +driver::odbcabstraction::Updatability FlightSqlResultSetMetadata::GetUpdatable( + int column_position) { + return odbcabstraction::UPDATABILITY_READWRITE_UNKNOWN; +} + +bool FlightSqlResultSetMetadata::IsAutoUnique(int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + + // TODO: Is AutoUnique equivalent to AutoIncrement? + return metadata.GetIsAutoIncrement().ValueOrElse([] { return false; }); +} + +bool FlightSqlResultSetMetadata::IsCaseSensitive(int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + + return metadata.GetIsCaseSensitive().ValueOrElse([] { return false; }); +} + +driver::odbcabstraction::Searchability FlightSqlResultSetMetadata::IsSearchable( + int column_position) { + arrow::flight::sql::ColumnMetadata metadata = + GetMetadata(schema_->field(column_position - 1)); + + bool is_searchable = metadata.GetIsSearchable().ValueOrElse([] { return false; }); + return is_searchable ? 
odbcabstraction::SEARCHABILITY_ALL + : odbcabstraction::SEARCHABILITY_NONE; +} + +bool FlightSqlResultSetMetadata::IsUnsigned(int column_position) { + const std::shared_ptr& field = schema_->field(column_position - 1); + + switch (field->type()->id()) { + case arrow::Type::UINT8: + case arrow::Type::UINT16: + case arrow::Type::UINT32: + case arrow::Type::UINT64: + return true; + default: + return false; + } +} + +bool FlightSqlResultSetMetadata::IsFixedPrecScale(int column_position) { + // TODO: Flight SQL column metadata does not have this, should we add to the spec? + return false; +} + +FlightSqlResultSetMetadata::FlightSqlResultSetMetadata( + std::shared_ptr schema, + const odbcabstraction::MetadataSettings& metadata_settings) + : metadata_settings_(metadata_settings), schema_(std::move(schema)) {} + +FlightSqlResultSetMetadata::FlightSqlResultSetMetadata( + const std::shared_ptr& flight_info, + const odbcabstraction::MetadataSettings& metadata_settings) + : metadata_settings_(metadata_settings) { + arrow::ipc::DictionaryMemo dict_memo; + + ThrowIfNotOK(flight_info->GetSchema(&dict_memo).Value(&schema_)); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.h new file mode 100644 index 00000000000..f8e78eb2d6d --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set_metadata.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/flight/types.h" +#include "arrow/type.h" + +namespace driver { +namespace flight_sql { +class FlightSqlResultSetMetadata : public odbcabstraction::ResultSetMetadata { + private: + const odbcabstraction::MetadataSettings& metadata_settings_; + std::shared_ptr schema_; + + public: + FlightSqlResultSetMetadata( + const std::shared_ptr& flight_info, + const odbcabstraction::MetadataSettings& metadata_settings); + + FlightSqlResultSetMetadata(std::shared_ptr schema, + const odbcabstraction::MetadataSettings& metadata_settings); + + size_t GetColumnCount() override; + + std::string GetColumnName(int column_position) override; + + size_t GetPrecision(int column_position) override; + + size_t GetScale(int column_position) override; + + uint16_t GetDataType(int column_position) override; + + odbcabstraction::Nullability IsNullable(int column_position) override; + + std::string GetSchemaName(int column_position) override; + + std::string GetCatalogName(int column_position) override; + + std::string GetTableName(int column_position) override; + + std::string GetColumnLabel(int column_position) override; + + size_t GetColumnDisplaySize(int column_position) override; + + std::string GetBaseColumnName(int column_position) override; + + std::string GetBaseTableName(int column_position) override; + + uint16_t GetConciseType(int column_position) 
override; + + size_t GetLength(int column_position) override; + + std::string GetLiteralPrefix(int column_position) override; + + std::string GetLiteralSuffix(int column_position) override; + + std::string GetLocalTypeName(int column_position) override; + + std::string GetName(int column_position) override; + + size_t GetNumPrecRadix(int column_position) override; + + size_t GetOctetLength(int column_position) override; + + std::string GetTypeName(int column_position) override; + + odbcabstraction::Updatability GetUpdatable(int column_position) override; + + bool IsAutoUnique(int column_position) override; + + bool IsCaseSensitive(int column_position) override; + + odbcabstraction::Searchability IsSearchable(int column_position) override; + + bool IsUnsigned(int column_position) override; + + bool IsFixedPrecScale(int column_position) override; +}; +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_ssl_config.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_ssl_config.cc new file mode 100644 index 00000000000..9becf0e6f1f --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_ssl_config.cc @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_ssl_config.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" + +namespace driver { +namespace flight_sql { + +FlightSqlSslConfig::FlightSqlSslConfig(bool disable_certificate_verification, + const std::string& trusted_certs, + bool system_trust_store, bool use_encryption) + : trusted_certs_(trusted_certs), + use_encryption_(use_encryption), + disable_certificate_verification_(disable_certificate_verification), + system_trust_store_(system_trust_store) {} + +bool FlightSqlSslConfig::UseEncryption() const { return use_encryption_; } + +bool FlightSqlSslConfig::ShouldDisableCertificateVerification() const { + return disable_certificate_verification_; +} + +const std::string& FlightSqlSslConfig::GetTrustedCerts() const { return trusted_certs_; } + +bool FlightSqlSslConfig::UseSystemTrustStore() const { return system_trust_store_; } + +void FlightSqlSslConfig::PopulateOptionsWithCerts(arrow::flight::CertKeyPair* out) { + try { + std::ifstream cert_file(trusted_certs_); + if (!cert_file) { + throw odbcabstraction::DriverException("Could not open certificate: " + + trusted_certs_); + } + std::stringstream cert; + cert << cert_file.rdbuf(); + out->pem_cert = cert.str(); + } catch (const std::ifstream::failure& e) { + throw odbcabstraction::DriverException(e.what()); + } +} +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_ssl_config.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_ssl_config.h new file mode 100644 index 00000000000..bf1c3c9cbba --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_ssl_config.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace driver { +namespace flight_sql { + +/// \brief An Auxiliary class that holds all the information to perform +/// a SSL connection. +class FlightSqlSslConfig { + public: + FlightSqlSslConfig(bool disable_certificate_verification, + const std::string& trusted_certs, bool system_trust_store, + bool use_encryption); + + /// \brief Tells if ssl is enabled. By default it will be true. + /// \return Whether ssl is enabled. + bool UseEncryption() const; + + /// \brief Tells if disable certificate verification is enabled. + /// \return Whether disable certificate verification is enabled. + bool ShouldDisableCertificateVerification() const; + + /// \brief The path to the trusted certificate. + /// \return Certificate path. + const std::string& GetTrustedCerts() const; + + /// \brief Tells if we need to check if the certificate is in the system trust store. + /// \return Whether to use the system trust store. + bool UseSystemTrustStore() const; + + /// \brief Loads the certificate file and extract the certificate file from it + /// and create the object CertKeyPair with it on. + /// \param out A CertKeyPair with the cert on it. 
+ void PopulateOptionsWithCerts(arrow::flight::CertKeyPair* out); + + private: + const std::string trusted_certs_; + const bool use_encryption_; + const bool disable_certificate_verification_; + const bool system_trust_store_; +}; +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement.cc new file mode 100644 index 00000000000..137594b68d6 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement.cc @@ -0,0 +1,298 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement.h" +#include +#include +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_columns.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_type_info.h" +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/io/memory.h" + +#include +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" + +namespace driver { +namespace flight_sql { + +using arrow::Result; +using arrow::Status; +using arrow::flight::FlightCallOptions; +using arrow::flight::FlightClientOptions; +using arrow::flight::FlightInfo; +using arrow::flight::Location; +using arrow::flight::TimeoutDuration; +using arrow::flight::sql::FlightSqlClient; +using arrow::flight::sql::PreparedStatement; +using driver::odbcabstraction::DriverException; +using driver::odbcabstraction::ResultSet; +using driver::odbcabstraction::ResultSetMetadata; +using driver::odbcabstraction::Statement; + +namespace { + +void ClosePreparedStatementIfAny( + std::shared_ptr& prepared_statement) { + if (prepared_statement != nullptr) { + ThrowIfNotOK(prepared_statement->Close()); + prepared_statement.reset(); + } +} + +} // namespace + +FlightSqlStatement::FlightSqlStatement( + const odbcabstraction::Diagnostics& diagnostics, FlightSqlClient& sql_client, + FlightCallOptions call_options, + const odbcabstraction::MetadataSettings& metadata_settings) + : diagnostics_("Apache Arrow", diagnostics.GetDataSourceComponent(), + diagnostics.GetOdbcVersion()), + sql_client_(sql_client), + 
call_options_(std::move(call_options)), + metadata_settings_(metadata_settings) { + attribute_[METADATA_ID] = static_cast(SQL_FALSE); + attribute_[MAX_LENGTH] = static_cast(0); + attribute_[NOSCAN] = static_cast(SQL_NOSCAN_OFF); + attribute_[QUERY_TIMEOUT] = static_cast(0); + call_options_.timeout = TimeoutDuration{-1}; +} + +bool FlightSqlStatement::SetAttribute(StatementAttributeId attribute, + const Attribute& value) { + switch (attribute) { + case METADATA_ID: + return CheckIfSetToOnlyValidValue(value, static_cast(SQL_FALSE)); + case NOSCAN: + return CheckIfSetToOnlyValidValue(value, static_cast(SQL_NOSCAN_OFF)); + case MAX_LENGTH: + return CheckIfSetToOnlyValidValue(value, static_cast(0)); + case QUERY_TIMEOUT: + if (boost::get(value) > 0) { + call_options_.timeout = + TimeoutDuration{static_cast(boost::get(value))}; + } else { + call_options_.timeout = TimeoutDuration{-1}; + // Intentional fall-through. + } + default: + attribute_[attribute] = value; + return true; + } +} + +boost::optional FlightSqlStatement::GetAttribute( + StatementAttributeId attribute) { + const auto& it = attribute_.find(attribute); + return boost::make_optional(it != attribute_.end(), it->second); +} + +boost::optional> FlightSqlStatement::Prepare( + const std::string& query) { + ClosePreparedStatementIfAny(prepared_statement_); + + Result> result = + sql_client_.Prepare(call_options_, query); + ThrowIfNotOK(result.status()); + + prepared_statement_ = *result; + + const auto& result_set_metadata = std::make_shared( + prepared_statement_->dataset_schema(), metadata_settings_); + return boost::optional>(result_set_metadata); +} + +bool FlightSqlStatement::ExecutePrepared() { + assert(prepared_statement_.get() != nullptr); + + Result> result = prepared_statement_->Execute(); + ThrowIfNotOK(result.status()); + + current_result_set_ = std::make_shared( + sql_client_, call_options_, result.ValueOrDie(), nullptr, diagnostics_, + metadata_settings_); + + return true; +} + +bool 
FlightSqlStatement::Execute(const std::string& query) { + ClosePreparedStatementIfAny(prepared_statement_); + + Result> result = sql_client_.Execute(call_options_, query); + ThrowIfNotOK(result.status()); + + current_result_set_ = std::make_shared( + sql_client_, call_options_, result.ValueOrDie(), nullptr, diagnostics_, + metadata_settings_); + + return true; +} + +std::shared_ptr FlightSqlStatement::GetResultSet() { + return current_result_set_; +} + +int64_t FlightSqlStatement::GetUpdateCount() { return -1; } + +std::shared_ptr FlightSqlStatement::GetTables( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* table_type, + const ColumnNames& column_names) { + ClosePreparedStatementIfAny(prepared_statement_); + + std::vector table_types; + + if ((catalog_name && *catalog_name == "%") && (schema_name && schema_name->empty()) && + (table_name && table_name->empty())) { + current_result_set_ = GetTablesForSQLAllCatalogs( + column_names, call_options_, sql_client_, diagnostics_, metadata_settings_); + } else if ((catalog_name && catalog_name->empty()) && + (schema_name && *schema_name == "%") && + (table_name && table_name->empty())) { + current_result_set_ = + GetTablesForSQLAllDbSchemas(column_names, call_options_, sql_client_, schema_name, + diagnostics_, metadata_settings_); + } else if ((catalog_name && catalog_name->empty()) && + (schema_name && schema_name->empty()) && + (table_name && table_name->empty()) && (table_type && *table_type == "%")) { + current_result_set_ = GetTablesForSQLAllTableTypes( + column_names, call_options_, sql_client_, diagnostics_, metadata_settings_); + } else { + if (table_type) { + ParseTableTypes(*table_type, table_types); + } + + current_result_set_ = GetTablesForGenericUse( + column_names, call_options_, sql_client_, catalog_name, schema_name, table_name, + table_types, diagnostics_, metadata_settings_); + } + + return current_result_set_; +} + +std::shared_ptr 
FlightSqlStatement::GetTables_V2( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* table_type) { + ColumnNames column_names{"TABLE_QUALIFIER", "TABLE_OWNER", "TABLE_NAME", "TABLE_TYPE", + "REMARKS"}; + + return GetTables(catalog_name, schema_name, table_name, table_type, column_names); +} + +std::shared_ptr FlightSqlStatement::GetTables_V3( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* table_type) { + ColumnNames column_names{"TABLE_CAT", "TABLE_SCHEM", "TABLE_NAME", "TABLE_TYPE", + "REMARKS"}; + + return GetTables(catalog_name, schema_name, table_name, table_type, column_names); +} + +std::shared_ptr FlightSqlStatement::GetColumns_V2( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* column_name) { + ClosePreparedStatementIfAny(prepared_statement_); + + Result> result = sql_client_.GetTables( + call_options_, catalog_name, schema_name, table_name, true, nullptr); + ThrowIfNotOK(result.status()); + + auto flight_info = result.ValueOrDie(); + + auto transformer = std::make_shared( + metadata_settings_, odbcabstraction::V_2, column_name); + + current_result_set_ = + std::make_shared(sql_client_, call_options_, flight_info, + transformer, diagnostics_, metadata_settings_); + + return current_result_set_; +} + +std::shared_ptr FlightSqlStatement::GetColumns_V3( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* column_name) { + ClosePreparedStatementIfAny(prepared_statement_); + + Result> result = sql_client_.GetTables( + call_options_, catalog_name, schema_name, table_name, true, nullptr); + ThrowIfNotOK(result.status()); + + auto flight_info = result.ValueOrDie(); + + auto transformer = std::make_shared( + metadata_settings_, odbcabstraction::V_3, column_name); + + current_result_set_ = + 
std::make_shared(sql_client_, call_options_, flight_info, + transformer, diagnostics_, metadata_settings_); + + return current_result_set_; +} + +std::shared_ptr FlightSqlStatement::GetTypeInfo_V2(int16_t data_type) { + ClosePreparedStatementIfAny(prepared_statement_); + + Result> result = sql_client_.GetXdbcTypeInfo(call_options_); + ThrowIfNotOK(result.status()); + + auto flight_info = result.ValueOrDie(); + + auto transformer = std::make_shared( + metadata_settings_, odbcabstraction::V_2, data_type); + + current_result_set_ = + std::make_shared(sql_client_, call_options_, flight_info, + transformer, diagnostics_, metadata_settings_); + + return current_result_set_; +} + +std::shared_ptr FlightSqlStatement::GetTypeInfo_V3(int16_t data_type) { + ClosePreparedStatementIfAny(prepared_statement_); + + Result> result = sql_client_.GetXdbcTypeInfo(call_options_); + ThrowIfNotOK(result.status()); + + auto flight_info = result.ValueOrDie(); + + auto transformer = std::make_shared( + metadata_settings_, odbcabstraction::V_3, data_type); + + current_result_set_ = + std::make_shared(sql_client_, call_options_, flight_info, + transformer, diagnostics_, metadata_settings_); + + return current_result_set_; +} + +odbcabstraction::Diagnostics& FlightSqlStatement::GetDiagnostics() { + return diagnostics_; +} + +void FlightSqlStatement::Cancel() { + if (!current_result_set_) return; + current_result_set_->Cancel(); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement.h new file mode 100644 index 00000000000..7ffb02ba40b --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/statement.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" + +#include "arrow/flight/api.h" +#include "arrow/flight/sql/api.h" +#include "arrow/flight/types.h" + +namespace driver { +namespace flight_sql { + +class FlightSqlStatement : public odbcabstraction::Statement { + private: + odbcabstraction::Diagnostics diagnostics_; + std::map attribute_; + arrow::flight::FlightCallOptions call_options_; + arrow::flight::sql::FlightSqlClient& sql_client_; + std::shared_ptr current_result_set_; + std::shared_ptr prepared_statement_; + const odbcabstraction::MetadataSettings& metadata_settings_; + + std::shared_ptr GetTables(const std::string* catalog_name, + const std::string* schema_name, + const std::string* table_name, + const std::string* table_type, + const ColumnNames& column_names); + + public: + FlightSqlStatement(const odbcabstraction::Diagnostics& diagnostics, + arrow::flight::sql::FlightSqlClient& sql_client, + arrow::flight::FlightCallOptions call_options, + const odbcabstraction::MetadataSettings& metadata_settings); + + bool SetAttribute(StatementAttributeId attribute, const Attribute& 
value) override; + + boost::optional GetAttribute(StatementAttributeId attribute) override; + + boost::optional> Prepare( + const std::string& query) override; + + bool ExecutePrepared() override; + + bool Execute(const std::string& query) override; + + std::shared_ptr GetResultSet() override; + + int64_t GetUpdateCount() override; + + std::shared_ptr GetTables_V2( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* table_type) override; + + std::shared_ptr GetTables_V3( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* table_type) override; + + std::shared_ptr GetColumns_V2( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* column_name) override; + + std::shared_ptr GetColumns_V3( + const std::string* catalog_name, const std::string* schema_name, + const std::string* table_name, const std::string* column_name) override; + + std::shared_ptr GetTypeInfo_V2(int16_t data_type) override; + + std::shared_ptr GetTypeInfo_V3(int16_t data_type) override; + + odbcabstraction::Diagnostics& GetDiagnostics() override; + + void Cancel() override; +}; +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_columns.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_columns.cc new file mode 100644 index 00000000000..56a21c04af3 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_columns.cc @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_columns.h" +#include "arrow/flight/sql/column_metadata.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_get_tables_reader.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +namespace driver { +namespace flight_sql { + +using arrow::Schema; +using arrow::flight::sql::ColumnMetadata; +using std::make_optional; +using std::nullopt; +using std::optional; + +namespace { +std::shared_ptr GetColumns_V3_Schema() { + return arrow::schema({ + field("TABLE_CAT", arrow::utf8()), + field("TABLE_SCHEM", arrow::utf8()), + field("TABLE_NAME", arrow::utf8()), + field("COLUMN_NAME", arrow::utf8()), + field("DATA_TYPE", arrow::int16()), + field("TYPE_NAME", arrow::utf8()), + field("COLUMN_SIZE", arrow::int32()), + field("BUFFER_LENGTH", arrow::int32()), + field("DECIMAL_DIGITS", arrow::int16()), + field("NUM_PREC_RADIX", arrow::int16()), + field("NULLABLE", arrow::int16()), + field("REMARKS", arrow::utf8()), + field("COLUMN_DEF", arrow::utf8()), + field("SQL_DATA_TYPE", arrow::int16()), + field("SQL_DATETIME_SUB", arrow::int16()), + field("CHAR_OCTET_LENGTH", arrow::int32()), + field("ORDINAL_POSITION", arrow::int32()), + field("IS_NULLABLE", arrow::utf8()), + }); +} 
+ +std::shared_ptr GetColumns_V2_Schema() { + return arrow::schema({ + field("TABLE_QUALIFIER", arrow::utf8()), + field("TABLE_OWNER", arrow::utf8()), + field("TABLE_NAME", arrow::utf8()), + field("COLUMN_NAME", arrow::utf8()), + field("DATA_TYPE", arrow::int16()), + field("TYPE_NAME", arrow::utf8()), + field("PRECISION", arrow::int32()), + field("LENGTH", arrow::int32()), + field("SCALE", arrow::int16()), + field("RADIX", arrow::int16()), + field("NULLABLE", arrow::int16()), + field("REMARKS", arrow::utf8()), + field("COLUMN_DEF", arrow::utf8()), + field("SQL_DATA_TYPE", arrow::int16()), + field("SQL_DATETIME_SUB", arrow::int16()), + field("CHAR_OCTET_LENGTH", arrow::int32()), + field("ORDINAL_POSITION", arrow::int32()), + field("IS_NULLABLE", arrow::utf8()), + }); +} + +Result> TransformInner( + const odbcabstraction::OdbcVersion odbc_version, + const std::shared_ptr& original, + const optional& column_name_pattern, + const MetadataSettings& metadata_settings) { + GetColumns_RecordBatchBuilder builder(odbc_version); + GetColumns_RecordBatchBuilder::Data data; + + GetTablesReader reader(original); + + optional column_name_regex = + column_name_pattern ? make_optional(ConvertSqlPatternToRegex(*column_name_pattern)) + : nullopt; + + while (reader.Next()) { + const auto& table_catalog = reader.GetCatalogName(); + const auto& table_schema = reader.GetDbSchemaName(); + const auto& table_name = reader.GetTableName(); + const std::shared_ptr& schema = reader.GetSchema(); + if (schema == nullptr) { + // TODO: Remove this if after fixing TODO on GetTablesReader::GetSchema() + // This is because of a problem on Dremio server, where complex types columns + // are being returned without the children types, so we are simply ignoring + // it by now. 
+ continue; + } + for (int i = 0; i < schema->num_fields(); ++i) { + const std::shared_ptr& field = schema->field(i); + + if (column_name_regex && + !boost::xpressive::regex_match(field->name(), *column_name_regex)) { + continue; + } + + odbcabstraction::SqlDataType data_type_v3 = + GetDataTypeFromArrowFieldV3(field, metadata_settings.use_wide_char); + + ColumnMetadata metadata(field->metadata()); + + data.table_cat = table_catalog; + data.table_schem = table_schema; + data.table_name = table_name; + data.column_name = field->name(); + data.data_type = odbc_version == odbcabstraction::V_3 + ? data_type_v3 + : ConvertSqlDataTypeFromV3ToV2(data_type_v3); + + // TODO: Use `metadata.GetTypeName()` when ARROW-16064 is merged. + const auto& type_name_result = field->metadata()->Get("ARROW:FLIGHT:SQL:TYPE_NAME"); + data.type_name = type_name_result.ok() ? type_name_result.ValueOrDie() + : GetTypeNameFromSqlDataType(data_type_v3); + + const Result& precision_result = metadata.GetPrecision(); + data.column_size = + precision_result.ok() ? make_optional(precision_result.ValueOrDie()) : nullopt; + data.char_octet_length = GetCharOctetLength(data_type_v3, precision_result); + + data.buffer_length = GetBufferLength(data_type_v3, data.column_size); + + const Result& scale_result = metadata.GetScale(); + data.decimal_digits = + scale_result.ok() ? make_optional(scale_result.ValueOrDie()) : nullopt; + data.num_prec_radix = GetRadixFromSqlDataType(data_type_v3); + data.nullable = field->nullable(); + data.remarks = nullopt; + data.column_def = nullopt; + data.sql_data_type = GetNonConciseDataType(data_type_v3); + data.sql_datetime_sub = GetSqlDateTimeSubCode(data_type_v3); + data.ordinal_position = i + 1; + data.is_nullable = field->nullable() ? 
"YES" : "NO"; + + ARROW_RETURN_NOT_OK(builder.Append(data)); + } + } + + return builder.Build(); +} +} // namespace + +GetColumns_RecordBatchBuilder::GetColumns_RecordBatchBuilder( + odbcabstraction::OdbcVersion odbc_version) + : odbc_version_(odbc_version) {} + +Result> GetColumns_RecordBatchBuilder::Build() { + ARROW_ASSIGN_OR_RAISE(auto TABLE_CAT_Array, TABLE_CAT_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto TABLE_SCHEM_Array, TABLE_SCHEM_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto TABLE_NAME_Array, TABLE_NAME_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto COLUMN_NAME_Array, COLUMN_NAME_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto DATA_TYPE_Array, DATA_TYPE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto TYPE_NAME_Array, TYPE_NAME_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto COLUMN_SIZE_Array, COLUMN_SIZE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto BUFFER_LENGTH_Array, BUFFER_LENGTH_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto DECIMAL_DIGITS_Array, DECIMAL_DIGITS_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto NUM_PREC_RADIX_Array, NUM_PREC_RADIX_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto NULLABLE_Array, NULLABLE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto REMARKS_Array, REMARKS_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto COLUMN_DEF_Array, COLUMN_DEF_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto SQL_DATA_TYPE_Array, SQL_DATA_TYPE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto SQL_DATETIME_SUB_Array, SQL_DATETIME_SUB_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto CHAR_OCTET_LENGTH_Array, CHAR_OCTET_LENGTH_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto ORDINAL_POSITION_Array, ORDINAL_POSITION_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto IS_NULLABLE_Array, IS_NULLABLE_Builder_.Finish()) + + std::vector> arrays = { + TABLE_CAT_Array, TABLE_SCHEM_Array, TABLE_NAME_Array, + COLUMN_NAME_Array, DATA_TYPE_Array, TYPE_NAME_Array, + COLUMN_SIZE_Array, BUFFER_LENGTH_Array, DECIMAL_DIGITS_Array, + NUM_PREC_RADIX_Array, NULLABLE_Array, 
REMARKS_Array, + COLUMN_DEF_Array, SQL_DATA_TYPE_Array, SQL_DATETIME_SUB_Array, + CHAR_OCTET_LENGTH_Array, ORDINAL_POSITION_Array, IS_NULLABLE_Array}; + + const std::shared_ptr& schema = odbc_version_ == odbcabstraction::V_3 + ? GetColumns_V3_Schema() + : GetColumns_V2_Schema(); + return RecordBatch::Make(schema, num_rows_, arrays); +} + +Status GetColumns_RecordBatchBuilder::Append( + const GetColumns_RecordBatchBuilder::Data& data) { + ARROW_RETURN_NOT_OK(AppendToBuilder(TABLE_CAT_Builder_, data.table_cat)); + ARROW_RETURN_NOT_OK(AppendToBuilder(TABLE_SCHEM_Builder_, data.table_schem)); + ARROW_RETURN_NOT_OK(AppendToBuilder(TABLE_NAME_Builder_, data.table_name)); + ARROW_RETURN_NOT_OK(AppendToBuilder(COLUMN_NAME_Builder_, data.column_name)); + ARROW_RETURN_NOT_OK(AppendToBuilder(DATA_TYPE_Builder_, data.data_type)); + ARROW_RETURN_NOT_OK(AppendToBuilder(TYPE_NAME_Builder_, data.type_name)); + ARROW_RETURN_NOT_OK(AppendToBuilder(COLUMN_SIZE_Builder_, data.column_size)); + ARROW_RETURN_NOT_OK(AppendToBuilder(BUFFER_LENGTH_Builder_, data.buffer_length)); + ARROW_RETURN_NOT_OK(AppendToBuilder(DECIMAL_DIGITS_Builder_, data.decimal_digits)); + ARROW_RETURN_NOT_OK(AppendToBuilder(NUM_PREC_RADIX_Builder_, data.num_prec_radix)); + ARROW_RETURN_NOT_OK(AppendToBuilder(NULLABLE_Builder_, data.nullable)); + ARROW_RETURN_NOT_OK(AppendToBuilder(REMARKS_Builder_, data.remarks)); + ARROW_RETURN_NOT_OK(AppendToBuilder(COLUMN_DEF_Builder_, data.column_def)); + ARROW_RETURN_NOT_OK(AppendToBuilder(SQL_DATA_TYPE_Builder_, data.sql_data_type)); + ARROW_RETURN_NOT_OK(AppendToBuilder(SQL_DATETIME_SUB_Builder_, data.sql_datetime_sub)); + ARROW_RETURN_NOT_OK( + AppendToBuilder(CHAR_OCTET_LENGTH_Builder_, data.char_octet_length)); + ARROW_RETURN_NOT_OK(AppendToBuilder(ORDINAL_POSITION_Builder_, data.ordinal_position)); + ARROW_RETURN_NOT_OK(AppendToBuilder(IS_NULLABLE_Builder_, data.is_nullable)); + num_rows_++; + + return Status::OK(); +} + +GetColumns_Transformer::GetColumns_Transformer( 
+ const MetadataSettings& metadata_settings, + const odbcabstraction::OdbcVersion odbc_version, + const std::string* column_name_pattern) + : metadata_settings_(metadata_settings), + odbc_version_(odbc_version), + column_name_pattern_(column_name_pattern ? make_optional(*column_name_pattern) + : nullopt) {} + +std::shared_ptr GetColumns_Transformer::Transform( + const std::shared_ptr& original) { + const Result>& result = + TransformInner(odbc_version_, original, column_name_pattern_, metadata_settings_); + ThrowIfNotOK(result.status()); + + return result.ValueOrDie(); +} + +std::shared_ptr GetColumns_Transformer::GetTransformedSchema() { + return odbc_version_ == odbcabstraction::V_3 ? GetColumns_V3_Schema() + : GetColumns_V2_Schema(); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_columns.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_columns.h new file mode 100644 index 00000000000..5970bdf1243 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_columns.h @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/status.h" + +namespace driver { +namespace flight_sql { + +using arrow::Int16Builder; +using arrow::Int32Builder; +using arrow::Result; +using arrow::Status; +using arrow::StringBuilder; + +using odbcabstraction::MetadataSettings; +using std::optional; + +class GetColumns_RecordBatchBuilder { + private: + odbcabstraction::OdbcVersion odbc_version_; + + StringBuilder TABLE_CAT_Builder_; + StringBuilder TABLE_SCHEM_Builder_; + StringBuilder TABLE_NAME_Builder_; + StringBuilder COLUMN_NAME_Builder_; + Int16Builder DATA_TYPE_Builder_; + StringBuilder TYPE_NAME_Builder_; + Int32Builder COLUMN_SIZE_Builder_; + Int32Builder BUFFER_LENGTH_Builder_; + Int16Builder DECIMAL_DIGITS_Builder_; + Int16Builder NUM_PREC_RADIX_Builder_; + Int16Builder NULLABLE_Builder_; + StringBuilder REMARKS_Builder_; + StringBuilder COLUMN_DEF_Builder_; + Int16Builder SQL_DATA_TYPE_Builder_; + Int16Builder SQL_DATETIME_SUB_Builder_; + Int32Builder CHAR_OCTET_LENGTH_Builder_; + Int32Builder ORDINAL_POSITION_Builder_; + StringBuilder IS_NULLABLE_Builder_; + int64_t num_rows_{0}; + + public: + struct Data { + optional table_cat; + optional table_schem; + std::string table_name; + std::string column_name; + std::string type_name; + optional column_size; + optional buffer_length; + optional decimal_digits; + optional num_prec_radix; + optional remarks; + optional column_def; + int16_t sql_data_type{}; + optional sql_datetime_sub; + optional char_octet_length; + optional is_nullable; + int16_t data_type; + int16_t nullable; + int32_t ordinal_position; + }; + + explicit GetColumns_RecordBatchBuilder(odbcabstraction::OdbcVersion odbc_version); + + Result> Build(); + + Status Append(const Data& data); +}; + +class GetColumns_Transformer : 
public RecordBatchTransformer { + private: + const MetadataSettings& metadata_settings_; + odbcabstraction::OdbcVersion odbc_version_; + optional column_name_pattern_; + + public: + explicit GetColumns_Transformer(const MetadataSettings& metadata_settings, + odbcabstraction::OdbcVersion odbc_version, + const std::string* column_name_pattern); + + std::shared_ptr Transform( + const std::shared_ptr& original) override; + + std::shared_ptr GetTransformedSchema() override; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.cc new file mode 100644 index 00000000000..a3cdf9768d2 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.cc @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.h" +#include "arrow/flight/api.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.h" +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/types.h" + +namespace driver { +namespace flight_sql { + +using arrow::Result; +using arrow::flight::FlightClientOptions; +using arrow::flight::FlightInfo; +using arrow::flight::sql::FlightSqlClient; + +void ParseTableTypes(const std::string& table_type, + std::vector& table_types) { + bool encountered = false; // for checking if there is a single quote + std::string curr_parse; // the current string + + for (char temp : table_type) { // while still in the string + switch (temp) { // switch depending on the character + case '\'': // if the character is a single quote + if (encountered) { + encountered = false; // if we already found a single quote, reset encountered + } else { + encountered = + true; // if we haven't found a single quote, set encountered to true + } + break; + case ',': // if it is a comma + if (!encountered) { // if we have not found a single quote + table_types.push_back(curr_parse); // put our current string into our vector + curr_parse = ""; // reset the current string + break; + } + default: // if it is a normal character + if (encountered && isspace(temp)) { + curr_parse.push_back(temp); // if we have found a single quote put the + // whitespace, we don't care + } else if (temp == '\'' || temp == ' ') { + break; // if the current character is a single quote, trash it and go to + // the next character. 
+ } else { + curr_parse.push_back(temp); // if all of the above failed, put the + // character into the current string + } + break; // go to the next character + } + } + table_types.emplace_back( + curr_parse); // if we have found a single quote put the whitespace, + // we don't care +} + +std::shared_ptr GetTablesForSQLAllCatalogs( + const ColumnNames& names, FlightCallOptions& call_options, + FlightSqlClient& sql_client, odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings) { + Result> result = sql_client.GetCatalogs(call_options); + + std::shared_ptr schema; + std::shared_ptr flight_info; + + ThrowIfNotOK(result.status()); + flight_info = result.ValueOrDie(); + ThrowIfNotOK(flight_info->GetSchema(nullptr).Value(&schema)); + + auto transformer = RecordBatchTransformerWithTasksBuilder(schema) + .RenameField("catalog_name", names.catalog_column) + .AddFieldOfNulls(names.schema_column, arrow::utf8()) + .AddFieldOfNulls(names.table_column, arrow::utf8()) + .AddFieldOfNulls(names.table_type_column, arrow::utf8()) + .AddFieldOfNulls(names.remarks_column, arrow::utf8()) + .Build(); + + return std::make_shared( + sql_client, call_options, flight_info, transformer, diagnostics, metadata_settings); +} + +std::shared_ptr GetTablesForSQLAllDbSchemas( + const ColumnNames& names, FlightCallOptions& call_options, + FlightSqlClient& sql_client, const std::string* schema_name, + odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings) { + Result> result = + sql_client.GetDbSchemas(call_options, nullptr, schema_name); + + std::shared_ptr schema; + std::shared_ptr flight_info; + + ThrowIfNotOK(result.status()); + flight_info = result.ValueOrDie(); + ThrowIfNotOK(flight_info->GetSchema(nullptr).Value(&schema)); + + auto transformer = RecordBatchTransformerWithTasksBuilder(schema) + .AddFieldOfNulls(names.catalog_column, arrow::utf8()) + .RenameField("db_schema_name", 
names.schema_column) + .AddFieldOfNulls(names.table_column, arrow::utf8()) + .AddFieldOfNulls(names.table_type_column, arrow::utf8()) + .AddFieldOfNulls(names.remarks_column, arrow::utf8()) + .Build(); + + return std::make_shared( + sql_client, call_options, flight_info, transformer, diagnostics, metadata_settings); +} + +std::shared_ptr GetTablesForSQLAllTableTypes( + const ColumnNames& names, FlightCallOptions& call_options, + FlightSqlClient& sql_client, odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings) { + Result> result = sql_client.GetTableTypes(call_options); + + std::shared_ptr schema; + std::shared_ptr flight_info; + + ThrowIfNotOK(result.status()); + flight_info = result.ValueOrDie(); + ThrowIfNotOK(flight_info->GetSchema(nullptr).Value(&schema)); + + auto transformer = RecordBatchTransformerWithTasksBuilder(schema) + .AddFieldOfNulls(names.catalog_column, arrow::utf8()) + .AddFieldOfNulls(names.schema_column, arrow::utf8()) + .AddFieldOfNulls(names.table_column, arrow::utf8()) + .RenameField("table_type", names.table_type_column) + .AddFieldOfNulls(names.remarks_column, arrow::utf8()) + .Build(); + + return std::make_shared( + sql_client, call_options, flight_info, transformer, diagnostics, metadata_settings); +} + +std::shared_ptr GetTablesForGenericUse( + const ColumnNames& names, FlightCallOptions& call_options, + FlightSqlClient& sql_client, const std::string* catalog_name, + const std::string* schema_name, const std::string* table_name, + const std::vector& table_types, + odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings) { + Result> result = sql_client.GetTables( + call_options, catalog_name, schema_name, table_name, false, &table_types); + + std::shared_ptr schema; + std::shared_ptr flight_info; + + ThrowIfNotOK(result.status()); + flight_info = result.ValueOrDie(); + ThrowIfNotOK(flight_info->GetSchema(nullptr).Value(&schema)); + + auto 
transformer = RecordBatchTransformerWithTasksBuilder(schema) + .RenameField("catalog_name", names.catalog_column) + .RenameField("db_schema_name", names.schema_column) + .RenameField("table_name", names.table_column) + .RenameField("table_type", names.table_type_column) + .AddFieldOfNulls(names.remarks_column, arrow::utf8()) + .Build(); + + return std::make_shared( + sql_client, call_options, flight_info, transformer, diagnostics, metadata_settings); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.h new file mode 100644 index 00000000000..8f0dc5fef6d --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.h @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/flight/sql/client.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/flight/types.h" +#include "arrow/type.h" + +namespace driver { +namespace flight_sql { + +using arrow::flight::FlightCallOptions; +using arrow::flight::sql::FlightSqlClient; +using odbcabstraction::MetadataSettings; +using odbcabstraction::ResultSet; + +typedef struct { + std::string catalog_column; + std::string schema_column; + std::string table_column; + std::string table_type_column; + std::string remarks_column; +} ColumnNames; + +void ParseTableTypes(const std::string& table_type, + std::vector& table_types); + +std::shared_ptr GetTablesForSQLAllCatalogs( + const ColumnNames& column_names, FlightCallOptions& call_options, + FlightSqlClient& sql_client, odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings); + +std::shared_ptr GetTablesForSQLAllDbSchemas( + const ColumnNames& column_names, FlightCallOptions& call_options, + FlightSqlClient& sql_client, const std::string* schema_name, + odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings); + +std::shared_ptr GetTablesForSQLAllTableTypes( + const ColumnNames& column_names, FlightCallOptions& call_options, + FlightSqlClient& sql_client, odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings); + +std::shared_ptr GetTablesForGenericUse( + const ColumnNames& column_names, FlightCallOptions& call_options, + FlightSqlClient& sql_client, const std::string* catalog_name, + const std::string* schema_name, 
const std::string* table_name, + const std::vector& table_types, + odbcabstraction::Diagnostics& diagnostics, + const odbcabstraction::MetadataSettings& metadata_settings); +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_type_info.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_type_info.cc new file mode 100644 index 00000000000..eddba5a08c2 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_type_info.cc @@ -0,0 +1,233 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_type_info.h" +#include +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_get_type_info_reader.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +namespace driver { +namespace flight_sql { + +using std::make_optional; +using std::nullopt; +using std::optional; + +namespace { +std::shared_ptr GetTypeInfo_V3_Schema() { + return arrow::schema({ + field("TYPE_NAME", arrow::utf8(), false), + field("DATA_TYPE", arrow::int16(), false), + field("COLUMN_SIZE", arrow::int32()), + field("LITERAL_PREFIX", arrow::utf8()), + field("LITERAL_SUFFIX", arrow::utf8()), + field("CREATE_PARAMS", arrow::utf8()), + field("NULLABLE", arrow::int16(), false), + field("CASE_SENSITIVE", arrow::int16(), false), + field("SEARCHABLE", arrow::int16(), false), + field("UNSIGNED_ATTRIBUTE", arrow::int16()), + field("FIXED_PREC_SCALE", arrow::int16(), false), + field("AUTO_UNIQUE_VALUE", arrow::int16()), + field("LOCAL_TYPE_NAME", arrow::utf8()), + field("MINIMUM_SCALE", arrow::int16()), + field("MAXIMUM_SCALE", arrow::int16()), + field("SQL_DATA_TYPE", arrow::int16(), false), + field("SQL_DATETIME_SUB", arrow::int16()), + field("NUM_PREC_RADIX", arrow::int32()), + field("INTERVAL_PRECISION", arrow::int16()), + }); +} + +std::shared_ptr GetTypeInfo_V2_Schema() { + return arrow::schema({ + field("TYPE_NAME", arrow::utf8(), false), + field("DATA_TYPE", arrow::int16(), false), + field("PRECISION", arrow::int32()), + field("LITERAL_PREFIX", arrow::utf8()), + field("LITERAL_SUFFIX", arrow::utf8()), + field("CREATE_PARAMS", arrow::utf8()), + field("NULLABLE", arrow::int16(), false), + field("CASE_SENSITIVE", arrow::int16(), false), + field("SEARCHABLE", arrow::int16(), false), + field("UNSIGNED_ATTRIBUTE", arrow::int16()), + field("MONEY", arrow::int16(), false), + 
field("AUTO_INCREMENT", arrow::int16()), + field("LOCAL_TYPE_NAME", arrow::utf8()), + field("MINIMUM_SCALE", arrow::int16()), + field("MAXIMUM_SCALE", arrow::int16()), + field("SQL_DATA_TYPE", arrow::int16(), false), + field("SQL_DATETIME_SUB", arrow::int16()), + field("NUM_PREC_RADIX", arrow::int32()), + field("INTERVAL_PRECISION", arrow::int16()), + }); +} + +Result> TransformInner( + const odbcabstraction::OdbcVersion odbc_version, + const std::shared_ptr& original, int data_type, + const MetadataSettings& metadata_settings_) { + GetTypeInfoRecordBatchBuilder builder(odbc_version); + GetTypeInfoRecordBatchBuilder::Data data; + + GetTypeInfoReader reader(original); + + while (reader.Next()) { + auto data_type_v3 = EnsureRightSqlCharType( + static_cast(reader.GetDataType()), + metadata_settings_.use_wide_char); + int16_t data_type_v2 = ConvertSqlDataTypeFromV3ToV2(data_type_v3); + + if (data_type != odbcabstraction::ALL_TYPES && data_type_v3 != data_type && + data_type_v2 != data_type) { + continue; + } + + data.data_type = odbc_version == odbcabstraction::V_3 ? data_type_v3 : data_type_v2; + data.type_name = reader.GetTypeName(); + data.column_size = reader.GetColumnSize(); + data.literal_prefix = reader.GetLiteralPrefix(); + data.literal_suffix = reader.GetLiteralSuffix(); + + const auto& create_params = reader.GetCreateParams(); + if (create_params) { + data.create_params = boost::algorithm::join(*create_params, ","); + } else { + data.create_params = nullopt; + } + + data.nullable = reader.GetNullable() ? odbcabstraction::NULLABILITY_NULLABLE + : odbcabstraction::NULLABILITY_NO_NULLS; + data.case_sensitive = reader.GetCaseSensitive(); + data.searchable = reader.GetSearchable() ? 
odbcabstraction::SEARCHABILITY_ALL + : odbcabstraction::SEARCHABILITY_NONE; + data.unsigned_attribute = reader.GetUnsignedAttribute(); + data.fixed_prec_scale = reader.GetFixedPrecScale(); + data.auto_unique_value = reader.GetAutoIncrement(); + data.local_type_name = reader.GetLocalTypeName(); + data.minimum_scale = reader.GetMinimumScale(); + data.maximum_scale = reader.GetMaximumScale(); + data.sql_data_type = EnsureRightSqlCharType( + static_cast(reader.GetSqlDataType()), + metadata_settings_.use_wide_char); + data.sql_datetime_sub = + GetSqlDateTimeSubCode(static_cast(data.data_type)); + data.num_prec_radix = reader.GetNumPrecRadix(); + data.interval_precision = reader.GetIntervalPrecision(); + + ARROW_RETURN_NOT_OK(builder.Append(data)); + } + + return builder.Build(); +} +} // namespace + +GetTypeInfoRecordBatchBuilder::GetTypeInfoRecordBatchBuilder( + odbcabstraction::OdbcVersion odbc_version) + : odbc_version_(odbc_version) {} + +Result> GetTypeInfoRecordBatchBuilder::Build() { + ARROW_ASSIGN_OR_RAISE(auto TYPE_NAME_Array, TYPE_NAME_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto DATA_TYPE_Array, DATA_TYPE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto COLUMN_SIZE_Array, COLUMN_SIZE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto LITERAL_PREFIX_Array, LITERAL_PREFIX_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto LITERAL_SUFFIX_Array, LITERAL_SUFFIX_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto CREATE_PARAMS_Array, CREATE_PARAMS_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto NULLABLE_Array, NULLABLE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto CASE_SENSITIVE_Array, CASE_SENSITIVE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto SEARCHABLE_Array, SEARCHABLE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto UNSIGNED_ATTRIBUTE_Array, + UNSIGNED_ATTRIBUTE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto FIXED_PREC_SCALE_Array, FIXED_PREC_SCALE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto AUTO_UNIQUE_VALUE_Array, AUTO_UNIQUE_VALUE_Builder_.Finish()) + 
ARROW_ASSIGN_OR_RAISE(auto LOCAL_TYPE_NAME_Array, LOCAL_TYPE_NAME_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto MINIMUM_SCALE_Array, MINIMUM_SCALE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto MAXIMUM_SCALE_Array, MAXIMUM_SCALE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto SQL_DATA_TYPE_Array, SQL_DATA_TYPE_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto SQL_DATETIME_SUB_Array, SQL_DATETIME_SUB_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto NUM_PREC_RADIX_Array, NUM_PREC_RADIX_Builder_.Finish()) + ARROW_ASSIGN_OR_RAISE(auto INTERVAL_PRECISION_Array, + INTERVAL_PRECISION_Builder_.Finish()) + + std::vector> arrays = { + TYPE_NAME_Array, DATA_TYPE_Array, COLUMN_SIZE_Array, + LITERAL_PREFIX_Array, LITERAL_SUFFIX_Array, CREATE_PARAMS_Array, + NULLABLE_Array, CASE_SENSITIVE_Array, SEARCHABLE_Array, + UNSIGNED_ATTRIBUTE_Array, FIXED_PREC_SCALE_Array, AUTO_UNIQUE_VALUE_Array, + LOCAL_TYPE_NAME_Array, MINIMUM_SCALE_Array, MAXIMUM_SCALE_Array, + SQL_DATA_TYPE_Array, SQL_DATETIME_SUB_Array, NUM_PREC_RADIX_Array, + INTERVAL_PRECISION_Array}; + + const std::shared_ptr& schema = odbc_version_ == odbcabstraction::V_3 + ? 
GetTypeInfo_V3_Schema() + : GetTypeInfo_V2_Schema(); + return RecordBatch::Make(schema, num_rows_, arrays); +} + +Status GetTypeInfoRecordBatchBuilder::Append( + const GetTypeInfoRecordBatchBuilder::Data& data) { + ARROW_RETURN_NOT_OK(AppendToBuilder(TYPE_NAME_Builder_, data.type_name)); + ARROW_RETURN_NOT_OK(AppendToBuilder(DATA_TYPE_Builder_, data.data_type)); + ARROW_RETURN_NOT_OK(AppendToBuilder(COLUMN_SIZE_Builder_, data.column_size)); + ARROW_RETURN_NOT_OK(AppendToBuilder(LITERAL_PREFIX_Builder_, data.literal_prefix)); + ARROW_RETURN_NOT_OK(AppendToBuilder(LITERAL_SUFFIX_Builder_, data.literal_suffix)); + ARROW_RETURN_NOT_OK(AppendToBuilder(CREATE_PARAMS_Builder_, data.create_params)); + ARROW_RETURN_NOT_OK(AppendToBuilder(NULLABLE_Builder_, data.nullable)); + ARROW_RETURN_NOT_OK(AppendToBuilder(CASE_SENSITIVE_Builder_, data.case_sensitive)); + ARROW_RETURN_NOT_OK(AppendToBuilder(SEARCHABLE_Builder_, data.searchable)); + ARROW_RETURN_NOT_OK( + AppendToBuilder(UNSIGNED_ATTRIBUTE_Builder_, data.unsigned_attribute)); + ARROW_RETURN_NOT_OK(AppendToBuilder(FIXED_PREC_SCALE_Builder_, data.fixed_prec_scale)); + ARROW_RETURN_NOT_OK( + AppendToBuilder(AUTO_UNIQUE_VALUE_Builder_, data.auto_unique_value)); + ARROW_RETURN_NOT_OK(AppendToBuilder(LOCAL_TYPE_NAME_Builder_, data.local_type_name)); + ARROW_RETURN_NOT_OK(AppendToBuilder(MINIMUM_SCALE_Builder_, data.minimum_scale)); + ARROW_RETURN_NOT_OK(AppendToBuilder(MAXIMUM_SCALE_Builder_, data.maximum_scale)); + ARROW_RETURN_NOT_OK(AppendToBuilder(SQL_DATA_TYPE_Builder_, data.sql_data_type)); + ARROW_RETURN_NOT_OK(AppendToBuilder(SQL_DATETIME_SUB_Builder_, data.sql_datetime_sub)); + ARROW_RETURN_NOT_OK(AppendToBuilder(NUM_PREC_RADIX_Builder_, data.num_prec_radix)); + ARROW_RETURN_NOT_OK( + AppendToBuilder(INTERVAL_PRECISION_Builder_, data.interval_precision)); + num_rows_++; + + return Status::OK(); +} + +GetTypeInfoTransformer::GetTypeInfoTransformer( + const MetadataSettings& metadata_settings, + const 
odbcabstraction::OdbcVersion odbc_version, int data_type) + : metadata_settings_(metadata_settings), + odbc_version_(odbc_version), + data_type_(data_type) {} + +std::shared_ptr GetTypeInfoTransformer::Transform( + const std::shared_ptr& original) { + const Result>& result = + TransformInner(odbc_version_, original, data_type_, metadata_settings_); + ThrowIfNotOK(result.status()); + + return result.ValueOrDie(); +} + +std::shared_ptr GetTypeInfoTransformer::GetTransformedSchema() { + return odbc_version_ == odbcabstraction::V_3 ? GetTypeInfo_V3_Schema() + : GetTypeInfo_V2_Schema(); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_type_info.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_type_info.h new file mode 100644 index 00000000000..f212a659887 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_type_info.h @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" +#include "arrow/status.h" + +namespace driver { +namespace flight_sql { + +using arrow::Int16Builder; +using arrow::Int32Builder; +using arrow::Result; +using arrow::Status; +using arrow::StringBuilder; + +using odbcabstraction::MetadataSettings; +using std::optional; + +class GetTypeInfoRecordBatchBuilder { + private: + odbcabstraction::OdbcVersion odbc_version_; + + StringBuilder TYPE_NAME_Builder_; + Int16Builder DATA_TYPE_Builder_; + Int32Builder COLUMN_SIZE_Builder_; + StringBuilder LITERAL_PREFIX_Builder_; + StringBuilder LITERAL_SUFFIX_Builder_; + StringBuilder CREATE_PARAMS_Builder_; + Int16Builder NULLABLE_Builder_; + Int16Builder CASE_SENSITIVE_Builder_; + Int16Builder SEARCHABLE_Builder_; + Int16Builder UNSIGNED_ATTRIBUTE_Builder_; + Int16Builder FIXED_PREC_SCALE_Builder_; + Int16Builder AUTO_UNIQUE_VALUE_Builder_; + StringBuilder LOCAL_TYPE_NAME_Builder_; + Int16Builder MINIMUM_SCALE_Builder_; + Int16Builder MAXIMUM_SCALE_Builder_; + Int16Builder SQL_DATA_TYPE_Builder_; + Int16Builder SQL_DATETIME_SUB_Builder_; + Int32Builder NUM_PREC_RADIX_Builder_; + Int16Builder INTERVAL_PRECISION_Builder_; + int64_t num_rows_{0}; + + public: + struct Data { + std::string type_name; + int16_t data_type; + optional column_size; + optional literal_prefix; + optional literal_suffix; + optional create_params; + int16_t nullable; + int16_t case_sensitive; + int16_t searchable; + optional unsigned_attribute; + int16_t fixed_prec_scale; + optional auto_unique_value; + optional local_type_name; + optional minimum_scale; + optional maximum_scale; + int16_t sql_data_type; + optional sql_datetime_sub; + optional num_prec_radix; + optional interval_precision; + }; + + explicit 
GetTypeInfoRecordBatchBuilder(odbcabstraction::OdbcVersion odbc_version); + + Result> Build(); + + Status Append(const Data& data); +}; + +class GetTypeInfoTransformer : public RecordBatchTransformer { + private: + const MetadataSettings& metadata_settings_; + odbcabstraction::OdbcVersion odbc_version_; + int data_type_; + + public: + explicit GetTypeInfoTransformer(const MetadataSettings& metadata_settings, + odbcabstraction::OdbcVersion odbc_version, + int data_type); + + std::shared_ptr Transform( + const std::shared_ptr& original) override; + + std::shared_ptr GetTransformedSchema() override; +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.cc new file mode 100644 index 00000000000..b0c93db2ddc --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.cc @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" + +namespace driver { +namespace flight_sql { + +using arrow::flight::FlightEndpoint; + +FlightStreamChunkBuffer::FlightStreamChunkBuffer( + FlightSqlClient& flight_sql_client, + const arrow::flight::FlightCallOptions& call_options, + const std::shared_ptr& flight_info, size_t queue_capacity) + : queue_(queue_capacity) { + // FIXME: Endpoint iteration should consider endpoints may be at different hosts + for (const auto& endpoint : flight_info->endpoints()) { + const arrow::flight::Ticket& ticket = endpoint.ticket; + + auto result = flight_sql_client.DoGet(call_options, ticket); + ThrowIfNotOK(result.status()); + std::shared_ptr stream_reader_ptr(std::move(result.ValueOrDie())); + + BlockingQueue>::Supplier supplier = [=] { + auto result = stream_reader_ptr->Next(); + bool is_not_ok = !result.ok(); + bool is_not_empty = result.ok() && (result.ValueOrDie().data != nullptr); + + return boost::make_optional(is_not_ok || is_not_empty, std::move(result)); + }; + queue_.AddProducer(std::move(supplier)); + } +} + +bool FlightStreamChunkBuffer::GetNext(FlightStreamChunk* chunk) { + Result result; + if (!queue_.Pop(&result)) { + return false; + } + + if (!result.status().ok()) { + Close(); + throw odbcabstraction::DriverException(result.status().message()); + } + *chunk = std::move(result.ValueOrDie()); + return chunk->data != nullptr; +} + +void FlightStreamChunkBuffer::Close() { queue_.Close(); } + +FlightStreamChunkBuffer::~FlightStreamChunkBuffer() { Close(); } + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.h new file mode 100644 index 00000000000..4a84bcbede0 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.h @@ -0,0 +1,51 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace driver { +namespace flight_sql { + +using arrow::Result; +using arrow::flight::FlightInfo; +using arrow::flight::FlightStreamChunk; +using arrow::flight::FlightStreamReader; +using arrow::flight::sql::FlightSqlClient; +using driver::odbcabstraction::BlockingQueue; + +class FlightStreamChunkBuffer { + BlockingQueue> queue_; + + public: + FlightStreamChunkBuffer(FlightSqlClient& flight_sql_client, + const arrow::flight::FlightCallOptions& call_options, + const std::shared_ptr& flight_info, + size_t queue_capacity = 5); + + ~FlightStreamChunkBuffer(); + + void Close(); + + bool GetNext(FlightStreamChunk* chunk); +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/get_info_cache.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/get_info_cache.cc new file mode 100644 index 00000000000..d18322badbe --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/get_info_cache.cc @@ -0,0 +1,1345 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/get_info_cache.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include +#include "arrow/array.h" +#include "arrow/array/array_nested.h" +#include "arrow/flight/sql/api.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" +#include "arrow/scalar.h" +#include "arrow/type_fwd.h" + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_stream_chunk_buffer.h" +#include "arrow/flight/sql/odbc/flight_sql/scalar_function_reporter.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" + +// Aliases for entries in SqlInfoOptions::SqlInfo that are defined here +// due to causing compilation errors conflicting with ODBC definitions. 
+#define ARROW_SQL_IDENTIFIER_CASE 503 +#define ARROW_SQL_IDENTIFIER_QUOTE_CHAR 504 +#define ARROW_SQL_QUOTED_IDENTIFIER_CASE 505 +#define ARROW_SQL_KEYWORDS 508 +#define ARROW_SQL_NUMERIC_FUNCTIONS 509 +#define ARROW_SQL_STRING_FUNCTIONS 510 +#define ARROW_SQL_SYSTEM_FUNCTIONS 511 +#define ARROW_SQL_SCHEMA_TERM 529 +#define ARROW_SQL_PROCEDURE_TERM 530 +#define ARROW_SQL_CATALOG_TERM 531 +#define ARROW_SQL_MAX_COLUMNS_IN_GROUP_BY 544 +#define ARROW_SQL_MAX_COLUMNS_IN_INDEX 545 +#define ARROW_SQL_MAX_COLUMNS_IN_ORDER_BY 546 +#define ARROW_SQL_MAX_COLUMNS_IN_SELECT 547 +#define ARROW_SQL_MAX_COLUMNS_IN_TABLE 548 +#define ARROW_SQL_MAX_ROW_SIZE 555 +#define ARROW_SQL_MAX_TABLES_IN_SELECT 560 + +#define ARROW_CONVERT_BIGINT 0 +#define ARROW_CONVERT_BINARY 1 +#define ARROW_CONVERT_BIT 2 +#define ARROW_CONVERT_CHAR 3 +#define ARROW_CONVERT_DATE 4 +#define ARROW_CONVERT_DECIMAL 5 +#define ARROW_CONVERT_FLOAT 6 +#define ARROW_CONVERT_INTEGER 7 +#define ARROW_CONVERT_INTERVAL_DAY_TIME 8 +#define ARROW_CONVERT_INTERVAL_YEAR_MONTH 9 +#define ARROW_CONVERT_LONGVARBINARY 10 +#define ARROW_CONVERT_LONGVARCHAR 11 +#define ARROW_CONVERT_NUMERIC 12 +#define ARROW_CONVERT_REAL 13 +#define ARROW_CONVERT_SMALLINT 14 +#define ARROW_CONVERT_TIME 15 +#define ARROW_CONVERT_TIMESTAMP 16 +#define ARROW_CONVERT_TINYINT 17 +#define ARROW_CONVERT_VARBINARY 18 +#define ARROW_CONVERT_VARCHAR 19 + +namespace { +// Return the corresponding field in SQLGetInfo's SQL_CONVERT_* field +// types for the given Arrow SqlConvert enum value. +// +// The caller is responsible for casting the result to a uint16. Note +// that -1 is returned if there's no corresponding entry. 
+int32_t GetInfoTypeForArrowConvertEntry(int32_t convert_entry) { + switch (convert_entry) { + case ARROW_CONVERT_BIGINT: + return SQL_CONVERT_BIGINT; + case ARROW_CONVERT_BINARY: + return SQL_CONVERT_BINARY; + case ARROW_CONVERT_BIT: + return SQL_CONVERT_BIT; + case ARROW_CONVERT_CHAR: + return SQL_CONVERT_CHAR; + case ARROW_CONVERT_DATE: + return SQL_CONVERT_DATE; + case ARROW_CONVERT_DECIMAL: + return SQL_CONVERT_DECIMAL; + case ARROW_CONVERT_FLOAT: + return SQL_CONVERT_FLOAT; + case ARROW_CONVERT_INTEGER: + return SQL_CONVERT_INTEGER; + case ARROW_CONVERT_INTERVAL_DAY_TIME: + return SQL_CONVERT_INTERVAL_DAY_TIME; + case ARROW_CONVERT_INTERVAL_YEAR_MONTH: + return SQL_CONVERT_INTERVAL_YEAR_MONTH; + case ARROW_CONVERT_LONGVARBINARY: + return SQL_CONVERT_LONGVARBINARY; + case ARROW_CONVERT_LONGVARCHAR: + return SQL_CONVERT_LONGVARCHAR; + case ARROW_CONVERT_NUMERIC: + return SQL_CONVERT_NUMERIC; + case ARROW_CONVERT_REAL: + return SQL_CONVERT_REAL; + case ARROW_CONVERT_SMALLINT: + return SQL_CONVERT_SMALLINT; + case ARROW_CONVERT_TIME: + return SQL_CONVERT_TIME; + case ARROW_CONVERT_TIMESTAMP: + return SQL_CONVERT_TIMESTAMP; + case ARROW_CONVERT_TINYINT: + return SQL_CONVERT_TINYINT; + case ARROW_CONVERT_VARBINARY: + return SQL_CONVERT_VARBINARY; + case ARROW_CONVERT_VARCHAR: + return SQL_CONVERT_VARCHAR; + } + // Arbitrarily return a negative value + return -1; +} + +// Return the corresponding bitmask to OR in SQLGetInfo's SQL_CONVERT_* field +// value for the given Arrow SqlConvert enum value. +// +// This is _not_ a bit position, it is an integer with only a single bit set. 
+uint32_t GetCvtBitForArrowConvertEntry(int32_t convert_entry) { + switch (convert_entry) { + case ARROW_CONVERT_BIGINT: + return SQL_CVT_BIGINT; + case ARROW_CONVERT_BINARY: + return SQL_CVT_BINARY; + case ARROW_CONVERT_BIT: + return SQL_CVT_BIT; + case ARROW_CONVERT_CHAR: + return SQL_CVT_CHAR | SQL_CVT_WCHAR; + case ARROW_CONVERT_DATE: + return SQL_CVT_DATE; + case ARROW_CONVERT_DECIMAL: + return SQL_CVT_DECIMAL; + case ARROW_CONVERT_FLOAT: + return SQL_CVT_FLOAT; + case ARROW_CONVERT_INTEGER: + return SQL_CVT_INTEGER; + case ARROW_CONVERT_INTERVAL_DAY_TIME: + return SQL_CVT_INTERVAL_DAY_TIME; + case ARROW_CONVERT_INTERVAL_YEAR_MONTH: + return SQL_CVT_INTERVAL_YEAR_MONTH; + case ARROW_CONVERT_LONGVARBINARY: + return SQL_CVT_LONGVARBINARY; + case ARROW_CONVERT_LONGVARCHAR: + return SQL_CVT_LONGVARCHAR | SQL_CVT_WLONGVARCHAR; + case ARROW_CONVERT_NUMERIC: + return SQL_CVT_NUMERIC; + case ARROW_CONVERT_REAL: + return SQL_CVT_REAL; + case ARROW_CONVERT_SMALLINT: + return SQL_CVT_SMALLINT; + case ARROW_CONVERT_TIME: + return SQL_CVT_TIME; + case ARROW_CONVERT_TIMESTAMP: + return SQL_CVT_TIMESTAMP; + case ARROW_CONVERT_TINYINT: + return SQL_CVT_TINYINT; + case ARROW_CONVERT_VARBINARY: + return SQL_CVT_VARBINARY; + case ARROW_CONVERT_VARCHAR: + return SQL_CVT_VARCHAR | SQL_CVT_WLONGVARCHAR; + } + // Note: GUID not supported by GetSqlInfo. + // Return zero, which has no bits set. + return 0; +} + +inline int32_t ScalarToInt32(arrow::UnionScalar* scalar) { + return reinterpret_cast(scalar->child_value().get())->value; +} + +inline int64_t ScalarToInt64(arrow::UnionScalar* scalar) { + return reinterpret_cast(scalar->child_value().get())->value; +} + +inline std::string ScalarToBoolString(arrow::UnionScalar* scalar) { + return reinterpret_cast(scalar->child_value().get())->value + ? 
"Y" + : "N"; +} + +inline void SetDefaultIfMissing( + std::unordered_map& cache, + uint16_t info_type, driver::odbcabstraction::Connection::Info default_value) { + // Note: emplace() only writes if the key isn't found. + cache.emplace(info_type, std::move(default_value)); +} + +} // namespace + +namespace driver { +namespace flight_sql { +using arrow::flight::FlightCallOptions; +using arrow::flight::sql::FlightSqlClient; +using arrow::flight::sql::SqlInfoOptions; +using driver::odbcabstraction::Connection; +using driver::odbcabstraction::DriverException; + +GetInfoCache::GetInfoCache(FlightCallOptions& call_options, + std::unique_ptr& client, + const std::string& driver_version) + : call_options_(call_options), sql_client_(client), has_server_info_(false) { + info_[SQL_DRIVER_NAME] = "Arrow Flight ODBC Driver"; + info_[SQL_DRIVER_VER] = ConvertToDBMSVer(driver_version); + + info_[SQL_GETDATA_EXTENSIONS] = + static_cast(SQL_GD_ANY_COLUMN | SQL_GD_ANY_ORDER); + info_[SQL_CURSOR_SENSITIVITY] = static_cast(SQL_UNSPECIFIED); + + // Properties which don't currently have SqlGetInfo fields but probably + // should. 
+ info_[SQL_ACCESSIBLE_PROCEDURES] = "N"; + info_[SQL_COLLATION_SEQ] = ""; + info_[SQL_ALTER_DOMAIN] = static_cast(0); + info_[SQL_ALTER_TABLE] = static_cast(0); + info_[SQL_COLUMN_ALIAS] = "Y"; + info_[SQL_DATETIME_LITERALS] = static_cast( + SQL_DL_SQL92_DATE | SQL_DL_SQL92_TIME | SQL_DL_SQL92_TIMESTAMP); + info_[SQL_CREATE_ASSERTION] = static_cast(0); + info_[SQL_CREATE_CHARACTER_SET] = static_cast(0); + info_[SQL_CREATE_COLLATION] = static_cast(0); + info_[SQL_CREATE_DOMAIN] = static_cast(0); + info_[SQL_INDEX_KEYWORDS] = static_cast(SQL_IK_NONE); + info_[SQL_TIMEDATE_ADD_INTERVALS] = static_cast( + SQL_FN_TSI_FRAC_SECOND | SQL_FN_TSI_SECOND | SQL_FN_TSI_MINUTE | SQL_FN_TSI_HOUR | + SQL_FN_TSI_DAY | SQL_FN_TSI_WEEK | SQL_FN_TSI_MONTH | SQL_FN_TSI_QUARTER | + SQL_FN_TSI_YEAR); + info_[SQL_TIMEDATE_DIFF_INTERVALS] = static_cast( + SQL_FN_TSI_FRAC_SECOND | SQL_FN_TSI_SECOND | SQL_FN_TSI_MINUTE | SQL_FN_TSI_HOUR | + SQL_FN_TSI_DAY | SQL_FN_TSI_WEEK | SQL_FN_TSI_MONTH | SQL_FN_TSI_QUARTER | + SQL_FN_TSI_YEAR); + info_[SQL_CURSOR_COMMIT_BEHAVIOR] = static_cast(SQL_CB_CLOSE); + info_[SQL_CURSOR_ROLLBACK_BEHAVIOR] = static_cast(SQL_CB_CLOSE); + info_[SQL_CREATE_TRANSLATION] = static_cast(0); + info_[SQL_DDL_INDEX] = static_cast(0); + info_[SQL_DROP_ASSERTION] = static_cast(0); + info_[SQL_DROP_CHARACTER_SET] = static_cast(0); + info_[SQL_DROP_COLLATION] = static_cast(0); + info_[SQL_DROP_DOMAIN] = static_cast(0); + info_[SQL_DROP_SCHEMA] = static_cast(0); + info_[SQL_DROP_TABLE] = static_cast(0); + info_[SQL_DROP_TRANSLATION] = static_cast(0); + info_[SQL_DROP_VIEW] = static_cast(0); + info_[SQL_MAX_IDENTIFIER_LEN] = static_cast(65535); // arbitrary + + // Assume all aggregate functions reported in ODBC are supported. + info_[SQL_AGGREGATE_FUNCTIONS] = + static_cast(SQL_AF_ALL | SQL_AF_AVG | SQL_AF_COUNT | SQL_AF_DISTINCT | + SQL_AF_MAX | SQL_AF_MIN | SQL_AF_SUM); + + // Assume catalogs are not supported by default. 
ODBC checks if SQL_CATALOG_NAME is + // "Y" or "N" to determine if catalogs are supported. + info_[SQL_CATALOG_TERM] = ""; + info_[SQL_CATALOG_NAME] = "N"; + info_[SQL_CATALOG_NAME_SEPARATOR] = ""; + info_[SQL_CATALOG_LOCATION] = static_cast(0); +} + +void GetInfoCache::SetProperty(uint16_t property, + driver::odbcabstraction::Connection::Info value) { + info_[property] = value; +} + +Connection::Info GetInfoCache::GetInfo(uint16_t info_type) { + auto it = info_.find(info_type); + + if (info_.end() == it) { + if (LoadInfoFromServer()) { + it = info_.find(info_type); + } + if (info_.end() == it) { + throw DriverException("Unknown GetInfo type: " + std::to_string(info_type)); + } + } + return it->second; +} + +bool GetInfoCache::LoadInfoFromServer() { + if (sql_client_ && !has_server_info_.exchange(true)) { + std::unique_lock lock(mutex_); + arrow::Result> result = + sql_client_->GetSqlInfo(call_options_, {}); + ThrowIfNotOK(result.status()); + FlightStreamChunkBuffer chunk_iter(*sql_client_, call_options_, result.ValueOrDie()); + + FlightStreamChunk chunk; + bool supports_correlation_name = false; + bool requires_different_correlation_name = false; + bool transactions_supported = false; + bool transaction_ddl_commit = false; + bool transaction_ddl_ignore = false; + while (chunk_iter.GetNext(&chunk)) { + auto name_array = chunk.data->GetColumnByName("info_name"); + auto value_array = chunk.data->GetColumnByName("value"); + + arrow::UInt32Array* info_type_array = + static_cast(name_array.get()); + arrow::UnionArray* value_union_array = + static_cast(value_array.get()); + for (int64_t i = 0; i < chunk.data->num_rows(); ++i) { + if (!value_array->IsNull(i)) { + auto info_type = static_cast( + info_type_array->Value(i)); + auto result_scalar = value_union_array->GetScalar(i); + ThrowIfNotOK(result_scalar.status()); + std::shared_ptr scalar_ptr = result_scalar.ValueOrDie(); + arrow::UnionScalar* scalar = + reinterpret_cast(scalar_ptr.get()); + switch (info_type) { + // 
String properties + case SqlInfoOptions::FLIGHT_SQL_SERVER_NAME: { + std::string server_name( + reinterpret_cast(scalar->child_value().get()) + ->view()); + + // TODO: Consider creating different properties in GetSqlInfo. + // TODO: Investigate if SQL_SERVER_NAME should just be the host + // address as well. In JDBC, FLIGHT_SQL_SERVER_NAME is only used for + // the DatabaseProductName. + info_[SQL_SERVER_NAME] = server_name; + info_[SQL_DBMS_NAME] = server_name; + info_[SQL_DATABASE_NAME] = + server_name; // This is usually the current catalog. May need to + // throw HYC00 instead. + break; + } + case SqlInfoOptions::FLIGHT_SQL_SERVER_VERSION: { + info_[SQL_DBMS_VER] = ConvertToDBMSVer(std::string( + reinterpret_cast(scalar->child_value().get()) + ->view())); + break; + } + case SqlInfoOptions::FLIGHT_SQL_SERVER_ARROW_VERSION: { + // Unused. + break; + } + case SqlInfoOptions::SQL_SEARCH_STRING_ESCAPE: { + info_[SQL_SEARCH_PATTERN_ESCAPE] = std::string( + reinterpret_cast(scalar->child_value().get()) + ->view()); + break; + } + case ARROW_SQL_IDENTIFIER_QUOTE_CHAR: { + info_[SQL_IDENTIFIER_QUOTE_CHAR] = std::string( + reinterpret_cast(scalar->child_value().get()) + ->view()); + break; + } + case SqlInfoOptions::SQL_EXTRA_NAME_CHARACTERS: { + info_[SQL_SPECIAL_CHARACTERS] = std::string( + reinterpret_cast(scalar->child_value().get()) + ->view()); + break; + } + case ARROW_SQL_SCHEMA_TERM: { + info_[SQL_SCHEMA_TERM] = std::string( + reinterpret_cast(scalar->child_value().get()) + ->view()); + break; + } + case ARROW_SQL_PROCEDURE_TERM: { + info_[SQL_PROCEDURE_TERM] = std::string( + reinterpret_cast(scalar->child_value().get()) + ->view()); + break; + } + case ARROW_SQL_CATALOG_TERM: { + std::string catalog_term(std::string( + reinterpret_cast(scalar->child_value().get()) + ->view())); + if (catalog_term.empty()) { + info_[SQL_CATALOG_NAME] = "N"; + info_[SQL_CATALOG_NAME_SEPARATOR] = ""; + info_[SQL_CATALOG_LOCATION] = static_cast(0); + } else { + 
info_[SQL_CATALOG_NAME] = "Y"; + info_[SQL_CATALOG_NAME_SEPARATOR] = "."; + info_[SQL_CATALOG_LOCATION] = static_cast(SQL_CL_START); + } + info_[SQL_CATALOG_TERM] = std::string( + reinterpret_cast(scalar->child_value().get()) + ->view()); + + break; + } + + // Bool properties + case SqlInfoOptions::FLIGHT_SQL_SERVER_READ_ONLY: { + info_[SQL_DATA_SOURCE_READ_ONLY] = ScalarToBoolString(scalar); + + // Assume all forms of insert are supported, however this should + // come from a property. + info_[SQL_INSERT_STATEMENT] = static_cast( + SQL_IS_INSERT_LITERALS | SQL_IS_INSERT_SEARCHED | SQL_IS_SELECT_INTO); + break; + } + case SqlInfoOptions::SQL_DDL_CATALOG: + // Unused by ODBC. + break; + case SqlInfoOptions::SQL_DDL_SCHEMA: { + bool supports_schema_ddl = + reinterpret_cast(scalar->child_value().get()) + ->value; + // Note: this is a bitmask and we can't describe cascade or restrict + // flags. + info_[SQL_DROP_SCHEMA] = static_cast(SQL_DS_DROP_SCHEMA); + + // Note: this is a bitmask and we can't describe authorization or + // collation + info_[SQL_CREATE_SCHEMA] = static_cast(SQL_CS_CREATE_SCHEMA); + break; + } + case SqlInfoOptions::SQL_DDL_TABLE: { + bool supports_table_ddl = + reinterpret_cast(scalar->child_value().get()) + ->value; + // This is a bitmask and we cannot describe all clauses. + info_[SQL_CREATE_TABLE] = static_cast(SQL_CT_CREATE_TABLE); + info_[SQL_DROP_TABLE] = static_cast(SQL_DT_DROP_TABLE); + break; + } + case SqlInfoOptions::SQL_ALL_TABLES_ARE_SELECTABLE: { + info_[SQL_ACCESSIBLE_TABLES] = ScalarToBoolString(scalar); + break; + } + case SqlInfoOptions::SQL_SUPPORTS_COLUMN_ALIASING: { + info_[SQL_COLUMN_ALIAS] = ScalarToBoolString(scalar); + break; + } + case SqlInfoOptions::SQL_NULL_PLUS_NULL_IS_NULL: { + info_[SQL_CONCAT_NULL_BEHAVIOR] = static_cast( + reinterpret_cast(scalar->child_value().get()) + ->value + ? 
SQL_CB_NULL + : SQL_CB_NON_NULL); + break; + } + case SqlInfoOptions::SQL_SUPPORTS_TABLE_CORRELATION_NAMES: { + // Simply cache SQL_SUPPORTS_TABLE_CORRELATION_NAMES and + // SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES since we need both + // properties to determine the value for SQL_CORRELATION_NAME. + supports_correlation_name = + reinterpret_cast(scalar->child_value().get()) + ->value; + break; + } + case SqlInfoOptions::SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES: { + // Simply cache SQL_SUPPORTS_TABLE_CORRELATION_NAMES and + // SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES since we need both + // properties to determine the value for SQL_CORRELATION_NAME. + requires_different_correlation_name = + reinterpret_cast(scalar->child_value().get()) + ->value; + break; + } + case SqlInfoOptions::SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY: { + info_[SQL_EXPRESSIONS_IN_ORDERBY] = ScalarToBoolString(scalar); + break; + } + case SqlInfoOptions::SQL_SUPPORTS_ORDER_BY_UNRELATED: { + // Note: this is the negation of the Flight SQL property. + info_[SQL_ORDER_BY_COLUMNS_IN_SELECT] = + reinterpret_cast(scalar->child_value().get()) + ->value + ? "N" + : "Y"; + break; + } + case SqlInfoOptions::SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE: { + info_[SQL_LIKE_ESCAPE_CLAUSE] = ScalarToBoolString(scalar); + break; + } + case SqlInfoOptions::SQL_SUPPORTS_NON_NULLABLE_COLUMNS: { + info_[SQL_NON_NULLABLE_COLUMNS] = static_cast( + reinterpret_cast(scalar->child_value().get()) + ->value + ? SQL_NNC_NON_NULL + : SQL_NNC_NULL); + break; + } + case SqlInfoOptions::SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY: { + info_[SQL_INTEGRITY] = ScalarToBoolString(scalar); + break; + } + case SqlInfoOptions::SQL_CATALOG_AT_START: { + info_[SQL_CATALOG_LOCATION] = static_cast( + reinterpret_cast(scalar->child_value().get()) + ->value + ? SQL_CL_START + : SQL_CL_END); + break; + } + case SqlInfoOptions::SQL_SELECT_FOR_UPDATE_SUPPORTED: + // Not used. 
+ break; + case SqlInfoOptions::SQL_STORED_PROCEDURES_SUPPORTED: { + info_[SQL_PROCEDURES] = ScalarToBoolString(scalar); + break; + } + case SqlInfoOptions::SQL_MAX_ROW_SIZE_INCLUDES_BLOBS: { + info_[SQL_MAX_ROW_SIZE_INCLUDES_LONG] = ScalarToBoolString(scalar); + break; + } + case SqlInfoOptions::SQL_TRANSACTIONS_SUPPORTED: { + transactions_supported = + reinterpret_cast(scalar->child_value().get()) + ->value; + break; + } + case SqlInfoOptions::SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT: { + transaction_ddl_commit = + reinterpret_cast(scalar->child_value().get()) + ->value; + break; + } + case SqlInfoOptions::SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED: { + transaction_ddl_ignore = + reinterpret_cast(scalar->child_value().get()) + ->value; + break; + } + case SqlInfoOptions::SQL_BATCH_UPDATES_SUPPORTED: { + info_[SQL_BATCH_SUPPORT] = static_cast( + reinterpret_cast(scalar->child_value().get()) + ->value + ? SQL_BS_ROW_COUNT_EXPLICIT + : 0); + break; + } + case SqlInfoOptions::SQL_SAVEPOINTS_SUPPORTED: + // Not used. + break; + case SqlInfoOptions::SQL_NAMED_PARAMETERS_SUPPORTED: + // Not used. + break; + case SqlInfoOptions::SQL_LOCATORS_UPDATE_COPY: + // Not used. + break; + case SqlInfoOptions::SQL_STORED_FUNCTIONS_USING_CALL_SYNTAX_SUPPORTED: + // Not used. + break; + case SqlInfoOptions::SQL_CORRELATED_SUBQUERIES_SUPPORTED: + // Not used. This is implied by SQL_SUPPORTED_SUBQUERIES. + break; + + // Int64 properties + case ARROW_SQL_IDENTIFIER_CASE: { + // Missing from C++ enum. constant from Java. 
+ constexpr int64_t LOWER = 3; + uint16_t value = 0; + int64_t sensitivity = ScalarToInt64(scalar); + switch (sensitivity) { + case SqlInfoOptions::SQL_CASE_SENSITIVITY_UNKNOWN: + value = SQL_IC_SENSITIVE; + break; + case SqlInfoOptions::SQL_CASE_SENSITIVITY_CASE_INSENSITIVE: + value = SQL_IC_MIXED; + break; + case SqlInfoOptions::SQL_CASE_SENSITIVITY_UPPERCASE: + value = SQL_IC_UPPER; + break; + case LOWER: + value = SQL_IC_LOWER; + break; + default: + value = SQL_IC_SENSITIVE; + break; + } + info_[SQL_IDENTIFIER_CASE] = value; + break; + } + case SqlInfoOptions::SQL_NULL_ORDERING: { + uint16_t value = 0; + int64_t scalar_value = ScalarToInt64(scalar); + switch (scalar_value) { + case SqlInfoOptions::SQL_NULLS_SORTED_AT_START: + value = SQL_NC_START; + break; + case SqlInfoOptions::SQL_NULLS_SORTED_AT_END: + value = SQL_NC_END; + break; + case SqlInfoOptions::SQL_NULLS_SORTED_HIGH: + value = SQL_NC_HIGH; + break; + case SqlInfoOptions::SQL_NULLS_SORTED_LOW: + default: + value = SQL_NC_LOW; + break; + } + info_[SQL_NULL_COLLATION] = value; + break; + } + case ARROW_SQL_QUOTED_IDENTIFIER_CASE: { + // Missing from C++ enum. constant from Java. 
+ constexpr int64_t LOWER = 3; + uint16_t value = 0; + int64_t sensitivity = ScalarToInt64(scalar); + switch (sensitivity) { + case SqlInfoOptions::SQL_CASE_SENSITIVITY_UNKNOWN: + value = SQL_IC_SENSITIVE; + break; + case SqlInfoOptions::SQL_CASE_SENSITIVITY_CASE_INSENSITIVE: + value = SQL_IC_MIXED; + break; + case SqlInfoOptions::SQL_CASE_SENSITIVITY_UPPERCASE: + value = SQL_IC_UPPER; + break; + case LOWER: + value = SQL_IC_LOWER; + break; + default: + value = SQL_IC_SENSITIVE; + break; + } + info_[SQL_QUOTED_IDENTIFIER_CASE] = value; + break; + } + case SqlInfoOptions::SQL_MAX_BINARY_LITERAL_LENGTH: { + info_[SQL_MAX_BINARY_LITERAL_LEN] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_CHAR_LITERAL_LENGTH: { + info_[SQL_MAX_CHAR_LITERAL_LEN] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_COLUMN_NAME_LENGTH: { + info_[SQL_MAX_COLUMN_NAME_LEN] = + static_cast(ScalarToInt64(scalar)); + break; + } + case ARROW_SQL_MAX_COLUMNS_IN_GROUP_BY: { + info_[SQL_MAX_COLUMNS_IN_GROUP_BY] = + static_cast(ScalarToInt64(scalar)); + break; + } + case ARROW_SQL_MAX_COLUMNS_IN_INDEX: { + info_[SQL_MAX_COLUMNS_IN_INDEX] = + static_cast(ScalarToInt64(scalar)); + break; + } + case ARROW_SQL_MAX_COLUMNS_IN_ORDER_BY: { + info_[SQL_MAX_COLUMNS_IN_ORDER_BY] = + static_cast(ScalarToInt64(scalar)); + break; + } + case ARROW_SQL_MAX_COLUMNS_IN_SELECT: { + info_[SQL_MAX_COLUMNS_IN_SELECT] = + static_cast(ScalarToInt64(scalar)); + break; + } + case ARROW_SQL_MAX_COLUMNS_IN_TABLE: { + info_[SQL_MAX_COLUMNS_IN_TABLE] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_CONNECTIONS: { + info_[SQL_MAX_DRIVER_CONNECTIONS] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_CURSOR_NAME_LENGTH: { + info_[SQL_MAX_CURSOR_NAME_LEN] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_INDEX_LENGTH: { + info_[SQL_MAX_INDEX_SIZE] = 
static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_SCHEMA_NAME_LENGTH: { + info_[SQL_MAX_SCHEMA_NAME_LEN] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_PROCEDURE_NAME_LENGTH: { + info_[SQL_MAX_PROCEDURE_NAME_LEN] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_CATALOG_NAME_LENGTH: { + info_[SQL_MAX_CATALOG_NAME_LEN] = + static_cast(ScalarToInt64(scalar)); + break; + } + case ARROW_SQL_MAX_ROW_SIZE: { + info_[SQL_MAX_ROW_SIZE] = static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_STATEMENT_LENGTH: { + info_[SQL_MAX_STATEMENT_LEN] = static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_STATEMENTS: { + info_[SQL_MAX_CONCURRENT_ACTIVITIES] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_TABLE_NAME_LENGTH: { + info_[SQL_MAX_TABLE_NAME_LEN] = + static_cast(ScalarToInt64(scalar)); + break; + } + case ARROW_SQL_MAX_TABLES_IN_SELECT: { + info_[SQL_MAX_TABLES_IN_SELECT] = + static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_MAX_USERNAME_LENGTH: { + info_[SQL_MAX_USER_NAME_LEN] = static_cast(ScalarToInt64(scalar)); + break; + } + case SqlInfoOptions::SQL_DEFAULT_TRANSACTION_ISOLATION: { + constexpr int32_t NONE = 0; + constexpr int32_t READ_UNCOMMITTED = 1; + constexpr int32_t READ_COMMITTED = 2; + constexpr int32_t REPEATABLE_READ = 3; + constexpr int32_t SERIALIZABLE = 4; + int64_t scalar_value = static_cast(ScalarToInt64(scalar)); + uint32_t result_val = 0; + if ((scalar_value & (1 << READ_UNCOMMITTED)) != 0) { + result_val = SQL_TXN_READ_UNCOMMITTED; + } else if ((scalar_value & (1 << READ_COMMITTED)) != 0) { + result_val = SQL_TXN_READ_COMMITTED; + } else if ((scalar_value & (1 << REPEATABLE_READ)) != 0) { + result_val = SQL_TXN_REPEATABLE_READ; + } else if ((scalar_value & (1 << SERIALIZABLE)) != 0) { + result_val = SQL_TXN_SERIALIZABLE; + } + 
info_[SQL_DEFAULT_TXN_ISOLATION] = result_val; + break; + } + + // Int32 properties + case SqlInfoOptions::SQL_SUPPORTED_GROUP_BY: { + // Note: SqlGroupBy enum is missing in C++. Using Java values. + constexpr int32_t UNRELATED = 0; + constexpr int32_t BEYOND_SELECT = 1; + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + uint16_t result_val = SQL_GB_NOT_SUPPORTED; + if ((scalar_value & (1 << UNRELATED)) != 0) { + result_val = SQL_GB_NO_RELATION; + } else if ((scalar_value & (1 << BEYOND_SELECT)) != 0) { + result_val = SQL_GB_GROUP_BY_CONTAINS_SELECT; + } + // Note GROUP_BY_EQUALS_SELECT and COLLATE cannot be described. + info_[SQL_GROUP_BY] = result_val; + break; + } + case SqlInfoOptions::SQL_SUPPORTED_GRAMMAR: { + // Note: SupportedSqlGrammar enum is missing in C++. Using Java + // values. + constexpr int32_t MINIMUM = 0; + constexpr int32_t CORE = 1; + constexpr int32_t EXTENDED = 2; + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + uint32_t result_val = SQL_OIC_CORE; + if ((scalar_value & (1 << MINIMUM)) != 0) { + result_val = SQL_OIC_CORE; + } else if ((scalar_value & (1 << CORE)) != 0) { + result_val = SQL_OIC_LEVEL1; + } else if ((scalar_value & (1 << EXTENDED)) != 0) { + result_val = SQL_OIC_LEVEL2; + } + info_[SQL_ODBC_API_CONFORMANCE] = result_val; + break; + } + case SqlInfoOptions::SQL_ANSI92_SUPPORTED_LEVEL: { + // Note: SupportedAnsi92SqlGrammarLevel enum is missing in C++. + // Using Java values. 
+ constexpr int32_t ENTRY = 0; + constexpr int32_t INTERMEDIATE = 1; + constexpr int32_t FULL = 2; + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + uint32_t result_val = SQL_SC_SQL92_ENTRY; + uint16_t odbc_sql_conformance = SQL_OSC_MINIMUM; + if ((scalar_value & (1 << ENTRY)) != 0) { + result_val = SQL_SC_SQL92_ENTRY; + } else if ((scalar_value & (1 << INTERMEDIATE)) != 0) { + result_val = SQL_SC_SQL92_INTERMEDIATE; + odbc_sql_conformance = SQL_OSC_CORE; + } else if ((scalar_value & (1 << FULL)) != 0) { + result_val = SQL_SC_SQL92_FULL; + odbc_sql_conformance = SQL_OSC_EXTENDED; + } + info_[SQL_SQL_CONFORMANCE] = result_val; + info_[SQL_ODBC_SQL_CONFORMANCE] = odbc_sql_conformance; + break; + } + case SqlInfoOptions::SQL_OUTER_JOINS_SUPPORT_LEVEL: { + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + + // If limited outer joins is supported, we can't tell which joins + // are supported so just report none. If full outer joins is + // supported, nested joins are supported and full outer joins are + // supported, so all joins + nested are supported. + constexpr int32_t UNSUPPORTED = 0; + constexpr int32_t LIMITED = 1; + constexpr int32_t FULL = 2; + uint32_t result_val = 0; + // Assume inner and cross joins are supported. Flight SQL can't + // report this currently. + uint32_t relational_operators = SQL_SRJO_CROSS_JOIN | SQL_SRJO_INNER_JOIN; + if ((scalar_value & (1 << FULL)) != 0) { + result_val = SQL_OJ_LEFT | SQL_OJ_RIGHT | SQL_OJ_FULL | SQL_OJ_NESTED; + relational_operators |= SQL_SRJO_FULL_OUTER_JOIN | + SQL_SRJO_LEFT_OUTER_JOIN | + SQL_SRJO_RIGHT_OUTER_JOIN; + } else if ((scalar_value & (1 << LIMITED)) != 0) { + result_val = SQL_SC_SQL92_INTERMEDIATE; + } else if ((scalar_value & (1 << UNSUPPORTED)) != 0) { + result_val = 0; + } + info_[SQL_OJ_CAPABILITIES] = result_val; + info_[SQL_OUTER_JOINS] = result_val != 0 ? 
"Y" : "N"; + info_[SQL_SQL92_RELATIONAL_JOIN_OPERATORS] = relational_operators; + break; + } + case SqlInfoOptions::SQL_SCHEMAS_SUPPORTED_ACTIONS: { + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + + // Missing SqlSupportedElementActions enum in C++. Values taken from + // java. + constexpr int32_t PROCEDURE = 0; + constexpr int32_t INDEX = 1; + constexpr int32_t PRIVILEGE = 2; + // Assume schemas are supported in DML and Table manipulation. + uint32_t result_val = SQL_SU_DML_STATEMENTS | SQL_SU_TABLE_DEFINITION; + if ((scalar_value & (1 << PROCEDURE)) != 0) { + result_val |= SQL_SU_PROCEDURE_INVOCATION; + } + if ((scalar_value & (1 << INDEX)) != 0) { + result_val |= SQL_SU_INDEX_DEFINITION; + } + if ((scalar_value & (1 << PRIVILEGE)) != 0) { + result_val |= SQL_SU_PRIVILEGE_DEFINITION; + } + info_[SQL_SCHEMA_USAGE] = result_val; + break; + } + case SqlInfoOptions::SQL_CATALOGS_SUPPORTED_ACTIONS: { + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + + // Missing SqlSupportedElementActions enum in C++. Values taken from + // java. + constexpr int32_t PROCEDURE = 0; + constexpr int32_t INDEX = 1; + constexpr int32_t PRIVILEGE = 2; + // Assume catalogs are supported in DML and Table manipulation. + uint32_t result_val = SQL_CU_DML_STATEMENTS | SQL_CU_TABLE_DEFINITION; + if ((scalar_value & (1 << PROCEDURE)) != 0) { + result_val |= SQL_CU_PROCEDURE_INVOCATION; + } + if ((scalar_value & (1 << INDEX)) != 0) { + result_val |= SQL_CU_INDEX_DEFINITION; + } + if ((scalar_value & (1 << PRIVILEGE)) != 0) { + result_val |= SQL_CU_PRIVILEGE_DEFINITION; + } + info_[SQL_CATALOG_USAGE] = result_val; + break; + } + case SqlInfoOptions::SQL_SUPPORTED_POSITIONED_COMMANDS: { + // Ignore, positioned updates/deletes unsupported. + break; + } + case SqlInfoOptions::SQL_SUPPORTED_SUBQUERIES: { + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + + // Missing SqlSupportedElementActions enum in C++. Values taken from + // java. 
+ constexpr int32_t COMPARISONS = 0; + constexpr int32_t EXISTS = 1; + constexpr int32_t INN = 2; + constexpr int32_t QUANTIFIEDS = 3; + uint32_t result_val = 0; + if ((scalar_value & (1 << COMPARISONS)) != 0) { + result_val |= SQL_SQ_COMPARISON; + } + if ((scalar_value & (1 << EXISTS)) != 0) { + result_val |= SQL_SQ_EXISTS; + } + if ((scalar_value & (1 << INN)) != 0) { + result_val |= SQL_SQ_IN; + } + if ((scalar_value & (1 << QUANTIFIEDS)) != 0) { + result_val |= SQL_SQ_QUANTIFIED; + } + info_[SQL_SUBQUERIES] = result_val; + break; + } + case SqlInfoOptions::SQL_SUPPORTED_UNIONS: { + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + + // Missing enum in C++. Values taken from java. + constexpr int32_t UNION = 0; + constexpr int32_t UNION_ALL = 1; + uint32_t result_val = 0; + if ((scalar_value & (1 << UNION)) != 0) { + result_val |= SQL_U_UNION; + } + if ((scalar_value & (1 << UNION_ALL)) != 0) { + result_val |= SQL_U_UNION_ALL; + } + info_[SQL_UNION] = result_val; + break; + } + case SqlInfoOptions::SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS: { + int32_t scalar_value = static_cast(ScalarToInt32(scalar)); + + // Missing enum in C++. Values taken from java. + constexpr int32_t NONE = 0; + constexpr int32_t READ_UNCOMMITTED = 1; + constexpr int32_t READ_COMMITTED = 2; + constexpr int32_t REPEATABLE_READ = 3; + constexpr int32_t SERIALIZABLE = 4; + uint32_t result_val = 0; + if ((scalar_value & (1 << NONE)) != 0) { + result_val = 0; + } + if ((scalar_value & (1 << READ_UNCOMMITTED)) != 0) { + result_val |= SQL_TXN_READ_UNCOMMITTED; + } + if ((scalar_value & (1 << READ_COMMITTED)) != 0) { + result_val |= SQL_TXN_READ_COMMITTED; + } + if ((scalar_value & (1 << REPEATABLE_READ)) != 0) { + result_val |= SQL_TXN_REPEATABLE_READ; + } + if ((scalar_value & (1 << SERIALIZABLE)) != 0) { + result_val |= SQL_TXN_SERIALIZABLE; + } + info_[SQL_TXN_ISOLATION_OPTION] = result_val; + break; + } + case SqlInfoOptions::SQL_SUPPORTED_RESULT_SET_TYPES: + // Ignored. 
Warpdrive supports forward-only only. + break; + case SqlInfoOptions::SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED: + // Ignored. Warpdrive supports forward-only only. + break; + case SqlInfoOptions::SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY: + // Ignored. Warpdrive supports forward-only only. + break; + case SqlInfoOptions:: + SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE: + // Ignored. Warpdrive supports forward-only only. + break; + case SqlInfoOptions:: + SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE: + // Ignored. Warpdrive supports forward-only only. + break; + + // List properties + case ARROW_SQL_NUMERIC_FUNCTIONS: { + std::shared_ptr list_value = + reinterpret_cast(scalar->child_value().get()) + ->value; + uint32_t result_val = 0; + for (int64_t list_index = 0; list_index < list_value->length(); + ++list_index) { + if (!list_value->IsNull(list_index)) { + ReportNumericFunction( + reinterpret_cast(list_value.get()) + ->GetString(list_index), + result_val); + } + } + info_[SQL_NUMERIC_FUNCTIONS] = result_val; + break; + } + + case ARROW_SQL_STRING_FUNCTIONS: { + std::shared_ptr list_value = + reinterpret_cast(scalar->child_value().get()) + ->value; + uint32_t result_val = 0; + for (int64_t list_index = 0; list_index < list_value->length(); + ++list_index) { + if (!list_value->IsNull(list_index)) { + ReportStringFunction( + reinterpret_cast(list_value.get()) + ->GetString(list_index), + result_val); + } + } + info_[SQL_STRING_FUNCTIONS] = result_val; + break; + } + case ARROW_SQL_SYSTEM_FUNCTIONS: { + std::shared_ptr list_value = + reinterpret_cast(scalar->child_value().get()) + ->value; + uint32_t sys_result = 0; + uint32_t convert_result = 0; + for (int64_t list_index = 0; list_index < list_value->length(); + ++list_index) { + if (!list_value->IsNull(list_index)) { + ReportSystemFunction( + reinterpret_cast(list_value.get()) + ->GetString(list_index), + sys_result, convert_result); + } + } + 
info_[SQL_CONVERT_FUNCTIONS] = convert_result; + info_[SQL_SYSTEM_FUNCTIONS] = sys_result; + break; + } + case SqlInfoOptions::SQL_DATETIME_FUNCTIONS: { + std::shared_ptr list_value = + reinterpret_cast(scalar->child_value().get()) + ->value; + uint32_t result_val = 0; + for (int64_t list_index = 0; list_index < list_value->length(); + ++list_index) { + if (!list_value->IsNull(list_index)) { + ReportDatetimeFunction( + reinterpret_cast(list_value.get()) + ->GetString(list_index), + result_val); + } + } + info_[SQL_TIMEDATE_FUNCTIONS] = result_val; + break; + } + + case ARROW_SQL_KEYWORDS: { + std::shared_ptr list_value = + reinterpret_cast(scalar->child_value().get()) + ->value; + std::string result_str; + for (int64_t list_index = 0; list_index < list_value->length(); + ++list_index) { + if (!list_value->IsNull(list_index)) { + if (list_index != 0) { + result_str += ", "; + } + + result_str += reinterpret_cast(list_value.get()) + ->GetString(list_index); + } + } + info_[SQL_KEYWORDS] = std::move(result_str); + break; + } + + // Map properties + case SqlInfoOptions::SQL_SUPPORTS_CONVERT: { + arrow::MapScalar* map_scalar = + reinterpret_cast(scalar->child_value().get()); + auto data_array = map_scalar->value; + arrow::StructArray* map_contents = + reinterpret_cast(data_array.get()); + auto map_keys = map_contents->field(0); + auto map_values = map_contents->field(1); + for (int64_t map_index = 0; map_index < map_contents->length(); + ++map_index) { + if (!map_values->IsNull(map_index)) { + auto map_key_scalar_ptr = map_keys->GetScalar(map_index).ValueOrDie(); + auto map_value_scalar_ptr = + map_values->GetScalar(map_index).ValueOrDie(); + int32_t map_key_scalar = + reinterpret_cast(map_key_scalar_ptr.get()) + ->value; + auto map_value_scalar = + reinterpret_cast(map_value_scalar_ptr.get()) + ->value; + + int32_t get_info_type = GetInfoTypeForArrowConvertEntry(map_key_scalar); + if (get_info_type < 0) { + continue; + } + uint32_t info_bitmask_value_to_write = 0; + 
for (int64_t map_value_array_index = 0; + map_value_array_index < map_value_scalar->length(); + ++map_value_array_index) { + if (!map_value_scalar->IsNull(map_value_array_index)) { + auto list_entry_scalar = + map_value_scalar->GetScalar(map_value_array_index).ValueOrDie(); + info_bitmask_value_to_write |= GetCvtBitForArrowConvertEntry( + reinterpret_cast(list_entry_scalar.get()) + ->value); + } + } + info_[get_info_type] = info_bitmask_value_to_write; + } + } + break; + } + + default: + // Ignore unrecognized. + break; + } + } + } + + if (transactions_supported) { + if (transaction_ddl_commit) { + info_[SQL_TXN_CAPABLE] = static_cast(SQL_TC_DDL_COMMIT); + } else if (transaction_ddl_ignore) { + info_[SQL_TXN_CAPABLE] = static_cast(SQL_TC_DDL_IGNORE); + } else { + // Ambiguous if this means transactions on DDL is supported or not. + // Assume not + info_[SQL_TXN_CAPABLE] = static_cast(SQL_TC_DML); + } + } else { + info_[SQL_TXN_CAPABLE] = static_cast(SQL_TC_NONE); + } + + if (supports_correlation_name) { + if (requires_different_correlation_name) { + info_[SQL_CORRELATION_NAME] = static_cast(SQL_CN_DIFFERENT); + } else { + info_[SQL_CORRELATION_NAME] = static_cast(SQL_CN_ANY); + } + } else { + info_[SQL_CORRELATION_NAME] = static_cast(SQL_CN_NONE); + } + } + LoadDefaultsForMissingEntries(); + return true; + } + + return false; +} + +void GetInfoCache::LoadDefaultsForMissingEntries() { + // For safety's sake, this function does not discriminate between driver and hard-coded + // values. 
+ SetDefaultIfMissing(info_, SQL_ACCESSIBLE_PROCEDURES, "N"); + SetDefaultIfMissing(info_, SQL_ACCESSIBLE_TABLES, "Y"); + SetDefaultIfMissing(info_, SQL_ACTIVE_ENVIRONMENTS, static_cast(0)); + SetDefaultIfMissing( + info_, SQL_AGGREGATE_FUNCTIONS, + static_cast(SQL_AF_ALL | SQL_AF_AVG | SQL_AF_COUNT | SQL_AF_DISTINCT | + SQL_AF_MAX | SQL_AF_MIN | SQL_AF_SUM)); + SetDefaultIfMissing(info_, SQL_ALTER_DOMAIN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_ALTER_TABLE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_ASYNC_MODE, static_cast(SQL_AM_NONE)); + SetDefaultIfMissing(info_, SQL_BATCH_ROW_COUNT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_BATCH_SUPPORT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_BOOKMARK_PERSISTENCE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CATALOG_LOCATION, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CATALOG_NAME, "N"); + SetDefaultIfMissing(info_, SQL_CATALOG_NAME_SEPARATOR, ""); + SetDefaultIfMissing(info_, SQL_CATALOG_TERM, ""); + SetDefaultIfMissing(info_, SQL_CATALOG_USAGE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_COLLATION_SEQ, ""); + SetDefaultIfMissing(info_, SQL_COLUMN_ALIAS, "Y"); + SetDefaultIfMissing(info_, SQL_CONCAT_NULL_BEHAVIOR, + static_cast(SQL_CB_NULL)); + SetDefaultIfMissing(info_, SQL_CONVERT_BIGINT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_BINARY, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_BIT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_CHAR, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_DATE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_DECIMAL, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_DOUBLE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_FLOAT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_GUID, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_INTEGER, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_INTERVAL_YEAR_MONTH, static_cast(0)); + 
SetDefaultIfMissing(info_, SQL_CONVERT_INTERVAL_DAY_TIME, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_LONGVARBINARY, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_LONGVARCHAR, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_NUMERIC, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_REAL, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_SMALLINT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_TIME, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_TIMESTAMP, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_TINYINT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_VARBINARY, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_VARCHAR, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_WCHAR, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_WVARCHAR, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_WLONGVARCHAR, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CONVERT_WLONGVARCHAR, + static_cast(SQL_FN_CVT_CAST)); + SetDefaultIfMissing(info_, SQL_CORRELATION_NAME, static_cast(SQL_CN_NONE)); + SetDefaultIfMissing(info_, SQL_CREATE_ASSERTION, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CREATE_CHARACTER_SET, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CREATE_DOMAIN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CREATE_SCHEMA, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CREATE_TABLE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CREATE_TRANSLATION, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CREATE_VIEW, static_cast(0)); + SetDefaultIfMissing(info_, SQL_CURSOR_COMMIT_BEHAVIOR, + static_cast(SQL_CB_CLOSE)); + SetDefaultIfMissing(info_, SQL_CURSOR_ROLLBACK_BEHAVIOR, + static_cast(SQL_CB_CLOSE)); + SetDefaultIfMissing(info_, SQL_CURSOR_SENSITIVITY, + static_cast(SQL_UNSPECIFIED)); + SetDefaultIfMissing(info_, SQL_DATA_SOURCE_READ_ONLY, "N"); + SetDefaultIfMissing(info_, SQL_DBMS_NAME, "Arrow Flight SQL Server"); + 
SetDefaultIfMissing(info_, SQL_DBMS_VER, "00.01.0000"); + SetDefaultIfMissing(info_, SQL_DDL_INDEX, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DEFAULT_TXN_ISOLATION, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DESCRIBE_PARAMETER, "N"); + SetDefaultIfMissing(info_, SQL_DRIVER_NAME, "Arrow Flight SQL Driver"); + SetDefaultIfMissing(info_, SQL_DRIVER_ODBC_VER, "03.80"); + SetDefaultIfMissing(info_, SQL_DRIVER_VER, "00.09.0000"); + SetDefaultIfMissing(info_, SQL_DROP_ASSERTION, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DROP_CHARACTER_SET, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DROP_COLLATION, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DROP_DOMAIN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DROP_SCHEMA, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DROP_TABLE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DROP_TRANSLATION, static_cast(0)); + SetDefaultIfMissing(info_, SQL_DROP_VIEW, static_cast(0)); + SetDefaultIfMissing(info_, SQL_EXPRESSIONS_IN_ORDERBY, "N"); + SetDefaultIfMissing(info_, SQL_GETDATA_EXTENSIONS, + static_cast(SQL_GD_ANY_COLUMN | SQL_GD_ANY_ORDER)); + SetDefaultIfMissing(info_, SQL_GROUP_BY, + static_cast(SQL_GB_GROUP_BY_CONTAINS_SELECT)); + SetDefaultIfMissing(info_, SQL_IDENTIFIER_CASE, static_cast(SQL_IC_MIXED)); + SetDefaultIfMissing(info_, SQL_IDENTIFIER_QUOTE_CHAR, "\""); + SetDefaultIfMissing(info_, SQL_INDEX_KEYWORDS, static_cast(SQL_IK_NONE)); + SetDefaultIfMissing( + info_, SQL_INFO_SCHEMA_VIEWS, + static_cast(SQL_ISV_TABLES | SQL_ISV_COLUMNS | SQL_ISV_VIEWS)); + SetDefaultIfMissing(info_, SQL_INSERT_STATEMENT, + static_cast(SQL_IS_INSERT_LITERALS | + SQL_IS_INSERT_SEARCHED | SQL_IS_SELECT_INTO)); + SetDefaultIfMissing(info_, SQL_INTEGRITY, "N"); + SetDefaultIfMissing(info_, SQL_KEYWORDS, ""); + SetDefaultIfMissing(info_, SQL_LIKE_ESCAPE_CLAUSE, "Y"); + SetDefaultIfMissing(info_, SQL_MAX_ASYNC_CONCURRENT_STATEMENTS, + static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_BINARY_LITERAL_LEN, 
static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_CATALOG_NAME_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_CHAR_LITERAL_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_COLUMN_NAME_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_COLUMNS_IN_GROUP_BY, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_COLUMNS_IN_INDEX, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_COLUMNS_IN_ORDER_BY, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_COLUMNS_IN_SELECT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_COLUMNS_IN_TABLE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_CURSOR_NAME_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_DRIVER_CONNECTIONS, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_IDENTIFIER_LEN, static_cast(65535)); + SetDefaultIfMissing(info_, SQL_MAX_INDEX_SIZE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_PROCEDURE_NAME_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_ROW_SIZE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_ROW_SIZE_INCLUDES_LONG, "N"); + SetDefaultIfMissing(info_, SQL_MAX_SCHEMA_NAME_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_STATEMENT_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_TABLE_NAME_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_TABLES_IN_SELECT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_MAX_USER_NAME_LEN, static_cast(0)); + SetDefaultIfMissing(info_, SQL_NON_NULLABLE_COLUMNS, + static_cast(SQL_NNC_NULL)); + SetDefaultIfMissing(info_, SQL_NULL_COLLATION, static_cast(SQL_NC_END)); + SetDefaultIfMissing(info_, SQL_NUMERIC_FUNCTIONS, static_cast(0)); + SetDefaultIfMissing(info_, SQL_OJ_CAPABILITIES, + static_cast(SQL_OJ_LEFT | SQL_OJ_RIGHT | SQL_OJ_FULL)); + SetDefaultIfMissing(info_, SQL_ORDER_BY_COLUMNS_IN_SELECT, "Y"); + SetDefaultIfMissing(info_, SQL_PROCEDURE_TERM, ""); + SetDefaultIfMissing(info_, SQL_PROCEDURES, "N"); + SetDefaultIfMissing(info_, 
SQL_QUOTED_IDENTIFIER_CASE, + static_cast(SQL_IC_SENSITIVE)); + SetDefaultIfMissing(info_, SQL_SCHEMA_TERM, "schema"); + SetDefaultIfMissing(info_, SQL_SCHEMA_USAGE, + static_cast(SQL_SU_DML_STATEMENTS)); + SetDefaultIfMissing(info_, SQL_SEARCH_PATTERN_ESCAPE, "\\"); + SetDefaultIfMissing( + info_, SQL_SERVER_NAME, + "Arrow Flight SQL Server"); // This might actually need to be the hostname. + SetDefaultIfMissing(info_, SQL_SQL_CONFORMANCE, + static_cast(SQL_SC_SQL92_ENTRY)); + SetDefaultIfMissing(info_, SQL_SQL92_DATETIME_FUNCTIONS, + static_cast(SQL_SDF_CURRENT_DATE | SQL_SDF_CURRENT_TIME | + SQL_SDF_CURRENT_TIMESTAMP)); + SetDefaultIfMissing(info_, SQL_SQL92_FOREIGN_KEY_DELETE_RULE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_SQL92_FOREIGN_KEY_UPDATE_RULE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_SQL92_GRANT, static_cast(0)); + SetDefaultIfMissing(info_, SQL_SQL92_NUMERIC_VALUE_FUNCTIONS, static_cast(0)); + SetDefaultIfMissing( + info_, SQL_SQL92_PREDICATES, + static_cast(SQL_SP_BETWEEN | SQL_SP_COMPARISON | SQL_SP_EXISTS | + SQL_SP_IN | SQL_SP_ISNOTNULL | SQL_SP_ISNULL | SQL_SP_LIKE)); + SetDefaultIfMissing( + info_, SQL_SQL92_RELATIONAL_JOIN_OPERATORS, + static_cast(SQL_SRJO_INNER_JOIN | SQL_SRJO_CROSS_JOIN | + SQL_SRJO_LEFT_OUTER_JOIN | SQL_SRJO_FULL_OUTER_JOIN | + SQL_SRJO_RIGHT_OUTER_JOIN)); + SetDefaultIfMissing(info_, SQL_SQL92_REVOKE, static_cast(0)); + SetDefaultIfMissing(info_, SQL_SQL92_ROW_VALUE_CONSTRUCTOR, + static_cast(SQL_SRVC_VALUE_EXPRESSION | SQL_SRVC_NULL)); + SetDefaultIfMissing( + info_, SQL_SQL92_STRING_FUNCTIONS, + static_cast(SQL_SSF_CONVERT | SQL_SSF_LOWER | SQL_SSF_UPPER | + SQL_SSF_SUBSTRING | SQL_SSF_TRIM_BOTH | SQL_SSF_TRIM_LEADING | + SQL_SSF_TRIM_TRAILING)); + SetDefaultIfMissing(info_, SQL_SQL92_VALUE_EXPRESSIONS, + static_cast(SQL_SVE_CASE | SQL_SVE_CAST | + SQL_SVE_COALESCE | SQL_SVE_NULLIF)); + SetDefaultIfMissing(info_, SQL_STANDARD_CLI_CONFORMANCE, static_cast(0)); + SetDefaultIfMissing( + info_, 
SQL_STRING_FUNCTIONS, + static_cast(SQL_FN_STR_CONCAT | SQL_FN_STR_LCASE | SQL_FN_STR_LENGTH | + SQL_FN_STR_LTRIM | SQL_FN_STR_RTRIM | SQL_FN_STR_SPACE | + SQL_FN_STR_SUBSTRING | SQL_FN_STR_UCASE)); + SetDefaultIfMissing( + info_, SQL_SUBQUERIES, + static_cast(SQL_SQ_CORRELATED_SUBQUERIES | SQL_SQ_COMPARISON | + SQL_SQ_EXISTS | SQL_SQ_IN | SQL_SQ_QUANTIFIED)); + SetDefaultIfMissing(info_, SQL_SYSTEM_FUNCTIONS, + static_cast(SQL_FN_SYS_IFNULL | SQL_FN_SYS_USERNAME)); + SetDefaultIfMissing(info_, SQL_TIMEDATE_ADD_INTERVALS, + static_cast( + SQL_FN_TSI_FRAC_SECOND | SQL_FN_TSI_SECOND | SQL_FN_TSI_MINUTE | + SQL_FN_TSI_HOUR | SQL_FN_TSI_DAY | SQL_FN_TSI_WEEK | + SQL_FN_TSI_MONTH | SQL_FN_TSI_QUARTER | SQL_FN_TSI_YEAR)); + SetDefaultIfMissing(info_, SQL_TIMEDATE_DIFF_INTERVALS, + static_cast( + SQL_FN_TSI_FRAC_SECOND | SQL_FN_TSI_SECOND | SQL_FN_TSI_MINUTE | + SQL_FN_TSI_HOUR | SQL_FN_TSI_DAY | SQL_FN_TSI_WEEK | + SQL_FN_TSI_MONTH | SQL_FN_TSI_QUARTER | SQL_FN_TSI_YEAR)); + SetDefaultIfMissing(info_, SQL_UNION, + static_cast(SQL_U_UNION | SQL_U_UNION_ALL)); + SetDefaultIfMissing(info_, SQL_XOPEN_CLI_YEAR, "1995"); + SetDefaultIfMissing(info_, SQL_ODBC_SQL_CONFORMANCE, + static_cast(SQL_OSC_MINIMUM)); + SetDefaultIfMissing(info_, SQL_ODBC_SAG_CLI_CONFORMANCE, + static_cast(SQL_OSCC_COMPLIANT)); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/get_info_cache.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/get_info_cache.h new file mode 100644 index 00000000000..a54dda2e13b --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/get_info_cache.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace driver { +namespace flight_sql { + +class GetInfoCache { + private: + std::unordered_map info_; + arrow::flight::FlightCallOptions& call_options_; + std::unique_ptr& sql_client_; + std::mutex mutex_; + std::atomic has_server_info_; + + public: + GetInfoCache(arrow::flight::FlightCallOptions& call_options, + std::unique_ptr& client, + const std::string& driver_version); + void SetProperty(uint16_t property, driver::odbcabstraction::Connection::Info value); + driver::odbcabstraction::Connection::Info GetInfo(uint16_t info_type); + + private: + bool LoadInfoFromServer(); + void LoadDefaultsForMissingEntries(); +}; +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/configuration.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/configuration.h new file mode 100644 index 00000000000..ead39976d43 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/configuration.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h" + +// winuser.h needs to be included after windows.h, which is defined in platform.h +#include +namespace driver { +namespace flight_sql { +namespace config { + +#define TRUE_STR "true" +#define FALSE_STR "false" + +/** + * ODBC configuration abstraction. + */ +class Configuration { + public: + /** + * Default constructor. + */ + Configuration(); + + /** + * Destructor. + */ + ~Configuration(); + + /** + * Convert configure to connect string. + * + * @return Connect string. + */ + std::string ToConnectString() const; + + void LoadDefaults(); + void LoadDsn(const std::string& dsn); + + void Clear(); + bool IsSet(const std::string_view& key) const; + const std::string& Get(const std::string_view& key) const; + void Set(const std::string_view& key, const std::string& value); + + /** + * Get properties map. 
+ */ + const driver::odbcabstraction::Connection::ConnPropertyMap& GetProperties() const; + + std::vector GetCustomKeys() const; + + private: + driver::odbcabstraction::Connection::ConnPropertyMap properties_; +}; + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/connection_string_parser.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/connection_string_parser.h new file mode 100644 index 00000000000..45494c74390 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/connection_string_parser.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "config/configuration.h" + +namespace driver { +namespace flight_sql { +namespace config { + +/** + * ODBC configuration parser abstraction. + */ +class ConnectionStringParser { + public: + /** + * Constructor. + * + * @param cfg Configuration. + */ + explicit ConnectionStringParser(Configuration& cfg); + + /** + * Destructor. + */ + ~ConnectionStringParser(); + + /** + * Parse connect string. + * + * @param str String to parse. + * @param len String length. 
+ * @param delimiter delimiter. + */ + void ParseConnectionString(const char* str, size_t len, char delimiter); + + /** + * Parse connect string. + * + * @param str String to parse. + */ + void ParseConnectionString(const std::string& str); + + /** + * Parse config attributes. + * + * @param str String to parse. + */ + void ParseConfigAttributes(const char* str); + + private: + ConnectionStringParser(const ConnectionStringParser& parser) = delete; + ConnectionStringParser& operator=(const ConnectionStringParser&) = delete; + + /** Configuration. */ + Configuration& cfg_; +}; + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/flight_sql_driver.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/flight_sql_driver.h new file mode 100644 index 00000000000..6a2977c7bac --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/flight_sql_driver.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +namespace driver { +namespace flight_sql { + +class FlightSqlDriver : public odbcabstraction::Driver { + private: + odbcabstraction::Diagnostics diagnostics_; + std::string version_; + + public: + FlightSqlDriver(); + ~FlightSqlDriver(); + + std::shared_ptr CreateConnection( + odbcabstraction::OdbcVersion odbc_version) override; + + odbcabstraction::Diagnostics& GetDiagnostics() override; + + void SetVersion(std::string version) override; + + /// Register Arrow Compute kernels once. + void RegisterComputeKernels(); + + void RegisterLog() override; +}; + +}; // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/add_property_window.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/add_property_window.h new file mode 100644 index 00000000000..6f87f716210 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/add_property_window.h @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +// platform.h needs to be included before custom_window.h due to windows.h conflicts +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/custom_window.h" + +namespace driver { +namespace flight_sql { +namespace config { +/** + * Add property window class. + */ +class AddPropertyWindow : public CustomWindow { + /** + * Children windows ids. + */ + struct ChildId { + enum Type { + KEY_EDIT = 100, + KEY_LABEL, + VALUE_EDIT, + VALUE_LABEL, + OK_BUTTON, + CANCEL_BUTTON + }; + }; + + public: + /** + * Constructor. + * + * @param parent Parent window handle. + */ + explicit AddPropertyWindow(Window* parent); + + /** + * Destructor. + */ + virtual ~AddPropertyWindow(); + + /** + * Create window in the center of the parent window. + */ + void Create(); + + void OnCreate() override; + + bool OnMessage(UINT msg, WPARAM wparam, LPARAM lparam) override; + + /** + * Get the property from the dialog. + * + * @return true if the dialog was OK'd, false otherwise. + */ + bool GetProperty(std::string& key, std::string& value); + + private: + /** + * Create property edit boxes. + * + * @param pos_x X position. + * @param pos_y Y position. + * @param size_x Width. + * @return Size by Y. + */ + int CreateEdits(int pos_x, int pos_y, int size_x); + + void CheckEnableOk(); + + std::vector > labels_; + + /** Ok button. */ + std::unique_ptr ok_button_; + + /** Cancel button. */ + std::unique_ptr cancel_button_; + + std::unique_ptr key_edit_; + + std::unique_ptr value_edit_; + + std::string key_; + + std::string value_; + + /** Window width. */ + int width_; + + /** Window height. */ + int height_; + + /** Flag indicating whether OK option was selected. 
*/ + bool accepted_; + + bool is_initialized_; +}; + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/custom_window.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/custom_window.h new file mode 100644 index 00000000000..f9651d5cca4 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/custom_window.h @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "ui/window.h" + +namespace driver { +namespace flight_sql { +namespace config { +/** + * Application execution result. + */ +struct Result { + enum Type { OK, CANCEL }; +}; + +/** + * Process UI messages in current thread. + * Blocks until quit message has been received. + * + * @param window Main window. + * @return Application execution result. + */ +Result::Type ProcessMessages(Window& window); + +/** + * Window class. + */ +class CustomWindow : public Window { + public: + // Window margin size. + enum { MARGIN = 10 }; + + // Standard interval between UI elements. + enum { INTERVAL = 10 }; + + // Standard row height. + enum { ROW_HEIGHT = 20 }; + + // Standard button width. 
+ enum { BUTTON_WIDTH = 80 }; + + // Standard button height. + enum { BUTTON_HEIGHT = 25 }; + + /** + * Constructor. + * + * @param parent Parent window. + * @param class_name Window class name. + * @param title Window title. + */ + CustomWindow(Window* parent, const char* class_name, const char* title); + + /** + * Destructor. + */ + virtual ~CustomWindow(); + + /** + * Callback which is called upon receiving new message. + * Pure virtual. Should be defined by user. + * + * @param msg Message. + * @param wparam Word-sized parameter. + * @param lparam Long parameter. + * @return Should return true if the message has been + * processed by the handler and false otherwise. + */ + virtual bool OnMessage(UINT msg, WPARAM wparam, LPARAM lparam) = 0; + + /** + * Callback that is called upon window creation. + */ + virtual void OnCreate() = 0; + + private: + // IGNITE_NO_COPY_ASSIGNMENT(CustomWindow) + + /** + * Static callback. + * + * @param hwnd Window handle. + * @param msg Message. + * @param wparam Word-sized parameter. + * @param lparam Long parameter. + * @return Operation result. + */ + static LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam); +}; + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/dsn_configuration_window.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/dsn_configuration_window.h new file mode 100644 index 00000000000..daa962ddb2a --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/dsn_configuration_window.h @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "config/configuration.h" +#include "ui/custom_window.h" + +namespace driver { +namespace flight_sql { +namespace config { +/** + * DSN configuration window class. + */ +class DsnConfigurationWindow : public CustomWindow { + /** + * Children windows ids. + */ + struct ChildId { + enum Type { + CONNECTION_SETTINGS_GROUP_BOX = 100, + AUTH_SETTINGS_GROUP_BOX, + ENCRYPTION_SETTINGS_GROUP_BOX, + NAME_EDIT, + NAME_LABEL, + SERVER_EDIT, + SERVER_LABEL, + PORT_EDIT, + PORT_LABEL, + AUTH_TYPE_LABEL, + AUTH_TYPE_COMBOBOX, + USER_LABEL, + USER_EDIT, + PASSWORD_LABEL, + PASSWORD_EDIT, + AUTH_TOKEN_LABEL, + AUTH_TOKEN_EDIT, + ENABLE_ENCRYPTION_LABEL, + ENABLE_ENCRYPTION_CHECKBOX, + CERTIFICATE_LABEL, + CERTIFICATE_EDIT, + CERTIFICATE_BROWSE_BUTTON, + USE_SYSTEM_CERT_STORE_LABEL, + USE_SYSTEM_CERT_STORE_CHECKBOX, + DISABLE_CERT_VERIFICATION_LABEL, + DISABLE_CERT_VERIFICATION_CHECKBOX, + PROPERTY_GROUP_BOX, + PROPERTY_LIST, + ADD_BUTTON, + DELETE_BUTTON, + TAB_CONTROL, + TEST_CONNECTION_BUTTON, + OK_BUTTON, + CANCEL_BUTTON + }; + }; + + public: + /** + * Constructor. + * + * @param parent Parent window handle. + * @param config Configuration + */ + DsnConfigurationWindow(Window* parent, config::Configuration& config); + + /** + * Destructor. + */ + virtual ~DsnConfigurationWindow(); + + /** + * Create window in the center of the parent window. 
+ */ + void Create(); + + void OnCreate() override; + + bool OnMessage(UINT msg, WPARAM wparam, LPARAM lparam) override; + + private: + /** + * Create connection settings group box. + * + * @param pos_x X position. + * @param pos_y Y position. + * @param size_x Width. + * @return Size by Y. + */ + int CreateConnectionSettingsGroup(int pos_x, int pos_y, int size_x); + + /** + * Create aythentication settings group box. + * + * @param pos_x X position. + * @param pos_y Y position. + * @param size_x Width. + * @return Size by Y. + */ + int CreateAuthSettingsGroup(int pos_x, int pos_y, int size_x); + + /** + * Create Encryption settings group box. + * + * @param pos_x X position. + * @param pos_y Y position. + * @param size_x Width. + * @return Size by Y. + */ + int CreateEncryptionSettingsGroup(int pos_x, int pos_y, int size_x); + + /** + * Create advanced properties group box. + * + * @param pos_x X position. + * @param pos_y Y position. + * @param size_x Width. + * @return Size by Y. + */ + int CreatePropertiesGroup(int pos_x, int pos_y, int size_x); + + void SelectTab(int tab_index); + + void CheckEnableOk(); + + void CheckAuthType(); + + void SaveParameters(Configuration& target_config); + + /** Window width. */ + int width_; + + /** Window height. */ + int height_; + + std::unique_ptr tab_control_; + + /** Connection settings group box. */ + std::unique_ptr connection_settings_group_box_; + + /** Authentication settings group box. */ + std::unique_ptr auth_settings_group_box_; + + /** Encryption settings group box. */ + std::unique_ptr encryption_settings_group_box_; + + std::vector > labels_; + + /** Test button. */ + std::unique_ptr test_button_; + + /** Ok button. */ + std::unique_ptr ok_button_; + + /** Cancel button. */ + std::unique_ptr cancel_button_; + + /** DSN name edit field. */ + std::unique_ptr name_edit_; + + std::unique_ptr server_edit_; + + std::unique_ptr port_edit_; + + std::unique_ptr auth_type_combo_box_; + + /** User edit. 
*/ + std::unique_ptr user_edit_; + + /** Password edit. */ + std::unique_ptr password_edit_; + + std::unique_ptr auth_token_edit_; + + std::unique_ptr enable_encryption_check_box_; + + std::unique_ptr certificate_edit_; + + std::unique_ptr certificate_browse_button_; + + std::unique_ptr use_system_cert_store_check_box_; + + std::unique_ptr disable_cert_verification_check_box_; + + std::unique_ptr property_group_box_; + + std::unique_ptr property_list_; + + std::unique_ptr add_button_; + + std::unique_ptr delete_button_; + + /** Configuration. */ + Configuration& config_; + + /** Flag indicating whether OK option was selected. */ + bool accepted_; + + bool is_initialized_; +}; + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/window.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/window.h new file mode 100644 index 00000000000..8ac8dd7934a --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/window.h @@ -0,0 +1,309 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +namespace driver { +namespace flight_sql { +namespace config { + +/** + * Get handle for the current module. + * + * @return Handle for the current module. + */ +HINSTANCE GetHInstance(); + +/** + * Window class. + */ +class Window { + public: + /** + * Constructor for a new window that is going to be created. + * + * @param parent Parent window handle. + * @param class_name Window class name. + * @param title Window title. + */ + Window(Window* parent, const char* class_name, const char* title); + + /** + * Constructor for the existing window. + * + * @param handle Window handle. + */ + explicit Window(HWND handle); + + /** + * Destructor. + */ + virtual ~Window(); + + /** + * Create window. + * + * @param style Window style. + * @param pos_x Window x position. + * @param pos_y Window y position. + * @param width Window width. + * @param height Window height. + * @param id ID for child window. + */ + void Create(DWORD style, int pos_x, int pos_y, int width, int height, int id); + + /** + * Create child tab controlwindow. + * + * @param id ID to be assigned to the created window. + * @return Auto pointer containing new window. + */ + std::unique_ptr CreateTabControl(int id); + + /** + * Create child list view window. + * + * @param pos_x Position by X coordinate. + * @param pos_y Position by Y coordinate. + * @param size_x Size by X coordinate. + * @param size_y Size by Y coordinate. + * @param id ID to be assigned to the created window. + * @return Auto pointer containing new window. + */ + std::unique_ptr CreateList(int pos_x, int pos_y, int size_x, int size_y, + int id); + + /** + * Create child group box window. + * + * @param pos_x Position by X coordinate. + * @param pos_y Position by Y coordinate. + * @param size_x Size by X coordinate. + * @param size_y Size by Y coordinate. + * @param title Title. + * @param id ID to be assigned to the created window. + * @return Auto pointer containing new window. 
+ */ + std::unique_ptr CreateGroupBox(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id); + + /** + * Create child label window. + * + * @param pos_x Position by X coordinate. + * @param pos_y Position by Y coordinate. + * @param size_x Size by X coordinate. + * @param size_y Size by Y coordinate. + * @param title Title. + * @param id ID to be assigned to the created window. + * @return Auto pointer containing new window. + */ + std::unique_ptr CreateLabel(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id); + + /** + * Create child Edit window. + * + * @param pos_x Position by X coordinate. + * @param pos_y Position by Y coordinate. + * @param size_x Size by X coordinate. + * @param size_y Size by Y coordinate. + * @param title Title. + * @param id ID to be assigned to the created window. + * @param style Window style. + * @return Auto pointer containing new window. + */ + std::unique_ptr CreateEdit(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id, int style = 0); + + /** + * Create child button window. + * + * @param pos_x Position by X coordinate. + * @param pos_y Position by Y coordinate. + * @param size_x Size by X coordinate. + * @param size_y Size by Y coordinate. + * @param title Title. + * @param id ID to be assigned to the created window. + * @param style Window style. + * @return Auto pointer containing new window. + */ + std::unique_ptr CreateButton(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id, int style = 0); + + /** + * Create child CheckBox window. + * + * @param pos_x Position by X coordinate. + * @param pos_y Position by Y coordinate. + * @param size_x Size by X coordinate. + * @param size_y Size by Y coordinate. + * @param title Title. + * @param id ID to be assigned to the created window. + * @param state Checked state of checkbox + * @return Auto pointer containing new window. 
+ */ + std::unique_ptr CreateCheckBox(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id, bool state); + + /** + * Create child ComboBox window. + * + * @param pos_x Position by X coordinate. + * @param pos_y Position by Y coordinate. + * @param size_x Size by X coordinate. + * @param size_y Size by Y coordinate. + * @param title Title. + * @param id ID to be assigned to the created window. + * @return Auto pointer containing new window. + */ + std::unique_ptr CreateComboBox(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id); + + /** + * Show window. + */ + void Show(); + + /** + * Update window. + */ + void Update(); + + /** + * Destroy window. + */ + void Destroy(); + + /** + * Get window handle. + * + * @return Window handle. + */ + HWND GetHandle() const { return handle_; } + + void SetVisible(bool isVisible); + + void ListAddColumn(const std::string& name, int index, int width); + + void ListAddItem(const std::vector& items); + + void ListDeleteSelectedItem(); + + std::vector > ListGetAll(); + + void AddTab(const std::string& name, int index); + + bool IsTextEmpty() const; + + /** + * Get window text. + * + * @param text Text. + */ + void GetText(std::string& text) const; + + /** + * Set window text. + * + * @param text Text. + */ + void SetText(const std::string& text) const; + + /** + * Get CheckBox state. + * + * @return True if checked. + */ + bool IsChecked() const; + + /** + * Set CheckBox state. + * + * @param state True if checked. + */ + void SetChecked(bool state); + + /** + * Add string. + * + * @param str String. + */ + void AddString(const std::string& str); + + /** + * Set current ComboBox selection. + * + * @param idx List index. + */ + void SetSelection(int idx); + + /** + * Get current ComboBox selection. + * + * @return idx List index. + */ + int GetSelection() const; + + /** + * Set enabled. + * + * @param enabled Enable flag. 
+ */ + void SetEnabled(bool enabled); + + /** + * Check if the window is enabled. + * + * @return True if enabled. + */ + bool IsEnabled() const; + + protected: + /** + * Set window handle. + * + * @param value Window handle. + */ + void SetHandle(HWND value) { handle_ = value; } + + /** Window class name. */ + std::string class_name_; + + /** Window title. */ + std::string title_; + + /** Window handle. */ + HWND handle_; + + /** Window parent. */ + Window* parent_; + + /** Specifies whether window has been created by the thread and needs destruction. */ + bool created_; + + private: + Window(const Window& window) = delete; +}; + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter.cc new file mode 100644 index 00000000000..e9c9df3bac5 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter.cc @@ -0,0 +1,326 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/json_converter.h" + +#include +#include +#include +#include "arrow/builder.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" +#include "arrow/scalar.h" +#include "arrow/visitor.h" + +using arrow::Status; + +using boost::beast::detail::base64::encode; +using boost::beast::detail::base64::encoded_size; +namespace base64 = boost::beast::detail::base64; + +using driver::flight_sql::ThrowIfNotOK; + +namespace { +template +Status ConvertScalarToStringAndWrite(const ScalarT& scalar, + rapidjson::Writer& writer) { + ARROW_ASSIGN_OR_RAISE(auto string_scalar, scalar.CastTo(arrow::utf8())) + const auto& view = reinterpret_cast(string_scalar.get())->view(); + writer.String(view.data(), view.length(), true); + return Status::OK(); +} + +template +Status ConvertBinaryToBase64StringAndWrite( + const BinaryScalarT& scalar, rapidjson::Writer& writer) { + const auto& view = scalar.view(); + size_t encoded_size = base64::encoded_size(view.length()); + std::vector encoded(std::max(encoded_size, static_cast(1))); + base64::encode(&encoded[0], view.data(), view.length()); + writer.String(&encoded[0], encoded_size, true); + return Status::OK(); +} + +template +Status WriteListScalar(const ListScalarT& scalar, + rapidjson::Writer& writer, + arrow::ScalarVisitor* visitor) { + writer.StartArray(); + for (int64_t i = 0; i < scalar.value->length(); ++i) { + if (scalar.value->IsNull(i)) { + writer.Null(); + } else { + const auto& result = scalar.value->GetScalar(i); + ThrowIfNotOK(result.status()); + ThrowIfNotOK(result.ValueOrDie()->Accept(visitor)); + } + } + + writer.EndArray(); + return Status::OK(); +} + +class ScalarToJson : public arrow::ScalarVisitor { + private: + rapidjson::StringBuffer string_buffer_; + rapidjson::Writer writer_{string_buffer_}; + + public: + void Reset() { + string_buffer_.Clear(); + writer_.Reset(string_buffer_); + } + + std::string ToString() { return string_buffer_.GetString(); } + + Status Visit(const 
arrow::NullScalar& scalar) override { + writer_.Null(); + + return Status::OK(); + } + + Status Visit(const arrow::BooleanScalar& scalar) override { + writer_.Bool(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::Int8Scalar& scalar) override { + writer_.Int(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::Int16Scalar& scalar) override { + writer_.Int(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::Int32Scalar& scalar) override { + writer_.Int(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::Int64Scalar& scalar) override { + writer_.Int64(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::UInt8Scalar& scalar) override { + writer_.Uint(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::UInt16Scalar& scalar) override { + writer_.Uint(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::UInt32Scalar& scalar) override { + writer_.Uint(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::UInt64Scalar& scalar) override { + writer_.Uint64(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::HalfFloatScalar& scalar) override { + return Status::NotImplemented("Cannot convert HalfFloatScalar to JSON."); + } + + Status Visit(const arrow::FloatScalar& scalar) override { + writer_.Double(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::DoubleScalar& scalar) override { + writer_.Double(scalar.value); + + return Status::OK(); + } + + Status Visit(const arrow::StringScalar& scalar) override { + const auto& view = scalar.view(); + writer_.String(view.data(), view.length()); + + return Status::OK(); + } + + Status Visit(const arrow::BinaryScalar& scalar) override { + return ConvertBinaryToBase64StringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::LargeStringScalar& scalar) override { + const auto& view = scalar.view(); + 
writer_.String(view.data(), view.length()); + + return Status::OK(); + } + + Status Visit(const arrow::LargeBinaryScalar& scalar) override { + return ConvertBinaryToBase64StringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::FixedSizeBinaryScalar& scalar) override { + return ConvertBinaryToBase64StringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::Date64Scalar& scalar) override { + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::Date32Scalar& scalar) override { + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::Time32Scalar& scalar) override { + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::Time64Scalar& scalar) override { + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::TimestampScalar& scalar) override { + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::DayTimeIntervalScalar& scalar) override { + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::MonthDayNanoIntervalScalar& scalar) override { + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::MonthIntervalScalar& scalar) override { + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::DurationScalar& scalar) override { + // TODO: Append TimeUnit on conversion + return ConvertScalarToStringAndWrite(scalar, writer_); + } + + Status Visit(const arrow::Decimal128Scalar& scalar) override { + const auto& view = scalar.ToString(); + writer_.RawValue(view.data(), view.length(), rapidjson::kNumberType); + + return Status::OK(); + } + + Status Visit(const arrow::Decimal256Scalar& scalar) override { + const auto& view = scalar.ToString(); + writer_.RawValue(view.data(), view.length(), rapidjson::kNumberType); + + return Status::OK(); + } + + Status Visit(const arrow::ListScalar& 
scalar) override { + return WriteListScalar(scalar, writer_, this); + } + + Status Visit(const arrow::LargeListScalar& scalar) override { + return WriteListScalar(scalar, writer_, this); + } + + Status Visit(const arrow::MapScalar& scalar) override { + return WriteListScalar(scalar, writer_, this); + } + + Status Visit(const arrow::FixedSizeListScalar& scalar) override { + return WriteListScalar(scalar, writer_, this); + } + + Status Visit(const arrow::StructScalar& scalar) override { + writer_.StartObject(); + + const std::shared_ptr& data_type = + std::static_pointer_cast(scalar.type); + for (int i = 0; i < data_type->num_fields(); ++i) { + const auto& result = scalar.field(i); + ThrowIfNotOK(result.status()); + const auto& value = result.ValueOrDie(); + writer_.Key(data_type->field(i)->name().c_str()); + if (value->is_valid) { + ThrowIfNotOK(value->Accept(this)); + } else { + writer_.Null(); + } + } + writer_.EndObject(); + return Status::OK(); + } + + Status Visit(const arrow::DictionaryScalar& scalar) override { + return Status::NotImplemented("Cannot convert DictionaryScalar to JSON."); + } + + Status Visit(const arrow::SparseUnionScalar& scalar) override { + return scalar.child_value().get()->Accept(this); + } + + Status Visit(const arrow::DenseUnionScalar& scalar) override { + return scalar.child_value().get()->Accept(this); + } + + Status Visit(const arrow::ExtensionScalar& scalar) override { + return Status::NotImplemented("Cannot convert ExtensionScalar to JSON."); + } +}; +} // namespace + +namespace driver { +namespace flight_sql { + +std::string ConvertToJson(const arrow::Scalar& scalar) { + static thread_local ScalarToJson converter; + converter.Reset(); + ThrowIfNotOK(scalar.Accept(&converter)); + + return converter.ToString(); +} + +arrow::Result> ConvertToJson( + const std::shared_ptr& input) { + arrow::StringBuilder builder; + int64_t length = input->length(); + RETURN_NOT_OK(builder.ReserveData(length)); + + for (int64_t i = 0; i < length; ++i) 
{ + if (input->IsNull(i)) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + ARROW_ASSIGN_OR_RAISE(auto scalar, input->GetScalar(i)) + RETURN_NOT_OK(builder.Append(ConvertToJson(*scalar))); + } + } + + return builder.Finish(); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter.h new file mode 100644 index 00000000000..de466af4f77 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +namespace driver { +namespace flight_sql { + +std::string ConvertToJson(const arrow::Scalar& scalar); + +arrow::Result> ConvertToJson( + const std::shared_ptr& input); + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter_test.cc new file mode 100644 index 00000000000..d5e018ecd47 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/json_converter_test.cc @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/json_converter.h" +#include "arrow/scalar.h" +#include "arrow/testing/builder.h" +#include "arrow/type.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { + +using arrow::TimeUnit; + +TEST(ConvertToJson, String) { + ASSERT_EQ("\"\"", ConvertToJson(arrow::StringScalar(""))); + ASSERT_EQ("\"string\"", ConvertToJson(arrow::StringScalar("string"))); + ASSERT_EQ("\"string\\\"\"", ConvertToJson(arrow::StringScalar("string\""))); +} + +TEST(ConvertToJson, LargeString) { + ASSERT_EQ("\"\"", ConvertToJson(arrow::LargeStringScalar(""))); + ASSERT_EQ("\"string\"", ConvertToJson(arrow::LargeStringScalar("string"))); + ASSERT_EQ("\"string\\\"\"", ConvertToJson(arrow::LargeStringScalar("string\""))); +} + +TEST(ConvertToJson, Binary) { + ASSERT_EQ("\"\"", ConvertToJson(arrow::BinaryScalar(""))); + ASSERT_EQ("\"c3RyaW5n\"", ConvertToJson(arrow::BinaryScalar("string"))); + ASSERT_EQ("\"c3RyaW5nIg==\"", ConvertToJson(arrow::BinaryScalar("string\""))); +} + +TEST(ConvertToJson, LargeBinary) { + ASSERT_EQ("\"\"", ConvertToJson(arrow::LargeBinaryScalar(""))); + ASSERT_EQ("\"c3RyaW5n\"", ConvertToJson(arrow::LargeBinaryScalar("string"))); + ASSERT_EQ("\"c3RyaW5nIg==\"", ConvertToJson(arrow::LargeBinaryScalar("string\""))); +} + +TEST(ConvertToJson, FixedSizeBinary) { + ASSERT_EQ("\"\"", ConvertToJson(arrow::FixedSizeBinaryScalar(""))); + ASSERT_EQ("\"c3RyaW5n\"", ConvertToJson(arrow::FixedSizeBinaryScalar("string"))); + ASSERT_EQ("\"c3RyaW5nIg==\"", ConvertToJson(arrow::FixedSizeBinaryScalar("string\""))); +} + +TEST(ConvertToJson, Int8) { + ASSERT_EQ("127", ConvertToJson(arrow::Int8Scalar(127))); + ASSERT_EQ("-128", ConvertToJson(arrow::Int8Scalar(-128))); +} + +TEST(ConvertToJson, Int16) { + ASSERT_EQ("32767", ConvertToJson(arrow::Int16Scalar(32767))); + ASSERT_EQ("-32768", ConvertToJson(arrow::Int16Scalar(-32768))); +} + +TEST(ConvertToJson, Int32) { + ASSERT_EQ("2147483647", 
ConvertToJson(arrow::Int32Scalar(2147483647))); + // 2147483648 is not valid as a signed int, using workaround + ASSERT_EQ("-2147483648", + ConvertToJson(arrow::Int32Scalar(static_cast(-2147483647 - 1)))); +} + +TEST(ConvertToJson, Int64) { + ASSERT_EQ("9223372036854775807", + ConvertToJson(arrow::Int64Scalar(9223372036854775807LL))); + // 9223372036854775808ULL is not valid as a signed int64, using workaround + ASSERT_EQ("-9223372036854775808", ConvertToJson(arrow::Int64Scalar(static_cast( + -9223372036854775807LL - 1)))); +} + +TEST(ConvertToJson, UInt8) { + ASSERT_EQ("127", ConvertToJson(arrow::UInt8Scalar(127))); + ASSERT_EQ("255", ConvertToJson(arrow::UInt8Scalar(255))); +} + +TEST(ConvertToJson, UInt16) { + ASSERT_EQ("32767", ConvertToJson(arrow::UInt16Scalar(32767))); + ASSERT_EQ("65535", ConvertToJson(arrow::UInt16Scalar(65535))); +} + +TEST(ConvertToJson, UInt32) { + ASSERT_EQ("2147483647", ConvertToJson(arrow::UInt32Scalar(2147483647))); + ASSERT_EQ("4294967295", ConvertToJson(arrow::UInt32Scalar(4294967295))); +} + +TEST(ConvertToJson, UInt64) { + ASSERT_EQ("9223372036854775807", + ConvertToJson(arrow::UInt64Scalar(9223372036854775807LL))); + ASSERT_EQ("18446744073709551615", + ConvertToJson(arrow::UInt64Scalar(18446744073709551615ULL))); +} + +TEST(ConvertToJson, Float) { + ASSERT_EQ("1.5", ConvertToJson(arrow::FloatScalar(1.5))); + ASSERT_EQ("-1.5", ConvertToJson(arrow::FloatScalar(-1.5))); +} + +TEST(ConvertToJson, Double) { + ASSERT_EQ("1.5", ConvertToJson(arrow::DoubleScalar(1.5))); + ASSERT_EQ("-1.5", ConvertToJson(arrow::DoubleScalar(-1.5))); +} + +TEST(ConvertToJson, Boolean) { + ASSERT_EQ("true", ConvertToJson(arrow::BooleanScalar(true))); + ASSERT_EQ("false", ConvertToJson(arrow::BooleanScalar(false))); +} + +TEST(ConvertToJson, Null) { ASSERT_EQ("null", ConvertToJson(arrow::NullScalar())); } + +TEST(ConvertToJson, Date32) { + ASSERT_EQ("\"1969-12-31\"", ConvertToJson(arrow::Date32Scalar(-1))); + ASSERT_EQ("\"1970-01-01\"", 
ConvertToJson(arrow::Date32Scalar(0))); + ASSERT_EQ("\"2022-01-01\"", ConvertToJson(arrow::Date32Scalar(18993))); +} + +TEST(ConvertToJson, Date64) { + ASSERT_EQ("\"1969-12-31\"", ConvertToJson(arrow::Date64Scalar(-86400000))); + ASSERT_EQ("\"1970-01-01\"", ConvertToJson(arrow::Date64Scalar(0))); + ASSERT_EQ("\"2022-01-01\"", ConvertToJson(arrow::Date64Scalar(1640995200000))); +} + +TEST(ConvertToJson, Time32) { + ASSERT_EQ("\"00:00:00\"", ConvertToJson(arrow::Time32Scalar(0, TimeUnit::SECOND))); + ASSERT_EQ("\"01:02:03\"", ConvertToJson(arrow::Time32Scalar(3723, TimeUnit::SECOND))); + ASSERT_EQ("\"00:00:00.123\"", ConvertToJson(arrow::Time32Scalar(123, TimeUnit::MILLI))); +} + +TEST(ConvertToJson, Time64) { + ASSERT_EQ("\"00:00:00.123456\"", + ConvertToJson(arrow::Time64Scalar(123456, TimeUnit::MICRO))); + ASSERT_EQ("\"00:00:00.123456789\"", + ConvertToJson(arrow::Time64Scalar(123456789, TimeUnit::NANO))); +} + +TEST(ConvertToJson, Timestamp) { + ASSERT_EQ("\"1969-12-31 00:00:00.000\"", + ConvertToJson(arrow::TimestampScalar(-86400000, TimeUnit::MILLI))); + ASSERT_EQ("\"1970-01-01 00:00:00.000\"", + ConvertToJson(arrow::TimestampScalar(0, TimeUnit::MILLI))); + ASSERT_EQ("\"2022-01-01 00:00:00.000\"", + ConvertToJson(arrow::TimestampScalar(1640995200000, TimeUnit::MILLI))); + ASSERT_EQ("\"2022-01-01 00:00:01.234\"", + ConvertToJson(arrow::TimestampScalar(1640995201234, TimeUnit::MILLI))); +} + +TEST(ConvertToJson, DayTimeInterval) { + ASSERT_EQ("\"123d0ms\"", ConvertToJson(arrow::DayTimeIntervalScalar({123, 0}))); + ASSERT_EQ("\"1d234ms\"", ConvertToJson(arrow::DayTimeIntervalScalar({1, 234}))); +} + +TEST(ConvertToJson, MonthDayNanoInterval) { + ASSERT_EQ("\"12M34d56ns\"", + ConvertToJson(arrow::MonthDayNanoIntervalScalar({12, 34, 56}))); +} + +TEST(ConvertToJson, MonthInterval) { + ASSERT_EQ("\"1M\"", ConvertToJson(arrow::MonthIntervalScalar(1))); +} + +TEST(ConvertToJson, Duration) { + // TODO: Append TimeUnit on conversion + ASSERT_EQ("\"123\"", 
ConvertToJson(arrow::DurationScalar(123, TimeUnit::SECOND))); + ASSERT_EQ("\"123\"", ConvertToJson(arrow::DurationScalar(123, TimeUnit::MILLI))); + ASSERT_EQ("\"123\"", ConvertToJson(arrow::DurationScalar(123, TimeUnit::MICRO))); + ASSERT_EQ("\"123\"", ConvertToJson(arrow::DurationScalar(123, TimeUnit::NANO))); +} + +TEST(ConvertToJson, Lists) { + std::vector values = {"ABC", "DEF", "XYZ"}; + std::shared_ptr array; + arrow::ArrayFromVector(values, &array); + + const char* expected_string = R"(["ABC","DEF","XYZ"])"; + ASSERT_EQ(expected_string, ConvertToJson(arrow::ListScalar{array})); + ASSERT_EQ(expected_string, ConvertToJson(arrow::FixedSizeListScalar{array})); + ASSERT_EQ(expected_string, ConvertToJson(arrow::LargeListScalar{array})); + + arrow::StringBuilder builder; + ASSERT_OK(builder.AppendNull()); + ASSERT_EQ("[null]", ConvertToJson(arrow::ListScalar{builder.Finish().ValueOrDie()})); + ASSERT_EQ("[]", ConvertToJson( + arrow::ListScalar{arrow::StringBuilder().Finish().ValueOrDie()})); +} + +TEST(ConvertToJson, Struct) { + auto i32 = arrow::MakeScalar(1); + auto f64 = arrow::MakeScalar(2.5); + auto str = arrow::MakeScalar("yo"); + ASSERT_OK_AND_ASSIGN( + auto scalar, + arrow::StructScalar::Make({i32, f64, str, + arrow::MakeNullScalar(std::shared_ptr( + new arrow::Date32Type()))}, + {"i", "f", "s", "null"})); + ASSERT_EQ("{\"i\":1,\"f\":2.5,\"s\":\"yo\",\"null\":null}", ConvertToJson(*scalar)); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/main.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/main.cc new file mode 100644 index 00000000000..b8b9f9e91f1 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/main.cc @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/flight_sql_driver.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_result_set_metadata.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement.h" + +#include +#include +#include "arrow/flight/api.h" +#include "arrow/flight/sql/api.h" +#include "arrow/util/logging.h" + +using arrow::Status; +using arrow::flight::FlightClient; +using arrow::flight::Location; +using arrow::flight::sql::FlightSqlClient; + +using driver::flight_sql::FlightSqlConnection; +using driver::flight_sql::FlightSqlDriver; +using driver::odbcabstraction::Connection; +using driver::odbcabstraction::ResultSet; +using driver::odbcabstraction::ResultSetMetadata; +using driver::odbcabstraction::Statement; + +void TestBindColumn(const std::shared_ptr& connection) { + const std::shared_ptr& statement = connection->CreateStatement(); + statement->Execute("SELECT IncidntNum, Category FROM \"@dremio\".Test LIMIT 10"); + + const std::shared_ptr& result_set = statement->GetResultSet(); + + const int batch_size = 100; + const int max_str_len = 
1000; + + char incidnt_num[batch_size][max_str_len]; + ssize_t incidnt_num_length[batch_size]; + + char category[batch_size][max_str_len]; + ssize_t category_length[batch_size]; + + result_set->BindColumn(1, driver::odbcabstraction::CDataType_CHAR, 0, 0, incidnt_num, + max_str_len, incidnt_num_length); + result_set->BindColumn(2, driver::odbcabstraction::CDataType_CHAR, 0, 0, category, + max_str_len, category_length); + + size_t total = 0; + while (true) { + size_t fetched_rows = result_set->Move(batch_size, 0, 0, nullptr); + std::cout << "Fetched " << fetched_rows << " rows." << std::endl; + + total += fetched_rows; + std::cout << "Total:" << total << std::endl; + + for (int i = 0; i < fetched_rows; ++i) { + ARROW_LOG(DEBUG) << "Row[" << i << "] incidnt_num: '" << incidnt_num[i] + << "', Category: '" << category[i] << "'"; + } + + if (fetched_rows < batch_size) break; + } +} + +void TestGetData(const std::shared_ptr& connection) { + const std::shared_ptr& statement = connection->CreateStatement(); + statement->Execute( + "SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL " + "SELECT 5 UNION ALL SELECT 6"); + + const std::shared_ptr& result_set = statement->GetResultSet(); + const std::shared_ptr& metadata = result_set->GetMetadata(); + + while (result_set->Move(1, 0, 0, nullptr) == 1) { + char result[128]; + ssize_t result_length; + result_set->GetData(1, driver::odbcabstraction::CDataType_CHAR, 0, 0, &result, + sizeof(result), &result_length); + std::cout << result << std::endl; + } +} + +void TestBindColumnBigInt(const std::shared_ptr& connection) { + const std::shared_ptr& statement = connection->CreateStatement(); + statement->Execute( + "SELECT IncidntNum, CAST(\"IncidntNum\" AS DOUBLE) / 100 AS " + "double_field, Category\n" + "FROM (\n" + " SELECT CONVERT_TO_INTEGER(IncidntNum, 1, 1, 0) AS IncidntNum, " + "Category\n" + " FROM (\n" + " SELECT IncidntNum, Category FROM \"@dremio\".Test LIMIT 10\n" + " ) nested_0\n" + ") nested_0"); 
+ + const std::shared_ptr& result_set = statement->GetResultSet(); + + const int batch_size = 100; + const int max_strlen = 1000; + + char incidnt_num[batch_size][max_strlen]; + ssize_t incidnt_num_length[batch_size]; + + double double_field[batch_size]; + ssize_t double_field_length[batch_size]; + + char category[batch_size][max_strlen]; + ssize_t category_length[batch_size]; + + result_set->BindColumn(1, driver::odbcabstraction::CDataType_CHAR, 0, 0, incidnt_num, + max_strlen, incidnt_num_length); + result_set->BindColumn(2, driver::odbcabstraction::CDataType_DOUBLE, 0, 0, double_field, + max_strlen, double_field_length); + result_set->BindColumn(3, driver::odbcabstraction::CDataType_CHAR, 0, 0, category, + max_strlen, category_length); + + size_t total = 0; + while (true) { + size_t fetched_rows = result_set->Move(batch_size, 0, 0, nullptr); + ARROW_LOG(DEBUG) << "Fetched " << fetched_rows << " rows."; + + total += fetched_rows; + ARROW_LOG(DEBUG) << "Total:" << total; + + for (int i = 0; i < fetched_rows; ++i) { + ARROW_LOG(DEBUG) << "Row[" << i << "] incidnt_num: '" << incidnt_num[i] << "', " + << "double_field: '" << double_field[i] << "', " + << "category: '" << category[i] << "'"; + } + + if (fetched_rows < batch_size) break; + } +} + +void TestGetTablesV2(const std::shared_ptr& connection) { + const std::shared_ptr& statement = connection->CreateStatement(); + const std::shared_ptr& result_set = + statement->GetTables_V2(nullptr, nullptr, nullptr, nullptr); + + const std::shared_ptr& metadata = result_set->GetMetadata(); + size_t column_count = metadata->GetColumnCount(); + + while (result_set->Move(1, 0, 0, nullptr) == 1) { + int buffer_length = 1024; + std::vector result(buffer_length); + ssize_t result_length; + result_set->GetData(1, driver::odbcabstraction::CDataType_CHAR, 0, 0, result.data(), + buffer_length, &result_length); + std::cout << result.data() << std::endl; + } + + std::cout << column_count << std::endl; +} + +void TestGetColumnsV3(const 
std::shared_ptr& connection) { + const std::shared_ptr& statement = connection->CreateStatement(); + std::string table_name = "test_numeric"; + std::string column_name = "%"; + const std::shared_ptr& result_set = + statement->GetColumns_V3(nullptr, nullptr, &table_name, &column_name); + + const std::shared_ptr& metadata = result_set->GetMetadata(); + size_t column_count = metadata->GetColumnCount(); + + int buffer_length = 1024; + std::vector result(buffer_length); + ssize_t result_length; + + while (result_set->Move(1, 0, 0, nullptr) == 1) { + for (int i = 0; i < column_count; ++i) { + result_set->GetData(1 + i, driver::odbcabstraction::CDataType_CHAR, 0, 0, + result.data(), buffer_length, &result_length); + std::cout << (result_length != -1 ? result.data() : "NULL") << '\t'; + } + + std::cout << std::endl; + } + + std::cout << column_count << std::endl; +} + +int main() { + FlightSqlDriver driver; + + const std::shared_ptr& connection = + driver.CreateConnection(driver::odbcabstraction::V_3); + + Connection::ConnPropertyMap properties = { + {FlightSqlConnection::HOST, std::string("automaster.drem.io")}, + {FlightSqlConnection::PORT, std::string("32010")}, + {FlightSqlConnection::USER, std::string("dremio")}, + {FlightSqlConnection::PASSWORD, std::string("dremio123")}, + {FlightSqlConnection::USE_ENCRYPTION, std::string("false")}, + }; + std::vector missing_attr; + connection->Connect(properties, missing_attr); + + // TestBindColumnBigInt(connection); + // TestBindColumn(connection); + TestGetData(connection); + // TestGetTablesV2(connection); + // TestGetColumnsV3(connection); + + connection->Close(); + return 0; +} diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/parse_table_types_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/parse_table_types_test.cc new file mode 100644 index 00000000000..9bfcabb0bbd --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/parse_table_types_test.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation 
(ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_statement_get_tables.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { + +void AssertParseTest(const std::string& input_string, + const std::vector& assert_vector) { + std::vector table_types; + + ParseTableTypes(input_string, table_types); + ASSERT_EQ(table_types, assert_vector); +} + +TEST(TableTypeParser, ParsingWithoutSingleQuotesWithLeadingWhiteSpace) { + AssertParseTest("TABLE, VIEW", {"TABLE", "VIEW"}); +} + +TEST(TableTypeParser, ParsingWithoutSingleQuotesWithoutLeadingWhiteSpace) { + AssertParseTest("TABLE,VIEW", {"TABLE", "VIEW"}); +} + +TEST(TableTypeParser, ParsingWithSingleQuotesWithLeadingWhiteSpace) { + AssertParseTest("'TABLE', 'VIEW'", {"TABLE", "VIEW"}); +} + +TEST(TableTypeParser, ParsingWithSingleQuotesWithoutLeadingWhiteSpace) { + AssertParseTest("'TABLE','VIEW'", {"TABLE", "VIEW"}); +} + +TEST(TableTypeParser, ParsingWithCommaInsideSingleQuotes) { + AssertParseTest("'TABLE, TEST', 'VIEW, TEMPORARY'", {"TABLE, TEST", "VIEW, TEMPORARY"}); +} +} // namespace flight_sql +} // namespace driver diff --git 
a/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer.cc new file mode 100644 index 00000000000..2b5724c5d16 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer.cc @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include +#include "arrow/array/util.h" +#include "arrow/builder.h" +#include "arrow/flight/sql/odbc/flight_sql/utils.h" + +#include "arrow/array/array_base.h" + +namespace driver { +namespace flight_sql { + +using arrow::ArrayBuilder; +using arrow::MemoryPool; +using arrow::Result; + +namespace { +Result> MakeEmptyArray(std::shared_ptr type, + MemoryPool* memory_pool, + int64_t array_size) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder)); + RETURN_NOT_OK(builder->AppendNulls(array_size)); + return builder->Finish(); +} + +/// A transformer class which is responsible to convert the name of fields +/// inside a RecordBatch. 
These fields are changed based on tasks created by the +/// methods RenameField() and AddFieldOfNulls(). The execution of the tasks is +/// handled by the method transformer. +class RecordBatchTransformerWithTasks : public RecordBatchTransformer { + private: + std::vector> fields_; + std::vector( + const std::shared_ptr& original_record_batch, + const std::shared_ptr& transformed_schema)>> + tasks_; + + public: + RecordBatchTransformerWithTasks( + std::vector> fields, + std::vector( + const std::shared_ptr& original_record_batch, + const std::shared_ptr& transformed_schema)>> + tasks) { + this->fields_.swap(fields); + this->tasks_.swap(tasks); + } + + std::shared_ptr Transform( + const std::shared_ptr& original) override { + auto new_schema = schema(fields_); + + std::vector> arrays; + arrays.reserve(new_schema->num_fields()); + + for (const auto& item : tasks_) { + arrays.emplace_back(item(original, new_schema)); + } + + auto transformed_batch = RecordBatch::Make(new_schema, original->num_rows(), arrays); + return transformed_batch; + } + + std::shared_ptr GetTransformedSchema() override { return schema(fields_); } +}; +} // namespace + +RecordBatchTransformerWithTasksBuilder& +RecordBatchTransformerWithTasksBuilder::RenameField(const std::string& original_name, + const std::string& transformed_name) { + auto rename_task = [=](const std::shared_ptr& original_record, + const std::shared_ptr& transformed_schema) { + auto original_data_type = original_record->schema()->GetFieldByName(original_name); + auto transformed_data_type = transformed_schema->GetFieldByName(transformed_name); + + if (original_data_type->type() != transformed_data_type->type()) { + throw odbcabstraction::DriverException( + "Original data and target data has different types"); + } + + return original_record->GetColumnByName(original_name); + }; + + task_collection_.emplace_back(rename_task); + + auto original_fields = schema_->GetFieldByName(original_name); + + if 
(original_fields->HasMetadata()) { + new_fields_.push_back( + field(transformed_name, original_fields->type(), original_fields->metadata())); + } else { + new_fields_.push_back(field(transformed_name, original_fields->type(), + std::shared_ptr())); + } + + return *this; +} + +RecordBatchTransformerWithTasksBuilder& +RecordBatchTransformerWithTasksBuilder::AddFieldOfNulls( + const std::string& field_name, const std::shared_ptr& data_type) { + auto empty_fields_task = [=](const std::shared_ptr& original_record, + const std::shared_ptr& transformed_schema) { + auto result = MakeEmptyArray(data_type, nullptr, original_record->num_rows()); + ThrowIfNotOK(result.status()); + + return result.ValueOrDie(); + }; + + task_collection_.emplace_back(empty_fields_task); + + new_fields_.push_back(field(field_name, data_type)); + + return *this; +} + +std::shared_ptr RecordBatchTransformerWithTasksBuilder::Build() { + std::shared_ptr transformer( + new RecordBatchTransformerWithTasks(this->new_fields_, this->task_collection_)); + + return transformer; +} + +RecordBatchTransformerWithTasksBuilder::RecordBatchTransformerWithTasksBuilder( + std::shared_ptr schema) + : schema_(std::move(schema)) {} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h new file mode 100644 index 00000000000..261b8c1d7c0 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace driver { +namespace flight_sql { + +using arrow::Array; +using arrow::DataType; +using arrow::Field; +using arrow::RecordBatch; +using arrow::Schema; + +typedef std::function( + const std::shared_ptr& original_record_batch, + const std::shared_ptr& transformed_schema)> + TransformTask; + +/// A base class to implement different types of transformer. +class RecordBatchTransformer { + public: + virtual ~RecordBatchTransformer() = default; + + /// Execute the transformation based on predeclared tasks created by + /// RenameField() method and/or AddFieldOfNulls(). + /// \param original The original RecordBatch that will be used as base + /// for the transformation. + /// \return The new transformed RecordBatch. + virtual std::shared_ptr Transform( + const std::shared_ptr& original) = 0; + + /// Use the new list of fields constructed during creation of task + /// to return the new schema. + /// \return the schema from the transformedRecordBatch. + virtual std::shared_ptr GetTransformedSchema() = 0; +}; + +class RecordBatchTransformerWithTasksBuilder { + private: + std::vector> new_fields_; + std::vector task_collection_; + std::shared_ptr schema_; + + public: + /// Based on the original array name and in a target array name it prepares + /// a task that will execute the transformation. + /// \param original_name The original name of the field. + /// \param transformed_name The name after the transformation. 
+ RecordBatchTransformerWithTasksBuilder& RenameField( + const std::string& original_name, const std::string& transformed_name); + + /// Add an empty field to the transformed record batch. + /// \param field_name The name of the empty fields. + /// \param data_type The target data type for the new fields. + RecordBatchTransformerWithTasksBuilder& AddFieldOfNulls( + const std::string& field_name, const std::shared_ptr& data_type); + + /// It creates an object of RecordBatchTransformerWithTasksBuilder + /// \return a RecordBatchTransformerWithTasksBuilder object. + std::shared_ptr Build(); + + /// Instantiate a RecordBatchTransformerWithTasksBuilder object. + /// \param schema The schema from the original RecordBatch. + explicit RecordBatchTransformerWithTasksBuilder(std::shared_ptr schema); +}; + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer_test.cc new file mode 100644 index 00000000000..ebc56ab0867 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/record_batch_transformer_test.cc @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/record_batch_transformer.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/record_batch.h" +#include "arrow/testing/builder.h" +#include "gtest/gtest.h" + +using arrow::Array; +using arrow::Int32Type; +using arrow::RecordBatch; + +using arrow::ArrayFromVector; +namespace { +std::shared_ptr CreateOriginalRecordBatch() { + std::vector values = {1, 2, 3, 4, 5}; + std::shared_ptr array; + + ArrayFromVector(values, &array); + + auto schema = arrow::schema({field("test", arrow::int32(), false)}); + + return RecordBatch::Make(schema, 4, {array}); +} +} // namespace + +namespace driver { +namespace flight_sql { + +TEST(Transformer, TransformerRenameTest) { + // Prepare the Original Record Batch + auto original_record_batch = CreateOriginalRecordBatch(); + auto schema = original_record_batch->schema(); + + // Execute the transformation of the Record Batch + std::string original_name("test"); + std::string transformed_name("test1"); + + auto transformer = RecordBatchTransformerWithTasksBuilder(schema) + .RenameField(original_name, transformed_name) + .Build(); + + auto transformed_record_batch = transformer->Transform(original_record_batch); + + auto transformed_array_ptr = + transformed_record_batch->GetColumnByName(transformed_name); + + auto original_array_ptr = original_record_batch->GetColumnByName(original_name); + + // Assert that the arrays are being the same and we are not creating new + // buffers + ASSERT_EQ(transformed_array_ptr, original_array_ptr); + + // Assert if the schema is not the same + ASSERT_NE(original_record_batch->schema(), transformed_record_batch->schema()); + // Assert if the data is not changed + ASSERT_EQ(original_record_batch->GetColumnByName(original_name), + transformed_record_batch->GetColumnByName(transformed_name)); +} + +TEST(Transformer, TransformerAddEmptyVectorTest) { + // Prepare the Original Record Batch + auto original_record_batch 
= CreateOriginalRecordBatch(); + auto schema = original_record_batch->schema(); + + std::string original_name("test"); + std::string transformed_name("test1"); + auto emptyField = std::string("empty"); + + auto transformer = RecordBatchTransformerWithTasksBuilder(schema) + .RenameField(original_name, transformed_name) + .AddFieldOfNulls(emptyField, arrow::int32()) + .Build(); + + auto transformed_schema = transformer->GetTransformedSchema(); + + ASSERT_EQ(transformed_schema->num_fields(), 2); + ASSERT_EQ(transformed_schema->GetFieldIndex(transformed_name), 0); + ASSERT_EQ(transformed_schema->GetFieldIndex(emptyField), 1); + + auto transformed_record_batch = transformer->Transform(original_record_batch); + + auto transformed_array_ptr = + transformed_record_batch->GetColumnByName(transformed_name); + + auto original_array_ptr = original_record_batch->GetColumnByName(original_name); + + // Assert that the arrays are being the same and we are not creating new + // buffers + ASSERT_EQ(transformed_array_ptr, original_array_ptr); + + // Assert if the schema is not the same + ASSERT_NE(original_record_batch->schema(), transformed_record_batch->schema()); + // Assert if the data is not changed + ASSERT_EQ(original_record_batch->GetColumnByName(original_name), + transformed_record_batch->GetColumnByName(transformed_name)); +} + +TEST(Transformer, TransformerChangingOrderOfArrayTest) { + std::vector first_array_value = {1, 2, 3, 4, 5}; + std::vector second_array_value = {6, 7, 8, 9, 10}; + std::vector third_array_value = {2, 4, 6, 8, 10}; + std::shared_ptr first_array; + std::shared_ptr second_array; + std::shared_ptr third_array; + + ArrayFromVector(first_array_value, &first_array); + ArrayFromVector(second_array_value, &second_array); + ArrayFromVector(third_array_value, &third_array); + + auto schema = arrow::schema({field("first_array", arrow::int32(), false), + field("second_array", arrow::int32(), false), + field("third_array", arrow::int32(), false)}); + + auto 
original_record_batch = + RecordBatch::Make(schema, 5, {first_array, second_array, third_array}); + + auto transformer = RecordBatchTransformerWithTasksBuilder(schema) + .RenameField("third_array", "test3") + .RenameField("second_array", "test2") + .RenameField("first_array", "test1") + .AddFieldOfNulls("empty", arrow::int32()) + .Build(); + + const std::shared_ptr& transformed_record_batch = + transformer->Transform(original_record_batch); + + auto transformed_schema = transformed_record_batch->schema(); + + // Assert to check if the empty fields was added + ASSERT_EQ(transformed_record_batch->num_columns(), 4); + + // Assert to make sure that the elements changed his order. + ASSERT_EQ(transformed_schema->GetFieldIndex("test3"), 0); + ASSERT_EQ(transformed_schema->GetFieldIndex("test2"), 1); + ASSERT_EQ(transformed_schema->GetFieldIndex("test1"), 2); + ASSERT_EQ(transformed_schema->GetFieldIndex("empty"), 3); + + // Assert to make sure that the data didn't change after renaming the arrays + ASSERT_EQ(transformed_record_batch->GetColumnByName("test3"), third_array); + ASSERT_EQ(transformed_record_batch->GetColumnByName("test2"), second_array); + ASSERT_EQ(transformed_record_batch->GetColumnByName("test1"), first_array); +} +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/scalar_function_reporter.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/scalar_function_reporter.cc new file mode 100644 index 00000000000..2619e3f2f23 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/scalar_function_reporter.cc @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/scalar_function_reporter.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include +#include + +namespace driver { +namespace flight_sql { + +// The list of functions that can be converted from string to ODBC bitmasks is +// based on Calcite's SqlJdbcFunctionCall class. + +namespace { +static const std::unordered_map numeric_functions = { + {"ABS", SQL_FN_NUM_ABS}, {"ACOS", SQL_FN_NUM_ACOS}, + {"ASIN", SQL_FN_NUM_ASIN}, {"ATAN", SQL_FN_NUM_ATAN}, + {"ATAN2", SQL_FN_NUM_ATAN2}, {"CEILING", SQL_FN_NUM_CEILING}, + {"COS", SQL_FN_NUM_ACOS}, {"COT", SQL_FN_NUM_COT}, + {"DEGREES", SQL_FN_NUM_DEGREES}, {"EXP", SQL_FN_NUM_EXP}, + {"FLOOR", SQL_FN_NUM_FLOOR}, {"LOG", SQL_FN_NUM_LOG}, + {"LOG10", SQL_FN_NUM_LOG10}, {"MOD", SQL_FN_NUM_MOD}, + {"PI", SQL_FN_NUM_PI}, {"POWER", SQL_FN_NUM_POWER}, + {"RADIANS", SQL_FN_NUM_RADIANS}, {"RAND", SQL_FN_NUM_RAND}, + {"ROUND", SQL_FN_NUM_ROUND}, {"SIGN", SQL_FN_NUM_SIGN}, + {"SIN", SQL_FN_NUM_SIN}, {"SQRT", SQL_FN_NUM_SQRT}, + {"TAN", SQL_FN_NUM_TAN}, {"TRUNCATE", SQL_FN_NUM_TRUNCATE}}; + +static const std::unordered_map system_functions = { + {"DATABASE", SQL_FN_SYS_DBNAME}, + {"IFNULL", SQL_FN_SYS_IFNULL}, + {"USER", SQL_FN_SYS_USERNAME}}; + +static const std::unordered_map datetime_functions = { + {"CURDATE", SQL_FN_TD_CURDATE}, + {"CURTIME", SQL_FN_TD_CURTIME}, + {"DAYNAME", SQL_FN_TD_DAYNAME}, + {"DAYOFMONTH", SQL_FN_TD_DAYOFMONTH}, + {"DAYOFWEEK", SQL_FN_TD_DAYOFWEEK}, + {"DAYOFYEAR", SQL_FN_TD_DAYOFYEAR}, + 
{"HOUR", SQL_FN_TD_HOUR}, + {"MINUTE", SQL_FN_TD_MINUTE}, + {"MONTH", SQL_FN_TD_MONTH}, + {"MONTHNAME", SQL_FN_TD_MONTHNAME}, + {"NOW", SQL_FN_TD_NOW}, + {"QUARTER", SQL_FN_TD_QUARTER}, + {"SECOND", SQL_FN_TD_SECOND}, + {"TIMESTAMPADD", SQL_FN_TD_TIMESTAMPADD}, + {"TIMESTAMPDIFF", SQL_FN_TD_TIMESTAMPDIFF}, + {"WEEK", SQL_FN_TD_WEEK}, + {"YEAR", SQL_FN_TD_YEAR}, + // Additional functions in ODBC but not Calcite: + {"CURRENT_DATE", SQL_FN_TD_CURRENT_DATE}, + {"CURRENT_TIME", SQL_FN_TD_CURRENT_TIME}, + {"CURRENT_TIMESTAMP", SQL_FN_TD_CURRENT_TIMESTAMP}, + {"EXTRACT", SQL_FN_TD_EXTRACT}}; + +static const std::unordered_map string_functions = { + {"ASCII", SQL_FN_STR_ASCII}, + {"CHAR", SQL_FN_STR_CHAR}, + {"CONCAT", SQL_FN_STR_CONCAT}, + {"DIFFERENCE", SQL_FN_STR_DIFFERENCE}, + {"INSERT", SQL_FN_STR_INSERT}, + {"LCASE", SQL_FN_STR_LCASE}, + {"LEFT", SQL_FN_STR_LEFT}, + {"LENGTH", SQL_FN_STR_LENGTH}, + {"LOCATE", SQL_FN_STR_LOCATE}, + {"LTRIM", SQL_FN_STR_LTRIM}, + {"REPEAT", SQL_FN_STR_REPEAT}, + {"REPLACE", SQL_FN_STR_REPLACE}, + {"RIGHT", SQL_FN_STR_RIGHT}, + {"RTRIM", SQL_FN_STR_RTRIM}, + {"SOUNDEX", SQL_FN_STR_SOUNDEX}, + {"SPACE", SQL_FN_STR_SPACE}, + {"SUBSTRING", SQL_FN_STR_SUBSTRING}, + {"UCASE", SQL_FN_STR_UCASE}, + // Additional functions in ODBC but not Calcite: + {"LOCATE_2", SQL_FN_STR_LOCATE_2}, + {"BIT_LENGTH", SQL_FN_STR_BIT_LENGTH}, + {"CHAR_LENGTH", SQL_FN_STR_CHAR_LENGTH}, + {"CHARACTER_LENGTH", SQL_FN_STR_CHARACTER_LENGTH}, + {"OCTET_LENGTH", SQL_FN_STR_OCTET_LENGTH}, + {"POSTION", SQL_FN_STR_POSITION}, + {"SOUNDEX", SQL_FN_STR_SOUNDEX}}; +} // namespace + +void ReportSystemFunction(const std::string& function, uint32_t& current_sys_functions, + uint32_t& current_convert_functions) { + const auto& result = system_functions.find(function); + if (result != system_functions.end()) { + current_sys_functions |= result->second; + } else if (function == "CONVERT") { + // CAST and CONVERT are system functions from FlightSql/Calcite, but are + // CONVERT 
functions in ODBC. Assume that if CONVERT is reported as a system + // function, then CAST and CONVERT are both supported. + current_convert_functions |= SQL_FN_CVT_CONVERT | SQL_FN_CVT_CAST; + } +} + +void ReportNumericFunction(const std::string& function, uint32_t& current_functions) { + const auto& result = numeric_functions.find(function); + if (result != numeric_functions.end()) { + current_functions |= result->second; + } +} + +void ReportStringFunction(const std::string& function, uint32_t& current_functions) { + const auto& result = string_functions.find(function); + if (result != string_functions.end()) { + current_functions |= result->second; + } +} + +void ReportDatetimeFunction(const std::string& function, uint32_t& current_functions) { + const auto& result = datetime_functions.find(function); + if (result != datetime_functions.end()) { + current_functions |= result->second; + } +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/scalar_function_reporter.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/scalar_function_reporter.h new file mode 100644 index 00000000000..5c2ae06cdba --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/scalar_function_reporter.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
#pragma once

#include <cstdint>
#include <string>

namespace driver {
namespace flight_sql {

/// Each reporter ORs the ODBC bitmask for `function` into the matching
/// accumulator and ignores function names it does not recognize.

/// CONVERT is special-cased: it sets SQL_FN_CVT_CONVERT | SQL_FN_CVT_CAST on
/// `current_convert_functions` instead of `current_sys_functions`.
void ReportSystemFunction(const std::string& function, uint32_t& current_sys_functions,
                          uint32_t& current_convert_functions);
void ReportNumericFunction(const std::string& function, uint32_t& current_functions);
void ReportStringFunction(const std::string& function, uint32_t& current_functions);
void ReportDatetimeFunction(const std::string& function, uint32_t& current_functions);

}  // namespace flight_sql
}  // namespace driver
+ +// platform.h includes windows.h, so it needs to be included +// before winuser.h +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/configuration.h" +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/connection_string_parser.h" +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/dsn_configuration_window.h" +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/window.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" + +#include +#include +#include +#include + +using driver::flight_sql::FlightSqlConnection; +using driver::flight_sql::config::Configuration; +using driver::flight_sql::config::ConnectionStringParser; +using driver::flight_sql::config::DsnConfigurationWindow; +using driver::flight_sql::config::Result; +using driver::flight_sql::config::Window; + +BOOL CALLBACK ConfigDriver(HWND hwndParent, WORD fRequest, LPCSTR lpszDriver, + LPCSTR lpszArgs, LPSTR lpszMsg, WORD cbMsgMax, + WORD* pcbMsgOut) { + return false; +} + +bool DisplayConnectionWindow(void* windowParent, Configuration& config) { + HWND hwndParent = (HWND)windowParent; + + if (!hwndParent) return true; + + try { + Window parent(hwndParent); + DsnConfigurationWindow window(&parent, config); + + window.Create(); + + window.Show(); + window.Update(); + + return ProcessMessages(window) == Result::OK; + } catch (driver::odbcabstraction::DriverException& err) { + std::stringstream buf; + buf << "Message: " << err.GetMessageText() << ", Code: " << err.GetNativeError(); + std::string message = buf.str(); + MessageBox(NULL, message.c_str(), "Error!", MB_ICONEXCLAMATION | MB_OK); + + SQLPostInstallerError(err.GetNativeError(), err.GetMessageText().c_str()); + } + + return false; +} + +void PostLastInstallerError() { +#define 
BUFFER_SIZE (1024) + DWORD code; + char msg[BUFFER_SIZE]; + SQLInstallerError(1, &code, msg, BUFFER_SIZE, NULL); + + std::stringstream buf; + buf << "Message: \"" << msg << "\", Code: " << code; + std::string error_msg = buf.str(); + + MessageBox(NULL, error_msg.c_str(), "Error!", MB_ICONEXCLAMATION | MB_OK); + SQLPostInstallerError(code, error_msg.c_str()); +} + +/** + * Unregister specified DSN. + * + * @param dsn DSN name. + * @return True on success and false on fail. + */ +bool UnregisterDsn(const std::string& dsn) { + if (SQLRemoveDSNFromIni(dsn.c_str())) { + return true; + } + + PostLastInstallerError(); + return false; +} + +/** + * Register DSN with specified configuration. + * + * @param config Configuration. + * @param driver Driver. + * @return True on success and false on fail. + */ +bool RegisterDsn(const Configuration& config, LPCSTR driver) { + const std::string& dsn = config.Get(FlightSqlConnection::DSN); + + if (!SQLWriteDSNToIni(dsn.c_str(), driver)) { + PostLastInstallerError(); + return false; + } + + const auto& map = config.GetProperties(); + for (auto it = map.begin(); it != map.end(); ++it) { + const std::string_view& key = it->first; + if (boost::iequals(FlightSqlConnection::DSN, key) || + boost::iequals(FlightSqlConnection::DRIVER, key)) { + continue; + } + + std::string key_str = std::string(key); + if (!SQLWritePrivateProfileString(dsn.c_str(), key_str.c_str(), it->second.c_str(), + "ODBC.INI")) { + PostLastInstallerError(); + return false; + } + } + + return true; +} + +BOOL INSTAPI ConfigDSN(HWND hwndParent, WORD req, LPCSTR driver, LPCSTR attributes) { + Configuration config; + ConnectionStringParser parser(config); + parser.ParseConfigAttributes(attributes); + + switch (req) { + case ODBC_ADD_DSN: { + config.LoadDefaults(); + if (!DisplayConnectionWindow(hwndParent, config) || !RegisterDsn(config, driver)) + return FALSE; + + break; + } + + case ODBC_CONFIG_DSN: { + const std::string& dsn = config.Get(FlightSqlConnection::DSN); + if 
(!SQLValidDSN(dsn.c_str())) return FALSE; + + Configuration loaded(config); + loaded.LoadDsn(dsn); + + if (!DisplayConnectionWindow(hwndParent, loaded) || !UnregisterDsn(dsn.c_str()) || + !RegisterDsn(loaded, driver)) + return FALSE; + + break; + } + + case ODBC_REMOVE_DSN: { + const std::string& dsn = config.Get(FlightSqlConnection::DSN); + if (!SQLValidDSN(dsn.c_str()) || !UnregisterDsn(dsn)) return FALSE; + + break; + } + + default: + return FALSE; + } + + return TRUE; +} diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/system_trust_store.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/system_trust_store.cc new file mode 100644 index 00000000000..67db1fc35be --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/system_trust_store.cc @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/system_trust_store.h" + +#if defined _WIN32 || defined _WIN64 + +namespace driver { +namespace flight_sql { +bool SystemTrustStore::HasNext() { + p_context_ = CertEnumCertificatesInStore(h_store_, p_context_); + + return p_context_ != nullptr; +} + +std::string SystemTrustStore::GetNext() const { + DWORD size = 0; + CryptBinaryToString(p_context_->pbCertEncoded, p_context_->cbCertEncoded, + CRYPT_STRING_BASE64HEADER, nullptr, &size); + + std::string cert; + cert.resize(size); + CryptBinaryToString(p_context_->pbCertEncoded, p_context_->cbCertEncoded, + CRYPT_STRING_BASE64HEADER, &cert[0], &size); + cert.resize(size); + + return cert; +} + +bool SystemTrustStore::SystemHasStore() { return h_store_ != nullptr; } + +SystemTrustStore::SystemTrustStore(const char* store) + : stores_(store), h_store_(CertOpenSystemStore(NULL, store)), p_context_(nullptr) {} + +SystemTrustStore::~SystemTrustStore() { + if (p_context_) { + CertFreeCertificateContext(p_context_); + } + if (h_store_) { + CertCloseStore(h_store_, 0); + } +} +} // namespace flight_sql +} // namespace driver + +#endif diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/system_trust_store.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/system_trust_store.h new file mode 100644 index 00000000000..71175b09709 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/system_trust_store.h @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#if defined _WIN32 || defined _WIN64 + +# include + +# include + +# include +# include + +# include +# include +# include + +namespace driver { +namespace flight_sql { + +/// Load the certificates from the windows system trust store. Part of the logic +/// was based in the drill connector +/// https://github.com/apache/drill/blob/master/contrib/native/client/src/clientlib/wincert.ipp. +class SystemTrustStore { + private: + const char* stores_; + HCERTSTORE h_store_; + PCCERT_CONTEXT p_context_; + + public: + explicit SystemTrustStore(const char* store); + + ~SystemTrustStore(); + + /// Check if there is a certificate inside the system trust store to be extracted + /// \return If there is a valid cert in the store. + bool HasNext(); + + /// Get the next certificate from the store. + /// \return the certificate. + std::string GetNext() const; + + /// Check if the system has the specify store. + /// \return If the specific store exist in the system. 
+ bool SystemHasStore(); +}; +} // namespace flight_sql +} // namespace driver + +#else // Not Windows +namespace driver { +namespace flight_sql { +class SystemTrustStore; +} // namespace flight_sql +} // namespace driver + +#endif diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/add_property_window.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/add_property_window.cc new file mode 100644 index 00000000000..60fcca64f17 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/add_property_window.cc @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "ui/add_property_window.h" + +#include + +#include +#include + +#include + +#include +#include "ui/custom_window.h" +#include "ui/window.h" + +namespace driver { +namespace flight_sql { +namespace config { + +AddPropertyWindow::AddPropertyWindow(Window* parent) + : CustomWindow(parent, "AddProperty", "Add Property"), + width_(300), + height_(120), + accepted_(false), + is_initialized_(false) { + // No-op. +} + +AddPropertyWindow::~AddPropertyWindow() { + // No-op. +} + +void AddPropertyWindow::Create() { + // Finding out parent position. 
+ RECT parent_rect; + GetWindowRect(parent_->GetHandle(), &parent_rect); + + // Positioning window to the center of parent window. + const int pos_x = + parent_rect.left + (parent_rect.right - parent_rect.left - width_) / 2; + const int pos_y = + parent_rect.top + (parent_rect.bottom - parent_rect.top - height_) / 2; + + RECT desired_rect = {pos_x, pos_y, pos_x + width_, pos_y + height_}; + AdjustWindowRect(&desired_rect, WS_BORDER | WS_CAPTION | WS_SYSMENU | WS_THICKFRAME, + FALSE); + + Window::Create(WS_OVERLAPPED | WS_SYSMENU, desired_rect.left, desired_rect.top, + desired_rect.right - desired_rect.left, + desired_rect.bottom - desired_rect.top, 0); + + if (!handle_) { + std::stringstream buf; + buf << "Can not create window, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } +} + +bool AddPropertyWindow::GetProperty(std::string& key, std::string& value) { + if (accepted_) { + key = this->key_; + value = this->value_; + return true; + } + return false; +} + +void AddPropertyWindow::OnCreate() { + int group_pos_y = MARGIN; + int group_size_y = width_ - 2 * MARGIN; + + group_pos_y += INTERVAL + CreateEdits(MARGIN, group_pos_y, group_size_y); + + int cancel_pos_x = width_ - MARGIN - BUTTON_WIDTH; + int ok_pos_x = cancel_pos_x - INTERVAL - BUTTON_WIDTH; + + ok_button_ = CreateButton(ok_pos_x, group_pos_y, BUTTON_WIDTH, BUTTON_HEIGHT, "Ok", + ChildId::OK_BUTTON, BS_DEFPUSHBUTTON); + cancel_button_ = CreateButton(cancel_pos_x, group_pos_y, BUTTON_WIDTH, BUTTON_HEIGHT, + "Cancel", ChildId::CANCEL_BUTTON); + is_initialized_ = true; + CheckEnableOk(); +} + +int AddPropertyWindow::CreateEdits(int pos_x, int pos_y, int size_x) { + enum { LABEL_WIDTH = 30 }; + + const int edit_size_x = size_x - LABEL_WIDTH - INTERVAL; + const int edit_pos_x = pos_x + LABEL_WIDTH + INTERVAL; + + int row_pos = pos_y; + + labels_.push_back( + CreateLabel(pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, "Key:", ChildId::KEY_LABEL)); + key_edit_ = + 
CreateEdit(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, "", ChildId::KEY_EDIT); + + row_pos += INTERVAL + ROW_HEIGHT; + + labels_.push_back(CreateLabel(pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Value:", ChildId::VALUE_LABEL)); + value_edit_ = + CreateEdit(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, "", ChildId::VALUE_EDIT); + + row_pos += INTERVAL + ROW_HEIGHT; + + return row_pos - pos_y; +} + +void AddPropertyWindow::CheckEnableOk() { + if (!is_initialized_) { + return; + } + + ok_button_->SetEnabled(!key_edit_->IsTextEmpty() && !value_edit_->IsTextEmpty()); +} + +bool AddPropertyWindow::OnMessage(UINT msg, WPARAM wparam, LPARAM lparam) { + switch (msg) { + case WM_COMMAND: { + switch (LOWORD(wparam)) { + case ChildId::OK_BUTTON: { + key_edit_->GetText(key_); + value_edit_->GetText(value_); + accepted_ = true; + PostMessage(GetHandle(), WM_CLOSE, 0, 0); + + break; + } + + case IDCANCEL: + case ChildId::CANCEL_BUTTON: { + PostMessage(GetHandle(), WM_CLOSE, 0, 0); + break; + } + + case ChildId::KEY_EDIT: + case ChildId::VALUE_EDIT: { + if (HIWORD(wparam) == EN_CHANGE) { + CheckEnableOk(); + } + break; + } + + default: + return false; + } + + break; + } + + case WM_DESTROY: { + PostQuitMessage(accepted_ ? Result::OK : Result::CANCEL); + + break; + } + + default: + return false; + } + + return true; +} + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/custom_window.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/custom_window.cc new file mode 100644 index 00000000000..438bc535dc4 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/custom_window.cc @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// platform.h includes windows.h, so it needs to be included +// before Windowsx.h and commctrl.h +#include + +#include +#include + +#include +#include + +#include +#include "ui/custom_window.h" + +namespace driver { +namespace flight_sql { +namespace config { + +Result::Type ProcessMessages(Window& window) { + MSG msg; + + while (GetMessage(&msg, NULL, 0, 0) > 0) { + if (!IsDialogMessage(window.GetHandle(), &msg)) { + TranslateMessage(&msg); + + DispatchMessage(&msg); + } + } + + return static_cast(msg.wParam); +} + +LRESULT CALLBACK CustomWindow::WndProc(HWND hwnd, UINT msg, WPARAM wparam, + LPARAM lparam) { + CustomWindow* window = + reinterpret_cast(GetWindowLongPtr(hwnd, GWLP_USERDATA)); + + switch (msg) { + case WM_NCCREATE: { + _ASSERT(lparam != NULL); + + CREATESTRUCT* create_struct = reinterpret_cast(lparam); + + LONG_PTR long_self_ptr = reinterpret_cast(create_struct->lpCreateParams); + + SetWindowLongPtr(hwnd, GWLP_USERDATA, long_self_ptr); + + return DefWindowProc(hwnd, msg, wparam, lparam); + } + + case WM_CREATE: { + _ASSERT(window != NULL); + + window->SetHandle(hwnd); + + window->OnCreate(); + + return 0; + } + + default: + break; + } + + if (window && window->OnMessage(msg, wparam, lparam)) return 0; + + return DefWindowProc(hwnd, msg, wparam, lparam); +} + +CustomWindow::CustomWindow(Window* parent, const char* class_name, const char* title) + : Window(parent, 
class_name, title) { + WNDCLASS wcx; + + wcx.style = CS_HREDRAW | CS_VREDRAW; + wcx.lpfnWndProc = WndProc; + wcx.cbClsExtra = 0; + wcx.cbWndExtra = 0; + wcx.hInstance = GetHInstance(); + wcx.hIcon = NULL; + wcx.hCursor = LoadCursor(NULL, IDC_ARROW); + wcx.hbrBackground = (HBRUSH)COLOR_WINDOW; + wcx.lpszMenuName = NULL; + wcx.lpszClassName = class_name; + + if (!RegisterClass(&wcx)) { + std::stringstream buf; + buf << "Can not register window class, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } +} + +CustomWindow::~CustomWindow() { UnregisterClass(class_name_.c_str(), GetHInstance()); } + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/dsn_configuration_window.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/dsn_configuration_window.cc new file mode 100644 index 00000000000..718b4a1d52b --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/dsn_configuration_window.cc @@ -0,0 +1,609 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/dsn_configuration_window.h" +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/add_property_window.h" + +#define COMMON_TAB 0 +#define ADVANCED_TAB 1 + +namespace { +std::string TestConnection(const driver::flight_sql::config::Configuration& config) { + std::unique_ptr flight_sql_conn( + new driver::flight_sql::FlightSqlConnection(driver::odbcabstraction::V_3)); + + std::vector missing_properties; + flight_sql_conn->Connect(config.GetProperties(), missing_properties); + + // This should have been checked before enabling the Test button. + assert(missing_properties.empty()); + std::string server_name = + boost::get(flight_sql_conn->GetInfo(SQL_SERVER_NAME)); + std::string server_version = + boost::get(flight_sql_conn->GetInfo(SQL_DBMS_VER)); + return "Server Name: " + server_name + "\n" + "Server Version: " + server_version; +} +} // namespace + +namespace driver { +namespace flight_sql { +namespace config { + +DsnConfigurationWindow::DsnConfigurationWindow(Window* parent, + config::Configuration& config) + : CustomWindow(parent, "FlightConfigureDSN", "Configure Apache Arrow Flight SQL"), + width_(480), + height_(375), + config_(config), + accepted_(false), + is_initialized_(false) { + // No-op. +} + +DsnConfigurationWindow::~DsnConfigurationWindow() { + // No-op. +} + +void DsnConfigurationWindow::Create() { + // Finding out parent position. + RECT parent_rect; + GetWindowRect(parent_->GetHandle(), &parent_rect); + + // Positioning window to the center of parent window. 
+ const int pos_x = + parent_rect.left + (parent_rect.right - parent_rect.left - width_) / 2; + const int pos_y = + parent_rect.top + (parent_rect.bottom - parent_rect.top - height_) / 2; + + RECT desired_rect = {pos_x, pos_y, pos_x + width_, pos_y + height_}; + AdjustWindowRect(&desired_rect, WS_BORDER | WS_CAPTION | WS_SYSMENU | WS_THICKFRAME, + FALSE); + + Window::Create(WS_OVERLAPPED | WS_SYSMENU, desired_rect.left, desired_rect.top, + desired_rect.right - desired_rect.left, + desired_rect.bottom - desired_rect.top, 0); + + if (!handle_) { + std::stringstream buf; + buf << "Can not create window, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } +} + +void DsnConfigurationWindow::OnCreate() { + tab_control_ = CreateTabControl(ChildId::TAB_CONTROL); + tab_control_->AddTab("Common", COMMON_TAB); + tab_control_->AddTab("Advanced", ADVANCED_TAB); + + int group_pos_y = 3 * MARGIN; + int group_size_y = width_ - 2 * MARGIN; + + int common_group_pos_y = group_pos_y; + common_group_pos_y += + INTERVAL + CreateConnectionSettingsGroup(MARGIN, common_group_pos_y, group_size_y); + common_group_pos_y += + INTERVAL + CreateAuthSettingsGroup(MARGIN, common_group_pos_y, group_size_y); + + int advanced_group_pos_y = group_pos_y; + advanced_group_pos_y += INTERVAL + CreateEncryptionSettingsGroup( + MARGIN, advanced_group_pos_y, group_size_y); + advanced_group_pos_y += + INTERVAL + CreatePropertiesGroup(MARGIN, advanced_group_pos_y, group_size_y); + + int test_pos_x = MARGIN; + int cancel_pos_x = width_ - MARGIN - BUTTON_WIDTH; + int ok_pos_x = cancel_pos_x - INTERVAL - BUTTON_WIDTH; + + int button_pos_y = std::max(common_group_pos_y, advanced_group_pos_y); + test_button_ = CreateButton(test_pos_x, button_pos_y, BUTTON_WIDTH + 20, BUTTON_HEIGHT, + "Test Connection", ChildId::TEST_CONNECTION_BUTTON); + ok_button_ = CreateButton(ok_pos_x, button_pos_y, BUTTON_WIDTH, BUTTON_HEIGHT, "Ok", + ChildId::OK_BUTTON); + cancel_button_ = 
CreateButton(cancel_pos_x, button_pos_y, BUTTON_WIDTH, BUTTON_HEIGHT, + "Cancel", ChildId::CANCEL_BUTTON); + is_initialized_ = true; + CheckEnableOk(); + SelectTab(COMMON_TAB); +} + +int DsnConfigurationWindow::CreateConnectionSettingsGroup(int pos_x, int pos_y, + int size_x) { + enum { LABEL_WIDTH = 100 }; + + const int label_pos_x = pos_x + INTERVAL; + + const int edit_size_x = size_x - LABEL_WIDTH - 3 * INTERVAL; + const int edit_pos_x = label_pos_x + LABEL_WIDTH + INTERVAL; + + int row_pos = pos_y + 2 * INTERVAL; + + const char* val = config_.Get(FlightSqlConnection::DSN).c_str(); + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Data Source Name:", ChildId::NAME_LABEL)); + name_edit_ = + CreateEdit(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, val, ChildId::NAME_EDIT); + + row_pos += INTERVAL + ROW_HEIGHT; + + val = config_.Get(FlightSqlConnection::HOST).c_str(); + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Host Name:", ChildId::SERVER_LABEL)); + server_edit_ = + CreateEdit(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, val, ChildId::SERVER_EDIT); + + row_pos += INTERVAL + ROW_HEIGHT; + + val = config_.Get(FlightSqlConnection::PORT).c_str(); + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Port:", ChildId::PORT_LABEL)); + port_edit_ = CreateEdit(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, val, + ChildId::PORT_EDIT, ES_NUMBER); + + row_pos += INTERVAL + ROW_HEIGHT; + + connection_settings_group_box_ = + CreateGroupBox(pos_x, pos_y, size_x, row_pos - pos_y, "Connection settings", + ChildId::CONNECTION_SETTINGS_GROUP_BOX); + + return row_pos - pos_y; +} + +int DsnConfigurationWindow::CreateAuthSettingsGroup(int pos_x, int pos_y, int size_x) { + enum { LABEL_WIDTH = 120 }; + + const int label_pos_x = pos_x + INTERVAL; + + const int edit_size_x = size_x - LABEL_WIDTH - 3 * INTERVAL; + const int edit_pos_x = label_pos_x + LABEL_WIDTH + INTERVAL; + + int row_pos = 
pos_y + 2 * INTERVAL; + + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Authentication Type:", ChildId::AUTH_TYPE_LABEL)); + auth_type_combo_box_ = + CreateComboBox(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, + "Authentication Type:", ChildId::AUTH_TYPE_COMBOBOX); + auth_type_combo_box_->AddString("Basic Authentication"); + auth_type_combo_box_->AddString("Token Authentication"); + + row_pos += INTERVAL + ROW_HEIGHT; + + const char* val = config_.Get(FlightSqlConnection::UID).c_str(); + + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "User:", ChildId::USER_LABEL)); + user_edit_ = + CreateEdit(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, val, ChildId::USER_EDIT); + + row_pos += INTERVAL + ROW_HEIGHT; + + val = config_.Get(FlightSqlConnection::PWD).c_str(); + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Password:", ChildId::PASSWORD_LABEL)); + password_edit_ = CreateEdit(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, val, + ChildId::USER_EDIT, ES_PASSWORD); + + row_pos += INTERVAL + ROW_HEIGHT; + + const auto& token = config_.Get(FlightSqlConnection::TOKEN); + val = token.c_str(); + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Authentication Token:", ChildId::AUTH_TOKEN_LABEL)); + auth_token_edit_ = CreateEdit(edit_pos_x, row_pos, edit_size_x, ROW_HEIGHT, val, + ChildId::AUTH_TOKEN_EDIT); + auth_token_edit_->SetEnabled(false); + + // Ensure the right elements are selected. + auth_type_combo_box_->SetSelection(token.empty() ? 
0 : 1); + CheckAuthType(); + + row_pos += INTERVAL + ROW_HEIGHT; + + auth_settings_group_box_ = + CreateGroupBox(pos_x, pos_y, size_x, row_pos - pos_y, "Authentication settings", + ChildId::AUTH_SETTINGS_GROUP_BOX); + + return row_pos - pos_y; +} + +int DsnConfigurationWindow::CreateEncryptionSettingsGroup(int pos_x, int pos_y, + int size_x) { + enum { LABEL_WIDTH = 120 }; + + const int label_pos_x = pos_x + INTERVAL; + + const int edit_size_x = size_x - LABEL_WIDTH - 3 * INTERVAL; + const int edit_pos_x = label_pos_x + LABEL_WIDTH + INTERVAL; + + int row_pos = pos_y + 2 * INTERVAL; + + const char* val = config_.Get(FlightSqlConnection::USE_ENCRYPTION).c_str(); + + const bool enable_encryption = driver::odbcabstraction::AsBool(val).value_or(true); + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Use Encryption:", ChildId::ENABLE_ENCRYPTION_LABEL)); + enable_encryption_check_box_ = + CreateCheckBox(edit_pos_x, row_pos - 2, edit_size_x, ROW_HEIGHT, "", + ChildId::ENABLE_ENCRYPTION_CHECKBOX, enable_encryption); + + row_pos += INTERVAL + ROW_HEIGHT; + + val = config_.Get(FlightSqlConnection::TRUSTED_CERTS).c_str(); + + labels_.push_back(CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, ROW_HEIGHT, + "Certificate:", ChildId::CERTIFICATE_LABEL)); + certificate_edit_ = CreateEdit(edit_pos_x, row_pos, edit_size_x - MARGIN - BUTTON_WIDTH, + ROW_HEIGHT, val, ChildId::CERTIFICATE_EDIT); + certificate_browse_button_ = + CreateButton(edit_pos_x + edit_size_x - BUTTON_WIDTH, row_pos - 2, BUTTON_WIDTH, + BUTTON_HEIGHT, "Browse", ChildId::CERTIFICATE_BROWSE_BUTTON); + + row_pos += INTERVAL + ROW_HEIGHT; + + val = config_.Get(FlightSqlConnection::USE_SYSTEM_TRUST_STORE).c_str(); + + const bool use_system_cert_store = driver::odbcabstraction::AsBool(val).value_or(true); + labels_.push_back( + CreateLabel(label_pos_x, row_pos, LABEL_WIDTH, 2 * ROW_HEIGHT, + "Use System Certificate Store:", ChildId::USE_SYSTEM_CERT_STORE_LABEL)); + 
use_system_cert_store_check_box_ = + CreateCheckBox(edit_pos_x, row_pos - 2, 20, 2 * ROW_HEIGHT, "", + ChildId::USE_SYSTEM_CERT_STORE_CHECKBOX, use_system_cert_store); + + val = config_.Get(FlightSqlConnection::DISABLE_CERTIFICATE_VERIFICATION).c_str(); + + const int right_pos_x = label_pos_x + (size_x - (2 * INTERVAL)) / 2; + const int right_check_pos_x = right_pos_x + (edit_pos_x - label_pos_x); + const bool disable_cert_verification = + driver::odbcabstraction::AsBool(val).value_or(false); + labels_.push_back(CreateLabel( + right_pos_x, row_pos, LABEL_WIDTH, 2 * ROW_HEIGHT, + "Disable Certificate Verification:", ChildId::DISABLE_CERT_VERIFICATION_LABEL)); + disable_cert_verification_check_box_ = CreateCheckBox( + right_check_pos_x, row_pos - 2, 20, 2 * ROW_HEIGHT, "", + ChildId::DISABLE_CERT_VERIFICATION_CHECKBOX, disable_cert_verification); + + row_pos += INTERVAL + static_cast(1.5 * static_cast(ROW_HEIGHT)); + + encryption_settings_group_box_ = + CreateGroupBox(pos_x, pos_y, size_x, row_pos - pos_y, "Encryption settings", + ChildId::AUTH_SETTINGS_GROUP_BOX); + + return row_pos - pos_y; +} + +int DsnConfigurationWindow::CreatePropertiesGroup(int pos_x, int pos_y, int size_x) { + enum { LABEL_WIDTH = 120 }; + + const int label_pos_x = pos_x + INTERVAL; + const int list_size = size_x - 2 * INTERVAL; + const int column_size = list_size / 2; + + int row_pos = pos_y + 2 * INTERVAL; + const int list_height = 5 * ROW_HEIGHT; + + property_list_ = + CreateList(label_pos_x, row_pos, list_size, list_height, ChildId::PROPERTY_LIST); + property_list_->ListAddColumn("Key", 0, column_size); + property_list_->ListAddColumn("Value", 1, column_size); + + const auto keys = config_.GetCustomKeys(); + for (const auto& key : keys) { + property_list_->ListAddItem({std::string(key), config.Get(key)}); + } + + SendMessage(property_list_->GetHandle(), LVM_SETEXTENDEDLISTVIEWSTYLE, + LVS_EX_FULLROWSELECT, LVS_EX_FULLROWSELECT); + + row_pos += INTERVAL + list_height; + + int delete_pos_x 
= width_ - INTERVAL - MARGIN - BUTTON_WIDTH; + int add_pos_x = delete_pos_x - INTERVAL - BUTTON_WIDTH; + add_button_ = CreateButton(add_pos_x, row_pos, BUTTON_WIDTH, BUTTON_HEIGHT, "Add", + ChildId::ADD_BUTTON); + delete_button_ = CreateButton(delete_pos_x, row_pos, BUTTON_WIDTH, BUTTON_HEIGHT, + "Delete", ChildId::DELETE_BUTTON); + + row_pos += INTERVAL + BUTTON_HEIGHT; + + property_group_box_ = + CreateGroupBox(pos_x, pos_y, size_x, row_pos - pos_y, "Advanced properties", + ChildId::PROPERTY_GROUP_BOX); + + return row_pos - pos_y; +} + +void DsnConfigurationWindow::SelectTab(int tab_index) { + if (!is_initialized_) { + return; + } + + connection_settings_group_box_->SetVisible(COMMON_TAB == tab_index); + auth_settings_group_box_->SetVisible(COMMON_TAB == tab_index); + name_edit_->SetVisible(COMMON_TAB == tab_index); + server_edit_->SetVisible(COMMON_TAB == tab_index); + port_edit_->SetVisible(COMMON_TAB == tab_index); + auth_type_combo_box_->SetVisible(COMMON_TAB == tab_index); + user_edit_->SetVisible(COMMON_TAB == tab_index); + password_edit_->SetVisible(COMMON_TAB == tab_index); + auth_token_edit_->SetVisible(COMMON_TAB == tab_index); + for (size_t i = 0; i < 7; ++i) { + labels_[i]->SetVisible(COMMON_TAB == tab_index); + } + + encryption_settings_group_box_->SetVisible(ADVANCED_TAB == tab_index); + enable_encryption_check_box_->SetVisible(ADVANCED_TAB == tab_index); + certificate_edit_->SetVisible(ADVANCED_TAB == tab_index); + certificate_browse_button_->SetVisible(ADVANCED_TAB == tab_index); + use_system_cert_store_check_box_->SetVisible(ADVANCED_TAB == tab_index); + disable_cert_verification_check_box_->SetVisible(ADVANCED_TAB == tab_index); + property_group_box_->SetVisible(ADVANCED_TAB == tab_index); + property_list_->SetVisible(ADVANCED_TAB == tab_index); + add_button_->SetVisible(ADVANCED_TAB == tab_index); + delete_button_->SetVisible(ADVANCED_TAB == tab_index); + for (size_t i = 7; i < labels_.size(); ++i) { + labels_[i]->SetVisible(ADVANCED_TAB == 
tab_index); + } +} + +void DsnConfigurationWindow::CheckEnableOk() { + if (!is_initialized_) { + return; + } + + bool enable_ok = !name_edit_->IsTextEmpty(); + enable_ok = enable_ok && !server_edit_->IsTextEmpty(); + enable_ok = enable_ok && !port_edit_->IsTextEmpty(); + if (auth_token_edit_->IsEnabled()) { + enable_ok = enable_ok && !auth_token_edit_->IsTextEmpty(); + } else { + enable_ok = enable_ok && !user_edit_->IsTextEmpty(); + enable_ok = enable_ok && !password_edit_->IsTextEmpty(); + } + + test_button_->SetEnabled(enable_ok); + ok_button_->SetEnabled(enable_ok); +} + +void DsnConfigurationWindow::SaveParameters(Configuration& target_config) { + target_config.Clear(); + + std::string text; + name_edit_->GetText(text); + target_config.Set(FlightSqlConnection::DSN, text); + server_edit_->GetText(text); + target_config.Set(FlightSqlConnection::HOST, text); + port_edit_->GetText(text); + try { + const int port_int = std::stoi(text); + if (0 > port_int || USHRT_MAX < port_int) { + throw odbcabstraction::DriverException("Invalid port value."); + } + target_config.Set(FlightSqlConnection::PORT, text); + } catch (odbcabstraction::DriverException&) { + throw; + } catch (std::exception&) { + throw odbcabstraction::DriverException("Invalid port value."); + } + + if (0 == auth_type_combo_box_->GetSelection()) { + user_edit_->GetText(text); + target_config.Set(FlightSqlConnection::UID, text); + password_edit_->GetText(text); + target_config.Set(FlightSqlConnection::PWD, text); + } else { + auth_token_edit_->GetText(text); + target_config.Set(FlightSqlConnection::TOKEN, text); + } + + if (enable_encryption_check_box_->IsChecked()) { + target_config.Set(FlightSqlConnection::USE_ENCRYPTION, TRUE_STR); + certificate_edit_->GetText(text); + target_config.Set(FlightSqlConnection::TRUSTED_CERTS, text); + target_config.Set( + FlightSqlConnection::USE_SYSTEM_TRUST_STORE, + use_system_cert_store_check_box_->IsChecked() ? 
TRUE_STR : FALSE_STR); + target_config.Set( + FlightSqlConnection::DISABLE_CERTIFICATE_VERIFICATION, + disable_cert_verification_check_box_->IsChecked() ? TRUE_STR : FALSE_STR); + } else { + target_config.Set(FlightSqlConnection::USE_ENCRYPTION, FALSE_STR); + } + + // Get all the list properties. + const auto properties = property_list_->ListGetAll(); + for (const auto& property : properties) { + target_config.Set(property[0], property[1]); + } +} + +void DsnConfigurationWindow::CheckAuthType() { + const bool is_basic = COMMON_TAB == auth_type_combo_box_->GetSelection(); + user_edit_->SetEnabled(is_basic); + password_edit_->SetEnabled(is_basic); + auth_token_edit_->SetEnabled(!is_basic); +} + +bool DsnConfigurationWindow::OnMessage(UINT msg, WPARAM wparam, LPARAM lparam) { + switch (msg) { + case WM_NOTIFY: { + switch (((LPNMHDR)lparam)->code) { + case TCN_SELCHANGING: { + // Return FALSE to allow the selection to change. + return FALSE; + } + + case TCN_SELCHANGE: { + SelectTab(TabCtrl_GetCurSel(tab_control_->GetHandle())); + break; + } + } + break; + } + + case WM_COMMAND: { + switch (LOWORD(wparam)) { + case ChildId::TEST_CONNECTION_BUTTON: { + try { + Configuration test_config; + SaveParameters(test_config); + std::string test_message = TestConnection(test_config); + + MessageBox(NULL, test_message.c_str(), "Test Connection Success", MB_OK); + } catch (odbcabstraction::DriverException& err) { + MessageBox(NULL, err.GetMessageText().c_str(), "Error!", + MB_ICONEXCLAMATION | MB_OK); + } + + break; + } + case ChildId::OK_BUTTON: { + try { + SaveParameters(config_); + accepted_ = true; + PostMessage(GetHandle(), WM_CLOSE, 0, 0); + } catch (odbcabstraction::DriverException& err) { + MessageBox(NULL, err.GetMessageText().c_str(), "Error!", + MB_ICONEXCLAMATION | MB_OK); + } + + break; + } + + case IDCANCEL: + case ChildId::CANCEL_BUTTON: { + PostMessage(GetHandle(), WM_CLOSE, 0, 0); + break; + } + + case ChildId::AUTH_TOKEN_EDIT: + case ChildId::NAME_EDIT: + case 
ChildId::PASSWORD_EDIT: + case ChildId::PORT_EDIT: + case ChildId::SERVER_EDIT: + case ChildId::USER_EDIT: { + if (HIWORD(wparam) == EN_CHANGE) { + CheckEnableOk(); + } + break; + } + + case ChildId::AUTH_TYPE_COMBOBOX: { + CheckAuthType(); + CheckEnableOk(); + break; + } + + case ChildId::ENABLE_ENCRYPTION_CHECKBOX: { + const bool toggle = !enable_encryption_check_box_->IsChecked(); + enable_encryption_check_box_->SetChecked(toggle); + certificate_edit_->SetEnabled(toggle); + certificate_browse_button_->SetEnabled(toggle); + use_system_cert_store_check_box_->SetEnabled(toggle); + disable_cert_verification_check_box_->SetEnabled(toggle); + break; + } + + case ChildId::CERTIFICATE_BROWSE_BUTTON: { + OPENFILENAME open_file_name; + char file_name[FILENAME_MAX]; + + ZeroMemory(&open_file_name, sizeof(open_file_name)); + open_file_name.lStructSize = sizeof(open_file_name); + open_file_name.hwndOwner = NULL; + open_file_name.lpstrFile = file_name; + open_file_name.lpstrFile[0] = '\0'; + open_file_name.nMaxFile = FILENAME_MAX; + // TODO: What type should this be? 
+ open_file_name.lpstrFilter = "All\0*.*"; + open_file_name.nFilterIndex = 1; + open_file_name.lpstrFileTitle = NULL; + open_file_name.nMaxFileTitle = 0; + open_file_name.lpstrInitialDir = NULL; + open_file_name.Flags = OFN_PATHMUSTEXIST | OFN_FILEMUSTEXIST; + + if (GetOpenFileName(&open_file_name)) { + certificate_edit_->SetText(file_name); + } + break; + } + + case ChildId::USE_SYSTEM_CERT_STORE_CHECKBOX: { + use_system_cert_store_check_box_->SetChecked( + !use_system_cert_store_check_box_->IsChecked()); + break; + } + + case ChildId::DISABLE_CERT_VERIFICATION_CHECKBOX: { + disable_cert_verification_check_box_->SetChecked( + !disable_cert_verification_check_box_->IsChecked()); + break; + } + + case ChildId::DELETE_BUTTON: { + property_list_->ListDeleteSelectedItem(); + break; + } + + case ChildId::ADD_BUTTON: { + AddPropertyWindow add_window(this); + add_window.Create(); + add_window.Show(); + add_window.Update(); + + if (ProcessMessages(add_window) == Result::OK) { + std::string key; + std::string value; + add_window.GetProperty(key, value); + property_list_->ListAddItem({key, value}); + } + break; + } + + default: + return false; + } + + break; + } + + case WM_DESTROY: { + PostQuitMessage(accepted_ ? Result::OK : Result::CANCEL); + + break; + } + + default: + return false; + } + + return true; +} + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/window.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/window.cc new file mode 100644 index 00000000000..534575680ac --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/ui/window.cc @@ -0,0 +1,331 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// platform.h includes windows.h, so it needs to be included +// before Windowsx.h and commctrl.h +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include +#include +#include + +#include +#include +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/ui/window.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" + +namespace driver { +namespace flight_sql { +namespace config { + +HINSTANCE GetHInstance() { + TCHAR sz_file_name[MAX_PATH]; + GetModuleFileName(NULL, sz_file_name, MAX_PATH); + + // TODO: This needs to be the module name. + HINSTANCE h_instance = GetModuleHandle(sz_file_name); + + if (h_instance == NULL) { + std::stringstream buf; + buf << "Can not get hInstance for the module, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } + + return h_instance; +} + +Window::Window(Window* parent, const char* class_name, const char* title) + : class_name_(class_name), + title_(title), + handle_(NULL), + parent_(parent), + created_(false) { + // No-op. +} + +Window::Window(HWND handle) + : class_name_(), title_(), handle_(handle), parent_(0), created_(false) { + // No-op. 
+} + +Window::~Window() { + if (created_) Destroy(); +} + +void Window::Create(DWORD style, int pos_x, int pos_y, int width, int height, int id) { + if (handle_) { + std::stringstream buf; + buf << "Window already created, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } + + handle_ = CreateWindow(class_name_.c_str(), title_.c_str(), style, pos_x, pos_y, width, + height, parent_ ? parent_->GetHandle() : NULL, + reinterpret_cast(static_cast(id)), + GetHInstance(), this); + + if (!handle_) { + std::stringstream buf; + buf << "Can not create window, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } + + created_ = true; + + const HGDIOBJ hf_default = GetStockObject(DEFAULT_GUI_FONT); + SendMessage(GetHandle(), WM_SETFONT, (WPARAM)hf_default, MAKELPARAM(FALSE, 0)); +} + +std::unique_ptr Window::CreateTabControl(int id) { + std::unique_ptr child(new Window(this, WC_TABCONTROL, "")); + + // Get the dimensions of the parent window's client area, and + // create a tab control child window of that size. 
+ RECT rc_client; + GetClientRect(handle_, &rc_client); + + child->Create(WS_CHILD | WS_CLIPSIBLINGS | WS_VISIBLE | WS_TABSTOP, 0, 0, + rc_client.right, 20, id); + + return child; +} + +std::unique_ptr Window::CreateList(int pos_x, int pos_y, int size_x, int size_y, + int id) { + std::unique_ptr child(new Window(this, WC_LISTVIEW, "")); + + child->Create( + WS_CHILD | WS_VISIBLE | WS_BORDER | LVS_REPORT | LVS_EDITLABELS | WS_TABSTOP, pos_x, + pos_y, size_x, size_y, id); + + return child; +} + +std::unique_ptr Window::CreateGroupBox(int pos_x, int pos_y, int size_x, + int size_y, const char* title, int id) { + std::unique_ptr child(new Window(this, "Button", title)); + + child->Create(WS_CHILD | WS_VISIBLE | BS_GROUPBOX, pos_x, pos_y, size_x, size_y, id); + + return child; +} + +std::unique_ptr Window::CreateLabel(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id) { + std::unique_ptr child(new Window(this, "Static", title)); + + child->Create(WS_CHILD | WS_VISIBLE, pos_x, pos_y, size_x, size_y, id); + + return child; +} + +std::unique_ptr Window::CreateEdit(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id, int style) { + std::unique_ptr child(new Window(this, "Edit", title)); + + child->Create(WS_CHILD | WS_VISIBLE | WS_BORDER | ES_AUTOHSCROLL | WS_TABSTOP | style, + pos_x, pos_y, size_x, size_y, id); + + return child; +} + +std::unique_ptr Window::CreateButton(int pos_x, int pos_y, int size_x, int size_y, + const char* title, int id, int style) { + std::unique_ptr child(new Window(this, "Button", title)); + + child->Create(WS_CHILD | WS_VISIBLE | WS_TABSTOP | style, pos_x, pos_y, size_x, size_y, + id); + + return child; +} + +std::unique_ptr Window::CreateCheckBox(int pos_x, int pos_y, int size_x, + int size_y, const char* title, int id, + bool state) { + std::unique_ptr child(new Window(this, "Button", title)); + + child->Create(WS_CHILD | WS_VISIBLE | BS_CHECKBOX | WS_TABSTOP, pos_x, pos_y, size_x, + size_y, id); + 
+ child->SetChecked(state); + + return child; +} + +std::unique_ptr Window::CreateComboBox(int pos_x, int pos_y, int size_x, + int size_y, const char* title, int id) { + std::unique_ptr child(new Window(this, "Combobox", title)); + + child->Create(WS_CHILD | WS_VISIBLE | CBS_DROPDOWNLIST | WS_TABSTOP, pos_x, pos_y, + size_x, size_y, id); + + return child; +} + +void Window::Show() { ShowWindow(handle_, SW_SHOW); } + +void Window::Update() { UpdateWindow(handle_); } + +void Window::Destroy() { + if (handle_) DestroyWindow(handle_); + + handle_ = NULL; +} + +void Window::SetVisible(bool is_visible) { + ShowWindow(handle_, is_visible ? SW_SHOW : SW_HIDE); +} + +bool Window::IsTextEmpty() const { + if (!IsEnabled()) { + return true; + } + int len = GetWindowTextLength(handle_); + + return (len <= 0); +} + +void Window::ListAddColumn(const std::string& name, int index, int width) { + LVCOLUMN lvc; + lvc.mask = LVCF_FMT | LVCF_WIDTH | LVCF_TEXT | LVCF_SUBITEM; + lvc.fmt = LVCFMT_LEFT; + lvc.cx = width; + lvc.pszText = const_cast(name.c_str()); + lvc.iSubItem = index; + + if (ListView_InsertColumn(handle_, index, &lvc) == -1) { + std::stringstream buf; + buf << "Can not add list column, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } +} + +void Window::ListAddItem(const std::vector& items) { + LVITEM lvi = {0}; + lvi.mask = LVIF_TEXT; + lvi.pszText = const_cast(items[0].c_str()); + + int ret = ListView_InsertItem(handle_, &lvi); + if (ret < 0) { + std::stringstream buf; + buf << "Can not add list item, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } + + for (size_t i = 1; i < items.size(); ++i) { + ListView_SetItemText(handle_, ret, static_cast(i), + const_cast(items[i].c_str())); + } +} + +void Window::ListDeleteSelectedItem() { + const int row_index = ListView_GetSelectionMark(handle_); + if (row_index >= 0) { + if (ListView_DeleteItem(handle_, row_index) == -1) { + std::stringstream 
buf; + buf << "Can not delete list item, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } + } +} + +std::vector > Window::ListGetAll() { +#define BUF_LEN 1024 + char buf[BUF_LEN]; + + std::vector > values; + const int num_columns = Header_GetItemCount(ListView_GetHeader(handle_)); + const int num_items = ListView_GetItemCount(handle_); + for (int i = 0; i < num_items; ++i) { + std::vector row; + for (int j = 0; j < num_columns; ++j) { + ListView_GetItemText(handle_, i, j, buf, BUF_LEN); + row.emplace_back(buf); + } + values.push_back(row); + } + + return values; +} + +void Window::AddTab(const std::string& name, int index) { + TCITEM tab_control_item; + tab_control_item.mask = TCIF_TEXT | TCIF_IMAGE; + tab_control_item.iImage = -1; + tab_control_item.pszText = const_cast(name.c_str()); + if (TabCtrl_InsertItem(handle_, index, &tab_control_item) == -1) { + std::stringstream buf; + buf << "Can not add tab, error code: " << GetLastError(); + throw odbcabstraction::DriverException(buf.str()); + } +} + +void Window::GetText(std::string& text) const { + if (!IsEnabled()) { + text.clear(); + + return; + } + + int len = GetWindowTextLength(handle_); + + if (len <= 0) { + text.clear(); + + return; + } + + text.resize(len + 1); + + if (!GetWindowText(handle_, &text[0], len + 1)) text.clear(); + + text.resize(len); + boost::algorithm::trim(text); +} + +void Window::SetText(const std::string& text) const { + SNDMSG(handle_, WM_SETTEXT, 0, reinterpret_cast(text.c_str())); +} + +bool Window::IsChecked() const { + return IsEnabled() && Button_GetCheck(handle_) == BST_CHECKED; +} + +void Window::SetChecked(bool state) { + Button_SetCheck(handle_, state ? 
BST_CHECKED : BST_UNCHECKED); +} + +void Window::AddString(const std::string& str) { + SNDMSG(handle_, CB_ADDSTRING, 0, reinterpret_cast(str.c_str())); +} + +void Window::SetSelection(int idx) { + SNDMSG(handle_, CB_SETCURSEL, static_cast(idx), 0); +} + +int Window::GetSelection() const { + return static_cast(SNDMSG(handle_, CB_GETCURSEL, 0, 0)); +} + +void Window::SetEnabled(bool enabled) { EnableWindow(GetHandle(), enabled); } + +bool Window::IsEnabled() const { return IsWindowEnabled(GetHandle()) != 0; } + +} // namespace config +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/utils.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/utils.cc new file mode 100644 index 00000000000..49ee103fbdc --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/utils.cc @@ -0,0 +1,1125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/flight/sql/odbc/flight_sql/utils.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/calendar_utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/encoding.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h" + +#include "arrow/builder.h" +#include "arrow/compute/api.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" + +#include "arrow/flight/sql/odbc/flight_sql/json_converter.h" + +#include + +#include +#include + +namespace driver { +namespace flight_sql { + +namespace { +bool IsComplexType(arrow::Type::type type_id) { + switch (type_id) { + case arrow::Type::LIST: + case arrow::Type::LARGE_LIST: + case arrow::Type::FIXED_SIZE_LIST: + case arrow::Type::MAP: + case arrow::Type::STRUCT: + return true; + default: + return false; + } +} + +odbcabstraction::SqlDataType GetDefaultSqlCharType(bool use_wide_char) { + return use_wide_char ? odbcabstraction::SqlDataType_WCHAR + : odbcabstraction::SqlDataType_CHAR; +} +odbcabstraction::SqlDataType GetDefaultSqlVarcharType(bool use_wide_char) { + return use_wide_char ? odbcabstraction::SqlDataType_WVARCHAR + : odbcabstraction::SqlDataType_VARCHAR; +} +odbcabstraction::CDataType GetDefaultCCharType(bool use_wide_char) { + return use_wide_char ? odbcabstraction::CDataType_WCHAR + : odbcabstraction::CDataType_CHAR; +} + +} // namespace + +using odbcabstraction::CDataType; +using odbcabstraction::GetSqlWCharSize; +using odbcabstraction::GetTodayTimeFromEpoch; +using odbcabstraction::SqlDataType; + +using std::make_optional; +using std::nullopt; + +/// \brief Returns the mapping from Arrow type to SqlDataType +/// \param field the field to return the SqlDataType for +/// \return the concise SqlDataType for the field. 
+/// \note use GetNonConciseDataType on the output to get the verbose type +/// \note the concise and verbose types are the same for all but types relating to times +/// and intervals +SqlDataType GetDataTypeFromArrowFieldV3(const std::shared_ptr& field, + bool use_wide_char) { + const std::shared_ptr& type = field->type(); + + switch (type->id()) { + case arrow::Type::BOOL: + return odbcabstraction::SqlDataType_BIT; + case arrow::Type::UINT8: + case arrow::Type::INT8: + return odbcabstraction::SqlDataType_TINYINT; + case arrow::Type::UINT16: + case arrow::Type::INT16: + return odbcabstraction::SqlDataType_SMALLINT; + case arrow::Type::UINT32: + case arrow::Type::INT32: + return odbcabstraction::SqlDataType_INTEGER; + case arrow::Type::UINT64: + case arrow::Type::INT64: + return odbcabstraction::SqlDataType_BIGINT; + case arrow::Type::HALF_FLOAT: + case arrow::Type::FLOAT: + return odbcabstraction::SqlDataType_FLOAT; + case arrow::Type::DOUBLE: + return odbcabstraction::SqlDataType_DOUBLE; + case arrow::Type::BINARY: + case arrow::Type::FIXED_SIZE_BINARY: + case arrow::Type::LARGE_BINARY: + return odbcabstraction::SqlDataType_BINARY; + case arrow::Type::STRING: + case arrow::Type::LARGE_STRING: + return GetDefaultSqlVarcharType(use_wide_char); + case arrow::Type::DATE32: + case arrow::Type::DATE64: + return odbcabstraction::SqlDataType_TYPE_DATE; + case arrow::Type::TIMESTAMP: + return odbcabstraction::SqlDataType_TYPE_TIMESTAMP; + case arrow::Type::DECIMAL128: + return odbcabstraction::SqlDataType_DECIMAL; + case arrow::Type::TIME32: + case arrow::Type::TIME64: + return odbcabstraction::SqlDataType_TYPE_TIME; + case arrow::Type::INTERVAL_MONTHS: + return odbcabstraction:: + SqlDataType_INTERVAL_MONTH; // TODO: maybe + // odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH + case arrow::Type::INTERVAL_DAY_TIME: + return odbcabstraction::SqlDataType_INTERVAL_DAY; + + // TODO: Handle remaining types. 
+ case arrow::Type::INTERVAL_MONTH_DAY_NANO: + case arrow::Type::LIST: + case arrow::Type::STRUCT: + case arrow::Type::SPARSE_UNION: + case arrow::Type::DENSE_UNION: + case arrow::Type::DICTIONARY: + case arrow::Type::MAP: + case arrow::Type::EXTENSION: + case arrow::Type::FIXED_SIZE_LIST: + case arrow::Type::DURATION: + case arrow::Type::LARGE_LIST: + case arrow::Type::MAX_ID: + case arrow::Type::NA: + break; + } + + return GetDefaultSqlVarcharType(use_wide_char); +} + +SqlDataType EnsureRightSqlCharType(SqlDataType data_type, bool use_wide_char) { + switch (data_type) { + case odbcabstraction::SqlDataType_CHAR: + case odbcabstraction::SqlDataType_WCHAR: + return GetDefaultSqlCharType(use_wide_char); + case odbcabstraction::SqlDataType_VARCHAR: + case odbcabstraction::SqlDataType_WVARCHAR: + return GetDefaultSqlVarcharType(use_wide_char); + default: + return data_type; + } +} + +int16_t ConvertSqlDataTypeFromV3ToV2(int16_t data_type_v3) { + switch (data_type_v3) { + case odbcabstraction::SqlDataType_TYPE_DATE: + return 9; // Same as SQL_DATE from sqlext.h + case odbcabstraction::SqlDataType_TYPE_TIME: + return 10; // Same as SQL_TIME from sqlext.h + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + return 11; // Same as SQL_TIMESTAMP from sqlext.h + default: + return data_type_v3; + } +} + +CDataType ConvertCDataTypeFromV2ToV3(int16_t data_type_v2) { + switch (data_type_v2) { + case -6: // Same as SQL_C_TINYINT from sqlext.h + return odbcabstraction::CDataType_STINYINT; + case 4: // Same as SQL_C_LONG from sqlext.h + return odbcabstraction::CDataType_SLONG; + case 5: // Same as SQL_C_SHORT from sqlext.h + return odbcabstraction::CDataType_SSHORT; + case 7: // Same as SQL_C_FLOAT from sqlext.h + return odbcabstraction::CDataType_FLOAT; + case 8: // Same as SQL_C_DOUBLE from sqlext.h + return odbcabstraction::CDataType_DOUBLE; + case 9: // Same as SQL_C_DATE from sqlext.h + return odbcabstraction::CDataType_DATE; + case 10: // Same as SQL_C_TIME from sqlext.h + 
return odbcabstraction::CDataType_TIME; + case 11: // Same as SQL_C_TIMESTAMP from sqlext.h + return odbcabstraction::CDataType_TIMESTAMP; + default: + return static_cast(data_type_v2); + } +} + +std::string GetTypeNameFromSqlDataType(int16_t data_type) { + switch (data_type) { + case odbcabstraction::SqlDataType_CHAR: + return "CHAR"; + case odbcabstraction::SqlDataType_VARCHAR: + return "VARCHAR"; + case odbcabstraction::SqlDataType_LONGVARCHAR: + return "LONGVARCHAR"; + case odbcabstraction::SqlDataType_WCHAR: + return "WCHAR"; + case odbcabstraction::SqlDataType_WVARCHAR: + return "WVARCHAR"; + case odbcabstraction::SqlDataType_WLONGVARCHAR: + return "WLONGVARCHAR"; + case odbcabstraction::SqlDataType_DECIMAL: + return "DECIMAL"; + case odbcabstraction::SqlDataType_NUMERIC: + return "NUMERIC"; + case odbcabstraction::SqlDataType_SMALLINT: + return "SMALLINT"; + case odbcabstraction::SqlDataType_INTEGER: + return "INTEGER"; + case odbcabstraction::SqlDataType_REAL: + return "REAL"; + case odbcabstraction::SqlDataType_FLOAT: + return "FLOAT"; + case odbcabstraction::SqlDataType_DOUBLE: + return "DOUBLE"; + case odbcabstraction::SqlDataType_BIT: + return "BIT"; + case odbcabstraction::SqlDataType_TINYINT: + return "TINYINT"; + case odbcabstraction::SqlDataType_BIGINT: + return "BIGINT"; + case odbcabstraction::SqlDataType_BINARY: + return "BINARY"; + case odbcabstraction::SqlDataType_VARBINARY: + return "VARBINARY"; + case odbcabstraction::SqlDataType_LONGVARBINARY: + return "LONGVARBINARY"; + case odbcabstraction::SqlDataType_TYPE_DATE: + case 9: + return "DATE"; + case odbcabstraction::SqlDataType_TYPE_TIME: + case 10: + return "TIME"; + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + case 11: + return "TIMESTAMP"; + case odbcabstraction::SqlDataType_INTERVAL_MONTH: + return "INTERVAL_MONTH"; + case odbcabstraction::SqlDataType_INTERVAL_YEAR: + return "INTERVAL_YEAR"; + case odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH: + return 
"INTERVAL_YEAR_TO_MONTH"; + case odbcabstraction::SqlDataType_INTERVAL_DAY: + return "INTERVAL_DAY"; + case odbcabstraction::SqlDataType_INTERVAL_HOUR: + return "INTERVAL_HOUR"; + case odbcabstraction::SqlDataType_INTERVAL_MINUTE: + return "INTERVAL_MINUTE"; + case odbcabstraction::SqlDataType_INTERVAL_SECOND: + return "INTERVAL_SECOND"; + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_HOUR: + return "INTERVAL_DAY_TO_HOUR"; + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_MINUTE: + return "INTERVAL_DAY_TO_MINUTE"; + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_SECOND: + return "INTERVAL_DAY_TO_SECOND"; + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_MINUTE: + return "INTERVAL_HOUR_TO_MINUTE"; + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_SECOND: + return "INTERVAL_HOUR_TO_SECOND"; + case odbcabstraction::SqlDataType_INTERVAL_MINUTE_TO_SECOND: + return "INTERVAL_MINUTE_TO_SECOND"; + case odbcabstraction::SqlDataType_GUID: + return "GUID"; + } + + throw driver::odbcabstraction::DriverException("Unsupported data type: " + + std::to_string(data_type)); +} + +optional GetRadixFromSqlDataType(odbcabstraction::SqlDataType data_type) { + switch (data_type) { + case odbcabstraction::SqlDataType_DECIMAL: + case odbcabstraction::SqlDataType_NUMERIC: + case odbcabstraction::SqlDataType_SMALLINT: + case odbcabstraction::SqlDataType_TINYINT: + case odbcabstraction::SqlDataType_INTEGER: + case odbcabstraction::SqlDataType_BIGINT: + return 10; + case odbcabstraction::SqlDataType_REAL: + case odbcabstraction::SqlDataType_FLOAT: + case odbcabstraction::SqlDataType_DOUBLE: + return 2; + default: + return std::nullopt; + } +} + +int16_t GetNonConciseDataType(odbcabstraction::SqlDataType data_type) { + switch (data_type) { + case odbcabstraction::SqlDataType_TYPE_DATE: + case odbcabstraction::SqlDataType_TYPE_TIME: + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + return 9; // Same as SQL_DATETIME on sql.h + case odbcabstraction::SqlDataType_INTERVAL_YEAR: + 
case odbcabstraction::SqlDataType_INTERVAL_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_DAY: + case odbcabstraction::SqlDataType_INTERVAL_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE_TO_SECOND: + return 10; // Same as SQL_INTERVAL on sqlext.h + default: + return data_type; + } +} + +optional GetSqlDateTimeSubCode(SqlDataType data_type) { + switch (data_type) { + case odbcabstraction::SqlDataType_TYPE_DATE: + return odbcabstraction::SqlDateTimeSubCode_DATE; + case odbcabstraction::SqlDataType_TYPE_TIME: + return odbcabstraction::SqlDateTimeSubCode_TIME; + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + return odbcabstraction::SqlDateTimeSubCode_TIMESTAMP; + case odbcabstraction::SqlDataType_INTERVAL_YEAR: + return odbcabstraction::SqlDateTimeSubCode_YEAR; + case odbcabstraction::SqlDataType_INTERVAL_MONTH: + return odbcabstraction::SqlDateTimeSubCode_MONTH; + case odbcabstraction::SqlDataType_INTERVAL_DAY: + return odbcabstraction::SqlDateTimeSubCode_DAY; + case odbcabstraction::SqlDataType_INTERVAL_HOUR: + return odbcabstraction::SqlDateTimeSubCode_HOUR; + case odbcabstraction::SqlDataType_INTERVAL_MINUTE: + return odbcabstraction::SqlDateTimeSubCode_MINUTE; + case odbcabstraction::SqlDataType_INTERVAL_SECOND: + return odbcabstraction::SqlDateTimeSubCode_SECOND; + case odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH: + return odbcabstraction::SqlDateTimeSubCode_YEAR_TO_MONTH; + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_HOUR: + return 
odbcabstraction::SqlDateTimeSubCode_DAY_TO_HOUR; + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_MINUTE: + return odbcabstraction::SqlDateTimeSubCode_DAY_TO_MINUTE; + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_SECOND: + return odbcabstraction::SqlDateTimeSubCode_DAY_TO_SECOND; + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_MINUTE: + return odbcabstraction::SqlDateTimeSubCode_HOUR_TO_MINUTE; + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_SECOND: + return odbcabstraction::SqlDateTimeSubCode_HOUR_TO_SECOND; + case odbcabstraction::SqlDataType_INTERVAL_MINUTE_TO_SECOND: + return odbcabstraction::SqlDateTimeSubCode_MINUTE_TO_SECOND; + default: + return std::nullopt; + } +} + +optional GetCharOctetLength(SqlDataType data_type, + const arrow::Result& column_size, + const int32_t decimal_precison) { + switch (data_type) { + case odbcabstraction::SqlDataType_BINARY: + case odbcabstraction::SqlDataType_VARBINARY: + case odbcabstraction::SqlDataType_LONGVARBINARY: + case odbcabstraction::SqlDataType_CHAR: + case odbcabstraction::SqlDataType_VARCHAR: + case odbcabstraction::SqlDataType_LONGVARCHAR: + if (column_size.ok()) { + return column_size.ValueOrDie(); + } else { + return std::nullopt; + } + case odbcabstraction::SqlDataType_WCHAR: + case odbcabstraction::SqlDataType_WVARCHAR: + case odbcabstraction::SqlDataType_WLONGVARCHAR: + if (column_size.ok()) { + return column_size.ValueOrDie() * GetSqlWCharSize(); + } else { + return std::nullopt; + } + case odbcabstraction::SqlDataType_TINYINT: + case odbcabstraction::SqlDataType_BIT: + return 1; // The same as sizeof(SQL_C_BIT) + case odbcabstraction::SqlDataType_SMALLINT: + return 2; // The same as sizeof(SQL_C_SMALLINT) + case odbcabstraction::SqlDataType_INTEGER: + return 4; // The same as sizeof(SQL_C_INTEGER) + case odbcabstraction::SqlDataType_BIGINT: + case odbcabstraction::SqlDataType_FLOAT: + case odbcabstraction::SqlDataType_DOUBLE: + return 8; // The same as sizeof(SQL_C_DOUBLE) + case 
odbcabstraction::SqlDataType_DECIMAL: + case odbcabstraction::SqlDataType_NUMERIC: + return decimal_precison + 2; // One char for each digit and two extra chars for a + // sign and a decimal point + case odbcabstraction::SqlDataType_TYPE_DATE: + case odbcabstraction::SqlDataType_TYPE_TIME: + return 6; // The same as sizeof(SQL_TIME_STRUCT) + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + return 16; // The same as sizeof(SQL_TIMESTAMP_STRUCT) + case odbcabstraction::SqlDataType_INTERVAL_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_YEAR: + case odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_DAY: + case odbcabstraction::SqlDataType_INTERVAL_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE_TO_SECOND: + return 34; // The same as sizeof(SQL_INTERVAL_STRUCT) + case odbcabstraction::SqlDataType_GUID: + return 16; + default: + return std::nullopt; + } +} +optional GetTypeScale(SqlDataType data_type, + const optional& type_scale) { + switch (data_type) { + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + case odbcabstraction::SqlDataType_TYPE_TIME: + return 3; + case odbcabstraction::SqlDataType_DECIMAL: + return type_scale; + case odbcabstraction::SqlDataType_NUMERIC: + return type_scale; + case odbcabstraction::SqlDataType_TINYINT: + case odbcabstraction::SqlDataType_SMALLINT: + case odbcabstraction::SqlDataType_INTEGER: + case odbcabstraction::SqlDataType_BIGINT: + return 0; + default: + return std::nullopt; + } +} +optional GetColumnSize(SqlDataType data_type, + const optional& column_size) { + 
switch (data_type) { + case odbcabstraction::SqlDataType_CHAR: + case odbcabstraction::SqlDataType_VARCHAR: + case odbcabstraction::SqlDataType_LONGVARCHAR: + return column_size; + case odbcabstraction::SqlDataType_WCHAR: + case odbcabstraction::SqlDataType_WVARCHAR: + case odbcabstraction::SqlDataType_WLONGVARCHAR: + return column_size.has_value() + ? std::make_optional(column_size.value() * GetSqlWCharSize()) + : std::nullopt; + case odbcabstraction::SqlDataType_BINARY: + case odbcabstraction::SqlDataType_VARBINARY: + case odbcabstraction::SqlDataType_LONGVARBINARY: + return column_size; + case odbcabstraction::SqlDataType_DECIMAL: + return 19; // The same as sizeof(SQL_NUMERIC_STRUCT) + case odbcabstraction::SqlDataType_NUMERIC: + return 19; // The same as sizeof(SQL_NUMERIC_STRUCT) + case odbcabstraction::SqlDataType_BIT: + case odbcabstraction::SqlDataType_TINYINT: + return 1; + case odbcabstraction::SqlDataType_SMALLINT: + return 2; + case odbcabstraction::SqlDataType_INTEGER: + return 4; + case odbcabstraction::SqlDataType_BIGINT: + return 8; + case odbcabstraction::SqlDataType_REAL: + return 4; + case odbcabstraction::SqlDataType_FLOAT: + case odbcabstraction::SqlDataType_DOUBLE: + return 8; + case odbcabstraction::SqlDataType_TYPE_DATE: + return 10; // The same as sizeof(SQL_DATE_STRUCT) + case odbcabstraction::SqlDataType_TYPE_TIME: + return 12; // The same as sizeof(SQL_TIME_STRUCT) + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + return 23; // The same as sizeof(SQL_TIME_STRUCT) + case odbcabstraction::SqlDataType_INTERVAL_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_YEAR: + case odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_DAY: + case odbcabstraction::SqlDataType_INTERVAL_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_HOUR: + case 
odbcabstraction::SqlDataType_INTERVAL_DAY_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE_TO_SECOND: + return 28; // The same as sizeof(SQL_INTERVAL_STRUCT) + case odbcabstraction::SqlDataType_GUID: + return 16; + default: + return std::nullopt; + } +} + +optional GetBufferLength(SqlDataType data_type, + const optional& column_size) { + switch (data_type) { + case odbcabstraction::SqlDataType_CHAR: + case odbcabstraction::SqlDataType_VARCHAR: + case odbcabstraction::SqlDataType_LONGVARCHAR: + return column_size; + case odbcabstraction::SqlDataType_WCHAR: + case odbcabstraction::SqlDataType_WVARCHAR: + case odbcabstraction::SqlDataType_WLONGVARCHAR: + return column_size.has_value() + ? std::make_optional(column_size.value() * GetSqlWCharSize()) + : std::nullopt; + case odbcabstraction::SqlDataType_BINARY: + case odbcabstraction::SqlDataType_VARBINARY: + case odbcabstraction::SqlDataType_LONGVARBINARY: + return column_size; + case odbcabstraction::SqlDataType_DECIMAL: + case odbcabstraction::SqlDataType_NUMERIC: + return 19; // The same as sizeof(SQL_NUMERIC_STRUCT) + case odbcabstraction::SqlDataType_BIT: + case odbcabstraction::SqlDataType_TINYINT: + return 1; + case odbcabstraction::SqlDataType_SMALLINT: + return 2; + case odbcabstraction::SqlDataType_INTEGER: + return 4; + case odbcabstraction::SqlDataType_BIGINT: + return 8; + case odbcabstraction::SqlDataType_REAL: + return 4; + case odbcabstraction::SqlDataType_FLOAT: + case odbcabstraction::SqlDataType_DOUBLE: + return 8; + case odbcabstraction::SqlDataType_TYPE_DATE: + return 10; // The same as sizeof(SQL_DATE_STRUCT) + case odbcabstraction::SqlDataType_TYPE_TIME: + return 12; // The same as sizeof(SQL_TIME_STRUCT) + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + return 23; // The same as 
sizeof(SQL_TIME_STRUCT) + case odbcabstraction::SqlDataType_INTERVAL_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_YEAR: + case odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_DAY: + case odbcabstraction::SqlDataType_INTERVAL_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE_TO_SECOND: + return 28; // The same as sizeof(SQL_INTERVAL_STRUCT) + case odbcabstraction::SqlDataType_GUID: + return 16; + default: + return std::nullopt; + } +} + +optional GetLength(SqlDataType data_type, const optional& column_size) { + switch (data_type) { + case odbcabstraction::SqlDataType_CHAR: + case odbcabstraction::SqlDataType_VARCHAR: + case odbcabstraction::SqlDataType_LONGVARCHAR: + case odbcabstraction::SqlDataType_WCHAR: + case odbcabstraction::SqlDataType_WVARCHAR: + case odbcabstraction::SqlDataType_WLONGVARCHAR: + case odbcabstraction::SqlDataType_BINARY: + case odbcabstraction::SqlDataType_VARBINARY: + case odbcabstraction::SqlDataType_LONGVARBINARY: + return column_size; + case odbcabstraction::SqlDataType_DECIMAL: + case odbcabstraction::SqlDataType_NUMERIC: + return 19; // The same as sizeof(SQL_NUMERIC_STRUCT) + case odbcabstraction::SqlDataType_BIT: + case odbcabstraction::SqlDataType_TINYINT: + return 1; + case odbcabstraction::SqlDataType_SMALLINT: + return 2; + case odbcabstraction::SqlDataType_INTEGER: + return 4; + case odbcabstraction::SqlDataType_BIGINT: + return 8; + case odbcabstraction::SqlDataType_REAL: + return 4; + case odbcabstraction::SqlDataType_FLOAT: + case 
odbcabstraction::SqlDataType_DOUBLE: + return 8; + case odbcabstraction::SqlDataType_TYPE_DATE: + return 10; // The same as sizeof(SQL_DATE_STRUCT) + case odbcabstraction::SqlDataType_TYPE_TIME: + return 12; // The same as sizeof(SQL_TIME_STRUCT) + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + return 23; // The same as sizeof(SQL_TIME_STRUCT) + case odbcabstraction::SqlDataType_INTERVAL_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_YEAR: + case odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_DAY: + case odbcabstraction::SqlDataType_INTERVAL_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE_TO_SECOND: + return 28; // The same as sizeof(SQL_INTERVAL_STRUCT) + case odbcabstraction::SqlDataType_GUID: + return 16; + default: + return std::nullopt; + } +} + +optional GetDisplaySize(SqlDataType data_type, + const optional& column_size) { + switch (data_type) { + case odbcabstraction::SqlDataType_CHAR: + case odbcabstraction::SqlDataType_VARCHAR: + case odbcabstraction::SqlDataType_LONGVARCHAR: + case odbcabstraction::SqlDataType_WCHAR: + case odbcabstraction::SqlDataType_WVARCHAR: + case odbcabstraction::SqlDataType_WLONGVARCHAR: + return column_size; + case odbcabstraction::SqlDataType_BINARY: + case odbcabstraction::SqlDataType_VARBINARY: + case odbcabstraction::SqlDataType_LONGVARBINARY: + return column_size ? make_optional(*column_size * 2) : nullopt; + case odbcabstraction::SqlDataType_DECIMAL: + case odbcabstraction::SqlDataType_NUMERIC: + return column_size ? 
make_optional(*column_size + 2) : nullopt; + case odbcabstraction::SqlDataType_BIT: + return 1; + case odbcabstraction::SqlDataType_TINYINT: + return 4; + case odbcabstraction::SqlDataType_SMALLINT: + return 6; + case odbcabstraction::SqlDataType_INTEGER: + return 11; + case odbcabstraction::SqlDataType_BIGINT: + return 20; + case odbcabstraction::SqlDataType_REAL: + return 14; + case odbcabstraction::SqlDataType_FLOAT: + case odbcabstraction::SqlDataType_DOUBLE: + return 24; + case odbcabstraction::SqlDataType_TYPE_DATE: + return 10; + case odbcabstraction::SqlDataType_TYPE_TIME: + return 12; // Assuming format "hh:mm:ss.fff" + case odbcabstraction::SqlDataType_TYPE_TIMESTAMP: + return 23; // Assuming format "yyyy-mm-dd hh:mm:ss.fff" + case odbcabstraction::SqlDataType_INTERVAL_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_YEAR: + case odbcabstraction::SqlDataType_INTERVAL_YEAR_TO_MONTH: + case odbcabstraction::SqlDataType_INTERVAL_DAY: + case odbcabstraction::SqlDataType_INTERVAL_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_HOUR: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_DAY_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_MINUTE: + case odbcabstraction::SqlDataType_INTERVAL_HOUR_TO_SECOND: + case odbcabstraction::SqlDataType_INTERVAL_MINUTE_TO_SECOND: + return nullopt; // TODO: Implement for INTERVAL types + case odbcabstraction::SqlDataType_GUID: + return 36; + default: + return nullopt; + } +} + +std::string ConvertSqlPatternToRegexString(const std::string& pattern) { + static const std::string specials = "[]()|^-+*?{}$\\."; + + std::string regex_str; + bool escape = false; + for (const auto& c : pattern) { + if (escape) { + regex_str += c; + escape = false; + continue; + } + + switch (c) { + case '\\': + escape = true; + break; + case '_': + regex_str += '.'; 
+ break; + case '%': + regex_str += ".*"; + break; + default: + if (specials.find(c) != std::string::npos) { + regex_str += '\\'; + } + regex_str += c; + break; + } + } + return regex_str; +} + +boost::xpressive::sregex ConvertSqlPatternToRegex(const std::string& pattern) { + const std::string& regex_str = ConvertSqlPatternToRegexString(pattern); + return boost::xpressive::sregex(boost::xpressive::sregex::compile(regex_str)); +} + +bool NeedArrayConversion(arrow::Type::type original_type_id, + odbcabstraction::CDataType data_type) { + switch (original_type_id) { + case arrow::Type::DATE32: + case arrow::Type::DATE64: + return data_type != odbcabstraction::CDataType_DATE; + case arrow::Type::TIME32: + case arrow::Type::TIME64: + return data_type != odbcabstraction::CDataType_TIME; + case arrow::Type::TIMESTAMP: + return data_type != odbcabstraction::CDataType_TIMESTAMP; + case arrow::Type::STRING: + return data_type != odbcabstraction::CDataType_CHAR && + data_type != odbcabstraction::CDataType_WCHAR; + case arrow::Type::INT16: + return data_type != odbcabstraction::CDataType_SSHORT; + case arrow::Type::UINT16: + return data_type != odbcabstraction::CDataType_USHORT; + case arrow::Type::INT32: + return data_type != odbcabstraction::CDataType_SLONG; + case arrow::Type::UINT32: + return data_type != odbcabstraction::CDataType_ULONG; + case arrow::Type::FLOAT: + return data_type != odbcabstraction::CDataType_FLOAT; + case arrow::Type::DOUBLE: + return data_type != odbcabstraction::CDataType_DOUBLE; + case arrow::Type::BOOL: + return data_type != odbcabstraction::CDataType_BIT; + case arrow::Type::INT8: + return data_type != odbcabstraction::CDataType_STINYINT; + case arrow::Type::UINT8: + return data_type != odbcabstraction::CDataType_UTINYINT; + case arrow::Type::INT64: + return data_type != odbcabstraction::CDataType_SBIGINT; + case arrow::Type::UINT64: + return data_type != odbcabstraction::CDataType_UBIGINT; + case arrow::Type::BINARY: + return data_type != 
odbcabstraction::CDataType_BINARY; + case arrow::Type::DECIMAL128: + return data_type != odbcabstraction::CDataType_NUMERIC; + case arrow::Type::LIST: + case arrow::Type::LARGE_LIST: + case arrow::Type::FIXED_SIZE_LIST: + case arrow::Type::MAP: + case arrow::Type::STRUCT: + return data_type == odbcabstraction::CDataType_CHAR || + data_type == odbcabstraction::CDataType_WCHAR; + default: + throw odbcabstraction::DriverException(std::string("Invalid conversion")); + } +} + +std::shared_ptr GetDefaultDataTypeForTypeId(arrow::Type::type type_id) { + switch (type_id) { + case arrow::Type::STRING: + return arrow::utf8(); + case arrow::Type::INT16: + return arrow::int16(); + case arrow::Type::UINT16: + return arrow::uint16(); + case arrow::Type::INT32: + return arrow::int32(); + case arrow::Type::UINT32: + return arrow::uint32(); + case arrow::Type::FLOAT: + return arrow::float32(); + case arrow::Type::DOUBLE: + return arrow::float64(); + case arrow::Type::BOOL: + return arrow::boolean(); + case arrow::Type::INT8: + return arrow::int8(); + case arrow::Type::UINT8: + return arrow::uint8(); + case arrow::Type::INT64: + return arrow::int64(); + case arrow::Type::UINT64: + return arrow::uint64(); + case arrow::Type::BINARY: + return arrow::binary(); + case arrow::Type::DECIMAL128: + return arrow::decimal128(arrow::Decimal128Type::kMaxPrecision, 0); + case arrow::Type::DATE64: + return arrow::date64(); + case arrow::Type::TIME64: + return arrow::time64(arrow::TimeUnit::MICRO); + case arrow::Type::TIMESTAMP: + return arrow::timestamp(arrow::TimeUnit::SECOND); + } + + throw odbcabstraction::DriverException(std::string("Invalid type id: ") + + std::to_string(type_id)); +} + +arrow::Type::type ConvertCToArrowType(odbcabstraction::CDataType data_type) { + switch (data_type) { + case odbcabstraction::CDataType_CHAR: + case odbcabstraction::CDataType_WCHAR: + return arrow::Type::STRING; + case odbcabstraction::CDataType_SSHORT: + return arrow::Type::INT16; + case 
odbcabstraction::CDataType_USHORT: + return arrow::Type::UINT16; + case odbcabstraction::CDataType_SLONG: + return arrow::Type::INT32; + case odbcabstraction::CDataType_ULONG: + return arrow::Type::UINT32; + case odbcabstraction::CDataType_FLOAT: + return arrow::Type::FLOAT; + case odbcabstraction::CDataType_DOUBLE: + return arrow::Type::DOUBLE; + case odbcabstraction::CDataType_BIT: + return arrow::Type::BOOL; + case odbcabstraction::CDataType_STINYINT: + return arrow::Type::INT8; + case odbcabstraction::CDataType_UTINYINT: + return arrow::Type::UINT8; + case odbcabstraction::CDataType_SBIGINT: + return arrow::Type::INT64; + case odbcabstraction::CDataType_UBIGINT: + return arrow::Type::UINT64; + case odbcabstraction::CDataType_BINARY: + return arrow::Type::BINARY; + case odbcabstraction::CDataType_NUMERIC: + return arrow::Type::DECIMAL128; + case odbcabstraction::CDataType_TIMESTAMP: + return arrow::Type::TIMESTAMP; + case odbcabstraction::CDataType_TIME: + return arrow::Type::TIME64; + case odbcabstraction::CDataType_DATE: + return arrow::Type::DATE64; + default: + throw odbcabstraction::DriverException(std::string("Invalid target type: ") + + std::to_string(data_type)); + } +} + +odbcabstraction::CDataType ConvertArrowTypeToC(arrow::Type::type type_id, + bool use_wide_char) { + switch (type_id) { + case arrow::Type::STRING: + return GetDefaultCCharType(use_wide_char); + case arrow::Type::INT16: + return odbcabstraction::CDataType_SSHORT; + case arrow::Type::UINT16: + return odbcabstraction::CDataType_USHORT; + case arrow::Type::INT32: + return odbcabstraction::CDataType_SLONG; + case arrow::Type::UINT32: + return odbcabstraction::CDataType_ULONG; + case arrow::Type::FLOAT: + return odbcabstraction::CDataType_FLOAT; + case arrow::Type::DOUBLE: + return odbcabstraction::CDataType_DOUBLE; + case arrow::Type::BOOL: + return odbcabstraction::CDataType_BIT; + case arrow::Type::INT8: + return odbcabstraction::CDataType_STINYINT; + case arrow::Type::UINT8: + return 
odbcabstraction::CDataType_UTINYINT; + case arrow::Type::INT64: + return odbcabstraction::CDataType_SBIGINT; + case arrow::Type::UINT64: + return odbcabstraction::CDataType_UBIGINT; + case arrow::Type::BINARY: + return odbcabstraction::CDataType_BINARY; + case arrow::Type::DECIMAL128: + return odbcabstraction::CDataType_NUMERIC; + case arrow::Type::DATE64: + case arrow::Type::DATE32: + return odbcabstraction::CDataType_DATE; + case arrow::Type::TIME64: + case arrow::Type::TIME32: + return odbcabstraction::CDataType_TIME; + case arrow::Type::TIMESTAMP: + return odbcabstraction::CDataType_TIMESTAMP; + default: + throw odbcabstraction::DriverException(std::string("Invalid type id: ") + + std::to_string(type_id)); + } +} + +std::shared_ptr CheckConversion(const arrow::Result& result) { + if (result.ok()) { + const arrow::Datum& datum = result.ValueOrDie(); + return datum.make_array(); + } else { + throw odbcabstraction::DriverException(result.status().message()); + } +} + +ArrayConvertTask GetConverter(arrow::Type::type original_type_id, + odbcabstraction::CDataType target_type) { + // The else statement has a convert the works for the most case of array + // conversion. 
In case, we find conversion that the default one can't handle + // we can include some additional if-else statement with the logic to handle + // it + if (original_type_id == arrow::Type::STRING && + target_type == odbcabstraction::CDataType_TIME) { + return [=](const std::shared_ptr& original_array) { + arrow::compute::StrptimeOptions options("%H:%M", arrow::TimeUnit::MICRO, false); + + auto converted_result = arrow::compute::Strptime({original_array}, options); + auto first_converted_array = CheckConversion(converted_result); + + arrow::compute::CastOptions cast_options; + cast_options.to_type = time64(arrow::TimeUnit::MICRO); + return CheckConversion( + arrow::compute::CallFunction("cast", {first_converted_array}, &cast_options)); + }; + } else if (original_type_id == arrow::Type::TIME32 && + target_type == odbcabstraction::CDataType_TIMESTAMP) { + return [=](const std::shared_ptr& original_array) { + arrow::compute::CastOptions cast_options; + cast_options.to_type = arrow::int32(); + + auto first_converted_array = + CheckConversion(arrow::compute::Cast(original_array, cast_options)); + + cast_options.to_type = arrow::int64(); + + auto second_converted_array = + CheckConversion(arrow::compute::Cast(first_converted_array, cast_options)); + + auto seconds_from_epoch = GetTodayTimeFromEpoch(); + + auto third_converted_array = CheckConversion(arrow::compute::Add( + second_converted_array, + std::make_shared(seconds_from_epoch * 1000))); + + arrow::compute::CastOptions cast_options_2; + cast_options_2.to_type = arrow::timestamp(arrow::TimeUnit::MILLI); + + return CheckConversion(arrow::compute::Cast(third_converted_array, cast_options_2)); + }; + } else if (original_type_id == arrow::Type::TIME64 && + target_type == odbcabstraction::CDataType_TIMESTAMP) { + return [=](const std::shared_ptr& original_array) { + arrow::compute::CastOptions cast_options; + cast_options.to_type = arrow::int64(); + + auto first_converted_array = + 
CheckConversion(arrow::compute::Cast(original_array, cast_options)); + + auto seconds_from_epoch = GetTodayTimeFromEpoch(); + + auto second_converted_array = CheckConversion(arrow::compute::Add( + first_converted_array, + std::make_shared(seconds_from_epoch * 1000000000))); + + arrow::compute::CastOptions cast_options_2; + cast_options_2.to_type = arrow::timestamp(arrow::TimeUnit::NANO); + + return CheckConversion( + arrow::compute::Cast(second_converted_array, cast_options_2)); + }; + } else if (original_type_id == arrow::Type::STRING && + target_type == odbcabstraction::CDataType_DATE) { + return [=](const std::shared_ptr& original_array) { + // The Strptime requires a date format. Using the ISO 8601 format + arrow::compute::StrptimeOptions options("%Y-%m-%d", arrow::TimeUnit::SECOND, false); + + auto converted_result = arrow::compute::Strptime({original_array}, options); + + auto first_converted_array = CheckConversion(converted_result); + arrow::compute::CastOptions cast_options; + cast_options.to_type = arrow::date64(); + return CheckConversion( + arrow::compute::CallFunction("cast", {first_converted_array}, &cast_options)); + }; + } else if (original_type_id == arrow::Type::DECIMAL128 && + (target_type == odbcabstraction::CDataType_CHAR || + target_type == odbcabstraction::CDataType_WCHAR)) { + return [=](const std::shared_ptr& original_array) { + arrow::StringBuilder builder; + int64_t length = original_array->length(); + ThrowIfNotOK(builder.ReserveData(length)); + + for (int64_t i = 0; i < length; ++i) { + if (original_array->IsNull(i)) { + ThrowIfNotOK(builder.AppendNull()); + } else { + auto result = original_array->GetScalar(i); + auto scalar = result.ValueOrDie(); + ThrowIfNotOK(builder.Append(scalar->ToString())); + } + } + + auto finish = builder.Finish(); + + return finish.ValueOrDie(); + }; + } else if (IsComplexType(original_type_id) && + (target_type == odbcabstraction::CDataType_CHAR || + target_type == odbcabstraction::CDataType_WCHAR)) { + 
return [=](const std::shared_ptr& original_array) { + const auto& json_conversion_result = ConvertToJson(original_array); + ThrowIfNotOK(json_conversion_result.status()); + return json_conversion_result.ValueOrDie(); + }; + } else { + // Default converter + return [=](const std::shared_ptr& original_array) { + const arrow::Type::type& target_arrow_type_id = ConvertCToArrowType(target_type); + arrow::compute::CastOptions cast_options; + cast_options.to_type = GetDefaultDataTypeForTypeId(target_arrow_type_id); + + return CheckConversion( + arrow::compute::CallFunction("cast", {original_array}, &cast_options)); + }; + } +} +std::string ConvertToDBMSVer(const std::string& str) { + boost::char_separator separator("."); + boost::tokenizer > tokenizer(str, separator); + std::string result; + // The permitted ODBC format is ##.##.#### + // If any of the first 3 tokens are not numbers or are greater than the permitted + // digits, assume we hit the custom-server-information early and assume the remaining + // version digits are zero. 
+ size_t position = 0; + bool is_showing_custom_data = false; + auto pad_remaining_tokens = [&](size_t pos) -> std::string { + std::string padded_str; + if (pos == 0) { + padded_str += "00"; + } + if (pos <= 1) { + padded_str += ".00"; + } + if (pos <= 2) { + padded_str += ".0000"; + } + return padded_str; + }; + + for (auto token : tokenizer) { + if (token.empty()) { + continue; + } + + if (!is_showing_custom_data && position < 3) { + std::string suffix; + try { + size_t next_pos = 0; + int version = stoi(token, &next_pos); + if (next_pos != token.size()) { + suffix = &token[0]; + } + if (version < 0 || (position < 2 && (version > 99)) || + (position == 2 && version > 9999)) { + is_showing_custom_data = true; + } else { + std::stringstream strstream; + if (position == 2) { + strstream << std::setfill('0') << std::setw(4); + } else { + strstream << std::setfill('0') << std::setw(2); + } + strstream << version; + + if (position != 0) { + result += "."; + } + result += strstream.str(); + if (next_pos != token.size()) { + suffix = &token[next_pos]; + result += pad_remaining_tokens(++position) + suffix; + position = 4; // Prevent additional padding. + is_showing_custom_data = true; + continue; + } + ++position; + continue; + } + } catch (std::logic_error&) { + is_showing_custom_data = true; + } + + result += pad_remaining_tokens(position) + suffix; + ++position; + } + + result += "." 
+ token; + ++position; + } + + result += pad_remaining_tokens(position); + return result; +} + +int32_t GetDecimalTypeScale(const std::shared_ptr& decimal_type) { + auto decimal128_type = std::dynamic_pointer_cast(decimal_type); + return decimal128_type->scale(); +} + +int32_t GetDecimalTypePrecision(const std::shared_ptr& decimal_type) { + auto decimal128_type = std::dynamic_pointer_cast(decimal_type); + return decimal128_type->precision(); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/utils.h b/cpp/src/arrow/flight/sql/odbc/flight_sql/utils.h new file mode 100644 index 00000000000..69e43727ce1 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/utils.h @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace driver { +namespace flight_sql { + +typedef std::function(const std::shared_ptr&)> + ArrayConvertTask; + +using std::optional; + +inline void ThrowIfNotOK(const arrow::Status& status) { + if (!status.ok()) { + throw odbcabstraction::DriverException(status.message()); + } +} + +template +inline bool CheckIfSetToOnlyValidValue(const AttributeTypeT& value, T allowed_value) { + return boost::get(value) == allowed_value; +} + +template +arrow::Status AppendToBuilder(BUILDER& builder, optional opt_value) { + if (opt_value) { + return builder.Append(*opt_value); + } else { + return builder.AppendNull(); + } +} + +template +arrow::Status AppendToBuilder(BUILDER& builder, T value) { + return builder.Append(value); +} + +odbcabstraction::SqlDataType GetDataTypeFromArrowFieldV3( + const std::shared_ptr& field, bool use_wide_char); + +odbcabstraction::SqlDataType EnsureRightSqlCharType( + odbcabstraction::SqlDataType data_type, bool use_wide_char); + +int16_t ConvertSqlDataTypeFromV3ToV2(int16_t data_type_v3); + +odbcabstraction::CDataType ConvertCDataTypeFromV2ToV3(int16_t data_type_v2); + +std::string GetTypeNameFromSqlDataType(int16_t data_type); + +optional GetRadixFromSqlDataType(odbcabstraction::SqlDataType data_type); + +int16_t GetNonConciseDataType(odbcabstraction::SqlDataType data_type); + +optional GetSqlDateTimeSubCode(odbcabstraction::SqlDataType data_type); + +optional GetCharOctetLength(odbcabstraction::SqlDataType data_type, + const arrow::Result& column_size, + const int32_t decimal_precison = 0); + +optional GetBufferLength(odbcabstraction::SqlDataType data_type, + const optional& column_size); + +optional GetLength(odbcabstraction::SqlDataType data_type, + const optional& column_size); + +optional GetTypeScale(odbcabstraction::SqlDataType data_type, + const optional& type_scale); + +optional GetColumnSize(odbcabstraction::SqlDataType data_type, + const 
optional& column_size); + +optional GetDisplaySize(odbcabstraction::SqlDataType data_type, + const optional& column_size); + +std::string ConvertSqlPatternToRegexString(const std::string& pattern); + +boost::xpressive::sregex ConvertSqlPatternToRegex(const std::string& pattern); + +bool NeedArrayConversion(arrow::Type::type original_type_id, + odbcabstraction::CDataType data_type); + +std::shared_ptr GetDefaultDataTypeForTypeId(arrow::Type::type type_id); + +arrow::Type::type ConvertCToArrowType(odbcabstraction::CDataType data_type); + +odbcabstraction::CDataType ConvertArrowTypeToC(arrow::Type::type type_id, + bool use_wide_char); + +std::shared_ptr CheckConversion(const arrow::Result& result); + +ArrayConvertTask GetConverter(arrow::Type::type original_type_id, + odbcabstraction::CDataType target_type); + +std::string ConvertToDBMSVer(const std::string& str); + +int32_t GetDecimalTypeScale(const std::shared_ptr& decimal_type); + +int32_t GetDecimalTypePrecision(const std::shared_ptr& decimal_type); + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/flight_sql/utils_test.cc b/cpp/src/arrow/flight/sql/odbc/flight_sql/utils_test.cc new file mode 100644 index 00000000000..fcfffc49e90 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/flight_sql/utils_test.cc @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/flight_sql/utils.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/calendar_utils.h" + +#include "arrow/compute/initialize.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "gtest/gtest.h" + +namespace driver { +namespace flight_sql { + +// A global test "environment", to ensure Arrow compute kernel functions are registered + +class ComputeKernelEnvironment : public ::testing::Environment { + public: + void SetUp() override { ASSERT_OK(arrow::compute::Initialize()); } +}; + +::testing::Environment* kernel_env = + ::testing::AddGlobalTestEnvironment(new ComputeKernelEnvironment); + +void AssertConvertedArray(const std::shared_ptr& expected_array, + const std::shared_ptr& converted_array, + uint64_t size, arrow::Type::type arrow_type) { + ASSERT_EQ(converted_array->type_id(), arrow_type); + ASSERT_EQ(converted_array->length(), size); + ASSERT_EQ(expected_array->ToString(), converted_array->ToString()); +} + +std::shared_ptr convertArray( + const std::shared_ptr& original_array, + odbcabstraction::CDataType c_type) { + auto converter = GetConverter(original_array->type_id(), c_type); + return converter(original_array); +} + +void TestArrayConversion(const std::vector& input, + const std::shared_ptr& expected_array, + odbcabstraction::CDataType c_type, + arrow::Type::type arrow_type) { + std::shared_ptr original_array; + arrow::ArrayFromVector(input, &original_array); + + auto converted_array = 
convertArray(original_array, c_type); + + AssertConvertedArray(expected_array, converted_array, input.size(), arrow_type); +} + +void TestTime32ArrayConversion(const std::vector& input, + const std::shared_ptr& expected_array, + odbcabstraction::CDataType c_type, + arrow::Type::type arrow_type) { + std::shared_ptr original_array; + arrow::ArrayFromVector(time32(arrow::TimeUnit::MILLI), + input, &original_array); + + auto converted_array = convertArray(original_array, c_type); + + AssertConvertedArray(expected_array, converted_array, input.size(), arrow_type); +} + +void TestTime64ArrayConversion(const std::vector& input, + const std::shared_ptr& expected_array, + odbcabstraction::CDataType c_type, + arrow::Type::type arrow_type) { + std::shared_ptr original_array; + arrow::ArrayFromVector(time64(arrow::TimeUnit::NANO), input, + &original_array); + + auto converted_array = convertArray(original_array, c_type); + + AssertConvertedArray(expected_array, converted_array, input.size(), arrow_type); +} + +TEST(Utils, Time32ToTimeStampArray) { + std::vector input_data = {14896, 17820}; + + const auto seconds_from_epoch = odbcabstraction::GetTodayTimeFromEpoch(); + std::vector expected_data; + expected_data.reserve(2); + + for (const auto& item : input_data) { + expected_data.emplace_back(item + seconds_from_epoch * 1000); + } + + std::shared_ptr expected; + auto timestamp_field = field("timestamp_field", timestamp(arrow::TimeUnit::MILLI)); + arrow::ArrayFromVector(timestamp_field->type(), + expected_data, &expected); + + TestTime32ArrayConversion(input_data, expected, odbcabstraction::CDataType_TIMESTAMP, + arrow::Type::TIMESTAMP); +} + +TEST(Utils, Time64ToTimeStampArray) { + std::vector input_data = {1579489200000, 1646881200000}; + + const auto seconds_from_epoch = odbcabstraction::GetTodayTimeFromEpoch(); + std::vector expected_data; + expected_data.reserve(2); + + for (const auto& item : input_data) { + expected_data.emplace_back(item + seconds_from_epoch * 
1000000000); + } + + std::shared_ptr expected; + auto timestamp_field = field("timestamp_field", timestamp(arrow::TimeUnit::NANO)); + arrow::ArrayFromVector(timestamp_field->type(), + expected_data, &expected); + + TestTime64ArrayConversion(input_data, expected, odbcabstraction::CDataType_TIMESTAMP, + arrow::Type::TIMESTAMP); +} + +TEST(Utils, StringToDateArray) { + std::shared_ptr expected; + arrow::ArrayFromVector({1579489200000, 1646881200000}, + &expected); + + TestArrayConversion({"2020-01-20", "2022-03-10"}, expected, + odbcabstraction::CDataType_DATE, arrow::Type::DATE64); +} + +TEST(Utils, StringToTimeArray) { + std::shared_ptr expected; + arrow::ArrayFromVector( + time64(arrow::TimeUnit::MICRO), {36000000000, 43200000000}, &expected); + + TestArrayConversion({"10:00", "12:00"}, expected, odbcabstraction::CDataType_TIME, + arrow::Type::TIME64); +} + +TEST(Utils, ConvertSqlPatternToRegexString) { + ASSERT_EQ(std::string("XY"), ConvertSqlPatternToRegexString("XY")); + ASSERT_EQ(std::string("X.Y"), ConvertSqlPatternToRegexString("X_Y")); + ASSERT_EQ(std::string("X.*Y"), ConvertSqlPatternToRegexString("X%Y")); + ASSERT_EQ(std::string("X%Y"), ConvertSqlPatternToRegexString("X\\%Y")); + ASSERT_EQ(std::string("X_Y"), ConvertSqlPatternToRegexString("X\\_Y")); +} + +TEST(Utils, ConvertToDBMSVer) { + ASSERT_EQ(std::string("01.02.0003"), ConvertToDBMSVer("1.2.3")); + ASSERT_EQ(std::string("01.02.0003.0"), ConvertToDBMSVer("1.2.3.0")); + ASSERT_EQ(std::string("01.02.0000"), ConvertToDBMSVer("1.2")); + ASSERT_EQ(std::string("01.00.0000"), ConvertToDBMSVer("1")); + ASSERT_EQ(std::string("01.02.0000-foo"), ConvertToDBMSVer("1.2-foo")); + ASSERT_EQ(std::string("01.00.0000-foo"), ConvertToDBMSVer("1-foo")); + ASSERT_EQ(std::string("10.11.0001-foo"), ConvertToDBMSVer("10.11.1-foo")); +} + +} // namespace flight_sql +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbc.def b/cpp/src/arrow/flight/sql/odbc/odbc.def new file mode 100644 index 
00000000000..4881546ebd2 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbc.def @@ -0,0 +1,62 @@ +; Licensed to the Apache Software Foundation (ASF) under one +; or more contributor license agreements. See the NOTICE file +; distributed with this work for additional information +; regarding copyright ownership. The ASF licenses this file +; to you under the Apache License, Version 2.0 (the +; "License"); you may not use this file except in compliance +; with the License. You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, +; software distributed under the License is distributed on an +; "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +; KIND, either express or implied. See the License for the +; specific language governing permissions and limitations +; under the License. + +LIBRARY arrow_flight_sql_odbc +EXPORTS + ; TODO: update to ConfigDSNW when driver has unicode implemented + ConfigDSN + SQLAllocConnect + SQLAllocEnv + SQLAllocHandle + SQLAllocStmt + SQLBindCol + SQLCancel + SQLCloseCursor + SQLColAttributeW + SQLColumnsW + SQLConnectW + SQLDescribeColW + SQLDisconnect + SQLDriverConnectW + SQLExecDirectW + SQLExecute + SQLExtendedFetch + SQLFetch + SQLFetchScroll + SQLForeignKeysW + SQLFreeEnv + SQLFreeConnect + SQLFreeHandle + SQLFreeStmt + SQLGetConnectAttrW + SQLGetData + SQLGetDiagFieldW + SQLGetDiagRecW + SQLGetEnvAttr + SQLGetInfoW + SQLGetStmtAttrW + SQLGetTypeInfoW + SQLRowCount + SQLMoreResults + SQLNativeSqlW + SQLNumResultCols + SQLPrepareW + SQLPrimaryKeysW + SQLSetConnectAttrW + SQLSetEnvAttr + SQLSetStmtAttrW + SQLTablesW diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_api.cc b/cpp/src/arrow/flight/sql/odbc/odbc_api.cc new file mode 100644 index 00000000000..865066796c1 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbc_api.cc @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// flight_sql_connection.h needs to be included first due to conflicts with windows.h +#include "arrow/flight/sql/odbc/flight_sql/flight_sql_connection.h" + +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/config/configuration.h" +#include "arrow/flight/sql/odbc/flight_sql/include/flight_sql/flight_sql_driver.h" +#include "arrow/flight/sql/odbc/odbc_api_internal.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/attribute_utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/encoding_utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_connection.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_descriptor.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_environment.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_statement.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h" +#include "arrow/util/logging.h" + +namespace arrow::flight::sql::odbc { +SQLRETURN 
SQLAllocHandle(SQLSMALLINT type, SQLHANDLE parent, SQLHANDLE* result) { + ARROW_LOG(DEBUG) << "SQLAllocHandle called with type: " << type + << ", parent: " << parent + << ", result: " << static_cast(result); + + return SQL_INVALID_HANDLE; +} + +} // namespace arrow::flight::sql::odbc diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_api_internal.h b/cpp/src/arrow/flight/sql/odbc/odbc_api_internal.h new file mode 100644 index 00000000000..527d833840d --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbc_api_internal.h @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h" + +#include +#include +#include + +// \file odbc_api_internal.h +// +// Define internal ODBC API function headers. 
+namespace arrow::flight::sql::odbc { +SQLRETURN SQLAllocHandle(SQLSMALLINT type, SQLHANDLE parent, SQLHANDLE* result); +} // namespace arrow::flight::sql::odbc diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/CMakeLists.txt b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/CMakeLists.txt new file mode 100644 index 00000000000..c192e814267 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/CMakeLists.txt @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +arrow_install_all_headers("arrow/flight/sql/odbc/odbcabstraction/include") + +add_library(odbcabstraction + include/odbcabstraction/calendar_utils.h + include/odbcabstraction/diagnostics.h + include/odbcabstraction/error_codes.h + include/odbcabstraction/exceptions.h + include/odbcabstraction/platform.h + include/odbcabstraction/types.h + include/odbcabstraction/utils.h + include/odbcabstraction/odbc_impl/attribute_utils.h + include/odbcabstraction/odbc_impl/encoding_utils.h + include/odbcabstraction/odbc_impl/odbc_connection.h + include/odbcabstraction/odbc_impl/odbc_descriptor.h + include/odbcabstraction/odbc_impl/odbc_environment.h + include/odbcabstraction/odbc_impl/odbc_handle.h + include/odbcabstraction/odbc_impl/odbc_statement.h + include/odbcabstraction/odbc_impl/type_utilities.h + include/odbcabstraction/spi/connection.h + include/odbcabstraction/spi/driver.h + include/odbcabstraction/spi/result_set.h + include/odbcabstraction/spi/result_set_metadata.h + include/odbcabstraction/spi/statement.h + calendar_utils.cc + diagnostics.cc + encoding.cc + exceptions.cc + utils.cc + ../../../../vendored/whereami/whereami.cc + odbc_impl/odbc_connection.cc + odbc_impl/odbc_descriptor.cc + odbc_impl/odbc_environment.cc + odbc_impl/odbc_statement.cc) +target_include_directories(odbcabstraction PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include) +target_link_libraries(odbcabstraction PUBLIC ODBC::ODBC Boost::headers) + +set_target_properties(odbcabstraction + PROPERTIES ARCHIVE_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/$/lib + LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/$/lib + RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}/$/lib) diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/calendar_utils.cc b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/calendar_utils.cc new file mode 100644 index 00000000000..7b92a00a254 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/calendar_utils.cc @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under 
one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "odbcabstraction/calendar_utils.h" + +#include +#include +#include +#include + +namespace driver { +namespace odbcabstraction { +int64_t GetTodayTimeFromEpoch() { + tm date{}; + int64_t t = std::time(0); + + GetTimeForSecondsSinceEpoch(t, date); + + date.tm_hour = 0; + date.tm_min = 0; + date.tm_sec = 0; + +#if defined(_WIN32) + return _mkgmtime(&date); +#else + return timegm(&date); +#endif +} + +void GetTimeForSecondsSinceEpoch(const int64_t seconds_since_epoch, std::tm& out_tm) { + std::memset(&out_tm, 0, sizeof(std::tm)); + + std::chrono::time_point timepoint{ + std::chrono::seconds{seconds_since_epoch}}; + auto tm_days = std::chrono::floor(timepoint); + + std::chrono::year_month_day ymd(tm_days); + std::chrono::hh_mm_ss timeofday(timepoint - tm_days); + + out_tm.tm_year = static_cast(ymd.year()) - 1900; + out_tm.tm_mon = static_cast(ymd.month()) - 1; + out_tm.tm_mday = static_cast(ymd.day()); + out_tm.tm_hour = static_cast(timeofday.hours().count()); + out_tm.tm_min = static_cast(timeofday.minutes().count()); + out_tm.tm_sec = static_cast(timeofday.seconds().count()); +} +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/diagnostics.cc 
b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/diagnostics.cc new file mode 100644 index 00000000000..37c95194fa0 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/diagnostics.cc @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include + +namespace { +void RewriteSQLStateForODBC2(std::string& sql_state) { + if (sql_state[0] == 'H' && sql_state[1] == 'Y') { + sql_state[0] = 'S'; + sql_state[1] = '1'; + } +} +} // namespace + +namespace driver { +namespace odbcabstraction { + +Diagnostics::Diagnostics(std::string vendor, std::string data_source_component, + OdbcVersion version) + : vendor_(std::move(vendor)), + data_source_component_(std::move(data_source_component)), + version_(version) {} + +void Diagnostics::SetDataSourceComponent(std::string component) { + data_source_component_ = std::move(component); +} + +std::string Diagnostics::GetDataSourceComponent() const { return data_source_component_; } + +std::string Diagnostics::GetVendor() const { return vendor_; } + +void driver::odbcabstraction::Diagnostics::AddError( + const driver::odbcabstraction::DriverException& exception) { + auto record = std::unique_ptr(new DiagnosticsRecord{ + exception.GetMessageText(), exception.GetSqlState(), exception.GetNativeError()}); + if (version_ == OdbcVersion::V_2) { + RewriteSQLStateForODBC2(record->sql_state); + } + TrackRecord(*record); + owned_records_.push_back(std::move(record)); +} + +void driver::odbcabstraction::Diagnostics::AddWarning(std::string message, + std::string sql_state, + int32_t native_error) { + auto record = std::unique_ptr( + new DiagnosticsRecord{std::move(message), std::move(sql_state), native_error}); + if (version_ == OdbcVersion::V_2) { + RewriteSQLStateForODBC2(record->sql_state); + } + TrackRecord(*record); + owned_records_.push_back(std::move(record)); +} + +std::string driver::odbcabstraction::Diagnostics::GetMessageText( + uint32_t record_index) const { + std::string message; + if (!vendor_.empty()) { + message += std::string("[") + vendor_ + "]"; + } + const DiagnosticsRecord* rec = GetRecordAtIndex(record_index); + return message + "[" + data_source_component_ + "] (" + + std::to_string(rec->native_error) + ") " + rec->msg_text; 
+} + +OdbcVersion Diagnostics::GetOdbcVersion() const { return version_; } + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/encoding.cc b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/encoding.cc new file mode 100644 index 00000000000..95dc920da78 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/encoding.cc @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#if defined(__APPLE__) +# include +# include +# include +#endif + +namespace driver { +namespace odbcabstraction { + +#if defined(__APPLE__) +std::atomic SqlWCharSize{0}; + +namespace { +std::mutex SqlWCharSizeMutex; + +bool IsUsingIODBC() { + // Detects iODBC by looking up by symbol iodbc_version + void* handle = dlsym(RTLD_DEFAULT, "iodbc_version"); + bool using_iodbc = handle != nullptr; + dlclose(handle); + + return using_iodbc; +} +} // namespace + +void ComputeSqlWCharSize() { + std::unique_lock lock(SqlWCharSizeMutex); + if (SqlWCharSize != 0) return; // double-checked locking + + const char* env_p = std::getenv("WCHAR_ENCODING"); + if (env_p) { + if (boost::iequals(env_p, "UTF-16")) { + SqlWCharSize = sizeof(char16_t); + return; + } else if (boost::iequals(env_p, "UTF-32")) { + SqlWCharSize = sizeof(char32_t); + return; + } + } + + SqlWCharSize = IsUsingIODBC() ? sizeof(char32_t) : sizeof(char16_t); +} +#endif + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/exceptions.cc b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/exceptions.cc new file mode 100644 index 00000000000..fcd8163a500 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/exceptions.cc @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +namespace driver { +namespace odbcabstraction { + +DriverException::DriverException(std::string message, std::string sql_state, + int32_t native_error) + : msg_text_(std::move(message)), + sql_state_(std::move(sql_state)), + native_error_(native_error) {} + +const char* DriverException::what() const throw() { return msg_text_.c_str(); } +const std::string& DriverException::GetMessageText() const { return msg_text_; } +const std::string& DriverException::GetSqlState() const { return sql_state_; } +int32_t DriverException::GetNativeError() const { return native_error_; } + +AuthenticationException::AuthenticationException(std::string message, + std::string sql_state, + int32_t native_error) + : DriverException(message, sql_state, native_error) {} + +CommunicationException::CommunicationException(std::string message, std::string sql_state, + int32_t native_error) + : DriverException( + message + ". Please ensure your encryption settings match the server.", + sql_state, native_error) {} + +NullWithoutIndicatorException::NullWithoutIndicatorException(std::string message, + std::string sql_state, + int32_t native_error) + : DriverException(message, sql_state, native_error) {} +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/blocking_queue.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/blocking_queue.h new file mode 100644 index 00000000000..b44938e84c6 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/blocking_queue.h @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
#pragma once

#include <atomic>
#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <optional>
#include <thread>
#include <vector>

namespace driver {
namespace odbcabstraction {

/// \brief Fixed-capacity, thread-safe FIFO queue with optional producer
/// threads.  Push blocks while the queue is full; Pop blocks while it is
/// empty and producers may still deliver items.  Close() unblocks all
/// waiters and joins the producer threads.
template <typename T>
class BlockingQueue {
  size_t capacity_;
  std::vector<T> buffer_;
  size_t buffer_size_{0};
  // NOTE(review): the original comments on these two indices were swapped
  // relative to what Push/Pop actually do; corrected here.
  size_t left_{0};   // index items are removed from the buffer at (consumed)
  size_t right_{0};  // index items are put into the buffer at (produced)

  std::mutex mtx_;
  std::condition_variable not_empty_;
  std::condition_variable not_full_;

  std::vector<std::thread> threads_;
  std::atomic<size_t> active_threads_{0};
  std::atomic<bool> closed_{false};

 public:
  /// Produces the next item, or an empty optional when exhausted.
  /// NOTE(review): optional type reconstructed as std::optional — confirm
  /// the original did not use boost::optional here.
  typedef std::function<std::optional<T>(void)> Supplier;

  explicit BlockingQueue(size_t capacity) : capacity_(capacity), buffer_(capacity) {}

  /// \brief Spawn a thread that repeatedly calls `supplier` and pushes the
  /// produced items until the supplier is exhausted or the queue is closed.
  void AddProducer(Supplier supplier) {
    active_threads_++;
    // Explicit capture instead of the original [=] (which implicitly
    // captured `this`); behavior is identical.
    threads_.emplace_back([this, supplier] {
      while (!closed_) {
        // Block while queue is full
        std::unique_lock<std::mutex> unique_lock(mtx_);
        if (!WaitUntilCanPushOrClosed(unique_lock)) break;
        unique_lock.unlock();

        // Only one thread at a time is notified and calls the supplier.
        auto item = supplier();
        if (!item) break;

        Push(*item);
      }

      std::unique_lock<std::mutex> unique_lock(mtx_);
      active_threads_--;
      // Wake consumers so they can observe that no producers remain.
      not_empty_.notify_all();
    });
  }

  /// \brief Enqueue an item, blocking while the queue is full.  Drops the
  /// item silently if the queue is closed while waiting.
  void Push(T item) {
    std::unique_lock<std::mutex> unique_lock(mtx_);
    if (!WaitUntilCanPushOrClosed(unique_lock)) return;

    buffer_[right_] = std::move(item);

    right_ = (right_ + 1) % capacity_;
    buffer_size_++;

    not_empty_.notify_one();
  }

  /// \brief Dequeue into *result.  Returns false when the queue is closed,
  /// or when it is empty and no producers remain.
  bool Pop(T* result) {
    std::unique_lock<std::mutex> unique_lock(mtx_);
    if (!WaitUntilCanPopOrClosed(unique_lock)) return false;

    *result = std::move(buffer_[left_]);

    left_ = (left_ + 1) % capacity_;
    buffer_size_--;

    not_full_.notify_one();

    return true;
  }

  /// \brief Idempotently close the queue: wake all waiters and join the
  /// producer threads.
  void Close() {
    std::unique_lock<std::mutex> unique_lock(mtx_);

    if (closed_) return;
    closed_ = true;
    not_empty_.notify_all();
    not_full_.notify_all();

    unique_lock.unlock();

    for (auto& item : threads_) {
      item.join();
    }
  }

 private:
  // Returns true when there is room to push; false when the queue closed
  // while waiting.
  bool WaitUntilCanPushOrClosed(std::unique_lock<std::mutex>& unique_lock) {
    not_full_.wait(unique_lock,
                   [this]() { return closed_ || buffer_size_ != capacity_; });
    return !closed_;
  }

  // Returns true when there is an item to pop; false when closed, or when
  // empty with no active producers (i.e. no item will ever arrive).
  bool WaitUntilCanPopOrClosed(std::unique_lock<std::mutex>& unique_lock) {
    not_empty_.wait(unique_lock, [this]() {
      return closed_ || buffer_size_ != 0 || active_threads_ == 0;
    });

    return !closed_ && buffer_size_ > 0;
  }
};

}  // namespace odbcabstraction
}  // namespace driver
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace driver { +namespace odbcabstraction { +int64_t GetTodayTimeFromEpoch(); + +void GetTimeForSecondsSinceEpoch(const int64_t seconds_since_epoch, std::tm& out_tm); +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h new file mode 100644 index 00000000000..131de3c1b08 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/diagnostics.h @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include +#include + +namespace driver { +namespace odbcabstraction { +class Diagnostics { + public: + struct DiagnosticsRecord { + std::string msg_text; + std::string sql_state; + int32_t native_error; + }; + + private: + std::vector error_records_; + std::vector warning_records_; + std::vector> owned_records_; + std::string vendor_; + std::string data_source_component_; + OdbcVersion version_; + + public: + Diagnostics(std::string vendor, std::string data_source_component, OdbcVersion version); + void AddError(const DriverException& exception); + void AddWarning(std::string message, std::string sql_state, int32_t native_error); + + /// \brief Add a pre-existing truncation warning. + inline void AddTruncationWarning() { + static const std::unique_ptr TRUNCATION_WARNING( + new DiagnosticsRecord{"String or binary data, right-truncated.", "01004", + ODBCErrorCodes_TRUNCATION_WARNING}); + warning_records_.push_back(TRUNCATION_WARNING.get()); + } + + inline void TrackRecord(const DiagnosticsRecord& record) { + if (record.sql_state[0] == '0' && record.sql_state[1] == '1') { + warning_records_.push_back(&record); + } else { + error_records_.push_back(&record); + } + } + + void SetDataSourceComponent(std::string component); + std::string GetDataSourceComponent() const; + + std::string GetVendor() const; + + inline void Clear() { + error_records_.clear(); + warning_records_.clear(); + owned_records_.clear(); + } + + std::string GetMessageText(uint32_t record_index) const; + std::string GetSQLState(uint32_t record_index) const { + return GetRecordAtIndex(record_index)->sql_state; + } + + int32_t GetNativeError(uint32_t record_index) const { + return GetRecordAtIndex(record_index)->native_error; + } + + inline size_t GetRecordCount() const { + return error_records_.size() + warning_records_.size(); + } + + inline bool HasRecord(uint32_t record_index) const { + return error_records_.size() + warning_records_.size() > 
record_index; + } + + inline bool HasWarning() const { return !warning_records_.empty(); } + + inline bool HasError() const { return !error_records_.empty(); } + + OdbcVersion GetOdbcVersion() const; + + private: + inline const DiagnosticsRecord* GetRecordAtIndex(uint32_t record_index) const { + if (record_index < error_records_.size()) { + return error_records_[record_index]; + } + return warning_records_[record_index - error_records_.size()]; + } +}; +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/encoding.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/encoding.h new file mode 100644 index 00000000000..66f35602809 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/encoding.h @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" +#include "arrow/util/macros.h" + +#if defined(__APPLE__) +# include +#endif + +namespace driver { +namespace odbcabstraction { + +#if defined(__APPLE__) +extern std::atomic SqlWCharSize; + +void ComputeSqlWCharSize(); + +inline size_t GetSqlWCharSize() { + if (SqlWCharSize == 0) { + ComputeSqlWCharSize(); + } + + return SqlWCharSize; +} +#else +constexpr inline size_t GetSqlWCharSize() { return sizeof(char16_t); } +#endif + +} // namespace odbcabstraction +} // namespace driver + +using driver::odbcabstraction::DriverException; +using driver::odbcabstraction::GetSqlWCharSize; + +template +inline size_t wcsstrlen(const void* wcs_string) { + size_t len; + for (len = 0; ((CHAR_TYPE*)wcs_string)[len]; len++) { + } + return len; +} + +inline size_t wcsstrlen(const void* wcs_string) { + switch (GetSqlWCharSize()) { + case sizeof(char16_t): + return wcsstrlen(wcs_string); + case sizeof(char32_t): + return wcsstrlen(wcs_string); + default: + assert(false); + throw DriverException("Encoding is unsupported, SQLWCHAR size: " + + std::to_string(GetSqlWCharSize())); + } +} + +namespace driver { +namespace odbcabstraction { + +// GH-46576: suppress unicode warnings +ARROW_SUPPRESS_DEPRECATION_WARNING +template +inline void Utf8ToWcs(const char* utf8_string, size_t length, + std::vector* result) { + thread_local std::wstring_convert, CHAR_TYPE> converter; + auto string = converter.from_bytes(utf8_string, utf8_string + length); + + uint32_t length_in_bytes = static_cast(string.size() * GetSqlWCharSize()); + const uint8_t* data = (uint8_t*)string.data(); + + result->reserve(length_in_bytes); + result->assign(data, data + length_in_bytes); +} +ARROW_UNSUPPRESS_DEPRECATION_WARNING + +inline void Utf8ToWcs(const char* utf8_string, size_t length, + std::vector* result) { + switch (GetSqlWCharSize()) { + case sizeof(char16_t): + 
return Utf8ToWcs(utf8_string, length, result); + case sizeof(char32_t): + return Utf8ToWcs(utf8_string, length, result); + default: + assert(false); + throw DriverException("Encoding is unsupported, SQLWCHAR size: " + + std::to_string(GetSqlWCharSize())); + } +} + +inline void Utf8ToWcs(const char* utf8_string, std::vector* result) { + return Utf8ToWcs(utf8_string, strlen(utf8_string), result); +} + +// GH-46576: suppress unicode warnings +ARROW_SUPPRESS_DEPRECATION_WARNING +template +inline void WcsToUtf8(const void* wcs_string, size_t length_in_code_units, + std::vector* result) { + thread_local std::wstring_convert, CHAR_TYPE> converter; + auto byte_string = converter.to_bytes((CHAR_TYPE*)wcs_string, + (CHAR_TYPE*)wcs_string + length_in_code_units); + + uint32_t length_in_bytes = static_cast(byte_string.size()); + const uint8_t* data = (uint8_t*)byte_string.data(); + + result->reserve(length_in_bytes); + result->assign(data, data + length_in_bytes); +} +ARROW_UNSUPPRESS_DEPRECATION_WARNING + +inline void WcsToUtf8(const void* wcs_string, size_t length_in_code_units, + std::vector* result) { + switch (GetSqlWCharSize()) { + case sizeof(char16_t): + return WcsToUtf8(wcs_string, length_in_code_units, result); + case sizeof(char32_t): + return WcsToUtf8(wcs_string, length_in_code_units, result); + default: + assert(false); + throw DriverException("Encoding is unsupported, SQLWCHAR size: " + + std::to_string(GetSqlWCharSize())); + } +} + +inline void WcsToUtf8(const void* wcs_string, std::vector* result) { + return WcsToUtf8(wcs_string, wcsstrlen(wcs_string), result); +} + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/error_codes.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/error_codes.h new file mode 100644 index 00000000000..3033e2c7f89 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/error_codes.h @@ -0,0 
+1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +namespace driver { +namespace odbcabstraction { + +enum ODBCErrorCodes : int32_t { + ODBCErrorCodes_GENERAL_ERROR = 100, + ODBCErrorCodes_AUTH = 200, + ODBCErrorCodes_TLS = 300, + ODBCErrorCodes_FRACTIONAL_TRUNCATION_ERROR = 400, + ODBCErrorCodes_COMMUNICATION = 500, + ODBCErrorCodes_GENERAL_WARNING = 1000000, + ODBCErrorCodes_TRUNCATION_WARNING = 1000100, + ODBCErrorCodes_FRACTIONAL_TRUNCATION_WARNING = 1000100, + ODBCErrorCodes_INDICATOR_NEEDED = 1000200 +}; +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h new file mode 100644 index 00000000000..48a773e4f4d --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +namespace driver { +namespace odbcabstraction { + +/// \brief Base for all driver specific exceptions +class DriverException : public std::exception { + public: + explicit DriverException(std::string message, std::string sql_state = "HY000", + int32_t native_error = ODBCErrorCodes_GENERAL_ERROR); + + const char* what() const throw() override; + + const std::string& GetMessageText() const; + const std::string& GetSqlState() const; + int32_t GetNativeError() const; + + private: + const std::string msg_text_; + const std::string sql_state_; + const int32_t native_error_; +}; + +/// \brief Authentication specific exception +class AuthenticationException : public DriverException { + public: + explicit AuthenticationException(std::string message, std::string sql_state = "28000", + int32_t native_error = ODBCErrorCodes_AUTH); +}; + +/// \brief Communication link specific exception +class CommunicationException : public DriverException { + public: + explicit CommunicationException(std::string message, std::string sql_state = "08S01", + int32_t native_error = ODBCErrorCodes_COMMUNICATION); +}; + +/// \brief Error when null is retrieved from the database but no indicator was supplied. +/// (This means the driver has no way to report ot the application that there was a NULL +/// value). 
+class NullWithoutIndicatorException : public DriverException { + public: + explicit NullWithoutIndicatorException( + std::string message = "Indicator variable required but not supplied", + std::string sql_state = "22002", + int32_t native_error = ODBCErrorCodes_INDICATOR_NEEDED); +}; + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/attribute_utils.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/attribute_utils.h new file mode 100644 index 00000000000..13d4492717a --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/attribute_utils.h @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace ODBC { +using driver::odbcabstraction::WcsToUtf8; + +template +inline void GetAttribute(T attribute_value, SQLPOINTER output, O output_size, + O* output_len_ptr) { + if (output) { + T* typed_output = reinterpret_cast(output); + *typed_output = attribute_value; + } + + if (output_len_ptr) { + *output_len_ptr = sizeof(T); + } +} + +template +inline SQLRETURN GetAttributeUTF8(const std::string& attribute_value, SQLPOINTER output, + O output_size, O* output_len_ptr) { + if (output) { + size_t output_len_before_null = + std::min(static_cast(attribute_value.size()), static_cast(output_size - 1)); + memcpy(output, attribute_value.c_str(), output_len_before_null); + reinterpret_cast(output)[output_len_before_null] = '\0'; + } + + if (output_len_ptr) { + *output_len_ptr = static_cast(attribute_value.size()); + } + + if (output && output_size < static_cast(attribute_value.size() + 1)) { + return SQL_SUCCESS_WITH_INFO; + } + return SQL_SUCCESS; +} + +template +inline SQLRETURN GetAttributeUTF8(const std::string& attribute_value, SQLPOINTER output, + O output_size, O* output_len_ptr, + driver::odbcabstraction::Diagnostics& diagnostics) { + SQLRETURN result = + GetAttributeUTF8(attribute_value, output, output_size, output_len_ptr); + if (SQL_SUCCESS_WITH_INFO == result) { + diagnostics.AddTruncationWarning(); + } + return result; +} + +template +inline SQLRETURN GetAttributeSQLWCHAR(const std::string& attribute_value, + bool is_length_in_bytes, SQLPOINTER output, + O output_size, O* output_len_ptr) { + size_t result = ConvertToSqlWChar( + attribute_value, reinterpret_cast(output), + is_length_in_bytes ? output_size : output_size * GetSqlWCharSize()); + + if (output_len_ptr) { + *output_len_ptr = + static_cast(is_length_in_bytes ? result : result / GetSqlWCharSize()); + } + + if (output && + output_size < + static_cast(result + (is_length_in_bytes ? 
GetSqlWCharSize() : 1))) { + return SQL_SUCCESS_WITH_INFO; + } + return SQL_SUCCESS; +} + +template +inline SQLRETURN GetAttributeSQLWCHAR(const std::string& attribute_value, + bool is_length_in_bytes, SQLPOINTER output, + O output_size, O* output_len_ptr, + driver::odbcabstraction::Diagnostics& diagnostics) { + SQLRETURN result = GetAttributeSQLWCHAR(attribute_value, is_length_in_bytes, output, + output_size, output_len_ptr); + if (SQL_SUCCESS_WITH_INFO == result) { + diagnostics.AddTruncationWarning(); + } + return result; +} + +template +inline SQLRETURN GetStringAttribute(bool is_unicode, const std::string& attribute_value, + bool is_length_in_bytes, SQLPOINTER output, + O output_size, O* output_len_ptr, + driver::odbcabstraction::Diagnostics& diagnostics) { + SQLRETURN result = SQL_SUCCESS; + if (is_unicode) { + result = GetAttributeSQLWCHAR(attribute_value, is_length_in_bytes, output, + output_size, output_len_ptr); + } else { + result = GetAttributeUTF8(attribute_value, output, output_size, output_len_ptr); + } + + if (SQL_SUCCESS_WITH_INFO == result) { + diagnostics.AddTruncationWarning(); + } + return result; +} + +template +inline void SetAttribute(SQLPOINTER new_value, T& attribute_to_write) { + SQLLEN valueAsLen = reinterpret_cast(new_value); + attribute_to_write = static_cast(valueAsLen); +} + +template +inline void SetPointerAttribute(SQLPOINTER new_value, T& attribute_to_write) { + attribute_to_write = static_cast(new_value); +} + +inline void SetAttributeUTF8(SQLPOINTER new_value, SQLINTEGER input_length, + std::string& attribute_to_write) { + const char* new_value_as_char = static_cast(new_value); + attribute_to_write.assign(new_value_as_char, input_length == SQL_NTS + ? 
strlen(new_value_as_char) + : input_length); +} + +inline void SetAttributeSQLWCHAR(SQLPOINTER new_value, SQLINTEGER input_length_in_bytes, + std::string& attribute_to_write) { + thread_local std::vector utf8_str; + if (input_length_in_bytes == SQL_NTS) { + WcsToUtf8(new_value, &utf8_str); + } else { + WcsToUtf8(new_value, input_length_in_bytes / GetSqlWCharSize(), &utf8_str); + } + attribute_to_write.assign((char*)utf8_str.data()); +} + +template +void CheckIfAttributeIsSetToOnlyValidValue(SQLPOINTER value, T allowed_value) { + if (static_cast(reinterpret_cast(value)) != allowed_value) { + throw driver::odbcabstraction::DriverException("Optional feature not implemented", + "HYC00"); + } +} +} // namespace ODBC diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/encoding_utils.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/encoding_utils.h new file mode 100644 index 00000000000..ae1ce3568bf --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/encoding_utils.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING + +namespace ODBC { +using driver::odbcabstraction::DriverException; +using driver::odbcabstraction::GetSqlWCharSize; +using driver::odbcabstraction::Utf8ToWcs; + +// Return the number of bytes required for the conversion. +template +inline size_t ConvertToSqlWChar(const std::string& str, SQLWCHAR* buffer, + SQLLEN buffer_size_in_bytes) { + thread_local std::vector wstr; + Utf8ToWcs(str.data(), str.size(), &wstr); + SQLLEN value_length_in_bytes = wstr.size(); + + if (buffer) { + memcpy(buffer, wstr.data(), + std::min(static_cast(wstr.size()), buffer_size_in_bytes)); + + // Write a NUL terminator + if (buffer_size_in_bytes >= + value_length_in_bytes + static_cast(GetSqlWCharSize())) { + reinterpret_cast(buffer)[value_length_in_bytes / GetSqlWCharSize()] = + '\0'; + } else { + SQLLEN num_chars_written = buffer_size_in_bytes / GetSqlWCharSize(); + // If we failed to even write one char, the buffer is too small to hold a + // NUL-terminator. 
+ if (num_chars_written > 0) { + reinterpret_cast(buffer)[num_chars_written - 1] = '\0'; + } + } + } + return value_length_in_bytes; +} + +inline size_t ConvertToSqlWChar(const std::string& str, SQLWCHAR* buffer, + SQLLEN buffer_size_in_bytes) { + switch (GetSqlWCharSize()) { + case sizeof(char16_t): + return ConvertToSqlWChar(str, buffer, buffer_size_in_bytes); + case sizeof(char32_t): + return ConvertToSqlWChar(str, buffer, buffer_size_in_bytes); + default: + assert(false); + throw DriverException("Encoding is unsupported, SQLWCHAR size: " + + std::to_string(GetSqlWCharSize())); + } +} + +} // namespace ODBC diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_connection.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_connection.h new file mode 100644 index 00000000000..6d46f3b2232 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_connection.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace ODBC { +class ODBCEnvironment; +class ODBCDescriptor; +class ODBCStatement; +} // namespace ODBC + +/** + * @brief An abstraction over an ODBC connection handle. This also wraps an SPI + * Connection. + */ +namespace ODBC { +class ODBCConnection : public ODBCHandle { + public: + ODBCConnection(const ODBCConnection&) = delete; + ODBCConnection& operator=(const ODBCConnection&) = delete; + + ODBCConnection(ODBCEnvironment& environment, + std::shared_ptr spi_connection); + + driver::odbcabstraction::Diagnostics& GetDiagnosticsImpl(); + + const std::string& GetDSN() const; + bool IsConnected() const; + void Connect(std::string dsn, + const driver::odbcabstraction::Connection::ConnPropertyMap& properties, + std::vector& missing_properties); + + void GetInfo(SQLUSMALLINT info_type, SQLPOINTER value, SQLSMALLINT buffer_length, + SQLSMALLINT* output_length, bool is_unicode); + void SetConnectAttr(SQLINTEGER attribute, SQLPOINTER value, SQLINTEGER string_length, + bool isUnicode); + void GetConnectAttr(SQLINTEGER attribute, SQLPOINTER value, SQLINTEGER buffer_length, + SQLINTEGER* output_length, bool is_unicode); + + ~ODBCConnection() = default; + + inline ODBCStatement& GetTrackingStatement() { return *attribute_tracking_statement_; } + + void Disconnect(); + + void ReleaseConnection(); + + std::shared_ptr CreateStatement(); + void DropStatement(ODBCStatement* statement); + + std::shared_ptr CreateDescriptor(); + void DropDescriptor(ODBCDescriptor* descriptor); + + inline bool IsOdbc2Connection() const { return is_2x_connection_; } + + /// @return the DSN or empty string if Driver was used. 
+ static std::string GetPropertiesFromConnString( + const std::string& conn_str, + driver::odbcabstraction::Connection::ConnPropertyMap& properties); + + private: + ODBCEnvironment& environment_; + std::shared_ptr spi_connection_; + // Extra ODBC statement that's used to track and validate when statement attributes are + // set through the connection handle. These attributes get copied to new ODBC statements + // when they are allocated. + std::shared_ptr attribute_tracking_statement_; + std::vector > statements_; + std::vector > descriptors_; + std::string dsn_; + const bool is_2x_connection_; + bool is_connected_; +}; + +} // namespace ODBC diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_descriptor.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_descriptor.h new file mode 100644 index 00000000000..1c45eb4073d --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_descriptor.h @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace driver { +namespace odbcabstraction { +class ResultSetMetadata; +} +} // namespace driver +namespace ODBC { +class ODBCConnection; +class ODBCStatement; +} // namespace ODBC + +namespace ODBC { +struct DescriptorRecord { + std::string base_column_name; + std::string base_table_name; + std::string catalog_name; + std::string label; + std::string literal_prefix; + std::string literal_suffix; + std::string local_type_name; + std::string name; + std::string schema_name; + std::string table_name; + std::string type_name; + SQLPOINTER data_ptr = NULL; + SQLLEN* indicator_ptr = NULL; + SQLLEN display_size = 0; + SQLLEN octet_length = 0; + SQLULEN length = 0; + SQLINTEGER auto_unique_value; + SQLINTEGER case_sensitive = SQL_TRUE; + SQLINTEGER datetime_interval_precision = 0; + SQLINTEGER num_prec_radix = 0; + SQLSMALLINT concise_type = SQL_C_DEFAULT; + SQLSMALLINT datetime_interval_code = 0; + SQLSMALLINT fixed_prec_scale = 0; + SQLSMALLINT nullable = SQL_NULLABLE_UNKNOWN; + SQLSMALLINT param_type = SQL_PARAM_INPUT; + SQLSMALLINT precision = 0; + SQLSMALLINT row_ver = 0; + SQLSMALLINT scale = 0; + SQLSMALLINT searchable = SQL_SEARCHABLE; + SQLSMALLINT type = SQL_C_DEFAULT; + SQLSMALLINT unnamed = SQL_TRUE; + SQLSMALLINT is_unsigned = SQL_FALSE; + SQLSMALLINT updatable = SQL_FALSE; + bool is_bound = false; + + void CheckConsistency(); +}; + +class ODBCDescriptor : public ODBCHandle { + public: + /// \brief Construct a new ODBCDescriptor object. Link the descriptor to a connection, + /// if applicable. A nullptr should be supplied for conn if the descriptor should not be + /// linked. 
+ ODBCDescriptor(driver::odbcabstraction::Diagnostics& base_diagnostics, + ODBCConnection* conn, ODBCStatement* stmt, bool is_app_descriptor, + bool is_writable, bool is_2x_connection); + + driver::odbcabstraction::Diagnostics& GetDiagnosticsImpl(); + + ODBCConnection& GetConnection(); + + void SetHeaderField(SQLSMALLINT field_identifier, SQLPOINTER value, + SQLINTEGER buffer_length); + void SetField(SQLSMALLINT record_number, SQLSMALLINT field_identifier, SQLPOINTER value, + SQLINTEGER buffer_length); + void GetHeaderField(SQLSMALLINT field_identifier, SQLPOINTER value, + SQLINTEGER buffer_length, SQLINTEGER* output_length) const; + void GetField(SQLSMALLINT record_number, SQLSMALLINT field_identifier, SQLPOINTER value, + SQLINTEGER buffer_length, SQLINTEGER* output_length); + SQLSMALLINT GetAllocType() const; + bool IsAppDescriptor() const; + + inline bool HaveBindingsChanged() const { return has_bindings_changed_; } + + void RegisterToStatement(ODBCStatement* statement, bool is_apd); + void DetachFromStatement(ODBCStatement* statement, bool is_apd); + void ReleaseDescriptor(); + + void PopulateFromResultSetMetadata(driver::odbcabstraction::ResultSetMetadata* rsmd); + + const std::vector& GetRecords() const; + std::vector& GetRecords(); + + void BindCol(SQLSMALLINT record_number, SQLSMALLINT c_type, SQLPOINTER data_ptr, + SQLLEN buffer_length, SQLLEN* indicator_ptr); + void SetDataPtrOnRecord(SQLPOINTER data_ptr, SQLSMALLINT rec_number); + + inline SQLULEN GetBindOffset() { return bind_offset_ptr_ ? *bind_offset_ptr_ : 0UL; } + + inline SQLULEN GetBoundStructOffset() { + // If this is SQL_BIND_BY_COLUMN, bind_type_ is zero which indicates no offset due to + // use of a bound struct. If this is non-zero, row-wise binding is being used so the + // app should set this to sizeof(their struct). 
+ return bind_type_; + } + + inline SQLULEN GetArraySize() { return array_size_; } + + inline SQLUSMALLINT* GetArrayStatusPtr() { return array_status_ptr_; } + + inline void SetRowsProcessed(SQLULEN rows) { + if (rows_processed_ptr_) { + *rows_processed_ptr_ = rows; + } + } + + inline void NotifyBindingsHavePropagated() { has_bindings_changed_ = false; } + + inline void NotifyBindingsHaveChanged() { has_bindings_changed_ = true; } + + private: + driver::odbcabstraction::Diagnostics diagnostics_; + std::vector registered_on_statements_as_apd_; + std::vector registered_on_statements_as_ard_; + std::vector records_; + ODBCConnection* owning_connection_; + ODBCStatement* parent_statement_; + SQLUSMALLINT* array_status_ptr_; + SQLULEN* bind_offset_ptr_; + SQLULEN* rows_processed_ptr_; + SQLULEN array_size_; + SQLINTEGER bind_type_; + SQLSMALLINT highest_one_based_bound_record_; + const bool is_2x_connection_; + bool is_app_descriptor_; + bool is_writable_; + bool has_bindings_changed_; +}; +} // namespace ODBC diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_environment.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_environment.h new file mode 100644 index 00000000000..a77e742a7a0 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_environment.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_handle.h" + +#include +#include +#include + +namespace driver { +namespace odbcabstraction { +class Driver; +} +} // namespace driver + +namespace ODBC { +class ODBCConnection; +} + +/** + * @brief An abstraction over an ODBC environment handle. + */ +namespace ODBC { +class ODBCEnvironment : public ODBCHandle { + public: + explicit ODBCEnvironment(std::shared_ptr driver); + driver::odbcabstraction::Diagnostics& GetDiagnosticsImpl(); + SQLINTEGER GetODBCVersion() const; + void SetODBCVersion(SQLINTEGER version); + SQLINTEGER GetConnectionPooling() const; + void SetConnectionPooling(SQLINTEGER pooling); + std::shared_ptr CreateConnection(); + void DropConnection(ODBCConnection* conn); + ~ODBCEnvironment() = default; + + private: + std::vector > connections_; + std::shared_ptr driver_; + std::unique_ptr diagnostics_; + SQLINTEGER version_; + SQLINTEGER connection_pooling_; +}; + +} // namespace ODBC diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_handle.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_handle.h new file mode 100644 index 00000000000..355d950502e --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_handle.h @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +/** + * @brief An abstraction over a generic ODBC handle. + */ +namespace ODBC { + +template +class ODBCHandle { + public: + inline driver::odbcabstraction::Diagnostics& GetDiagnostics() { + return static_cast(this)->GetDiagnosticsImpl(); + } + + inline driver::odbcabstraction::Diagnostics& GetDiagnosticsImpl() { + throw std::runtime_error("Illegal state -- diagnostics requested on invalid handle"); + } + + template + inline SQLRETURN Execute(SQLRETURN rc, Function function) { + try { + GetDiagnostics().Clear(); + rc = function(); + } catch (const driver::odbcabstraction::DriverException& ex) { + GetDiagnostics().AddError(ex); + } catch (const std::bad_alloc& ex) { + GetDiagnostics().AddError(driver::odbcabstraction::DriverException( + "A memory allocation error occurred.", "HY001")); + } catch (const std::exception& ex) { + GetDiagnostics().AddError(driver::odbcabstraction::DriverException(ex.what())); + } catch (...) 
{ + GetDiagnostics().AddError( + driver::odbcabstraction::DriverException("An unknown error occurred.")); + } + + if (GetDiagnostics().HasError()) { + return SQL_ERROR; + } + if (SQL_SUCCEEDED(rc) && GetDiagnostics().HasWarning()) { + return SQL_SUCCESS_WITH_INFO; + } + return rc; + } + + template + inline SQLRETURN ExecuteWithLock(SQLRETURN rc, Function function) { + const std::lock_guard lock(mtx_); + return Execute(rc, function); + } + + template + static inline SQLRETURN ExecuteWithDiagnostics(SQLHANDLE handle, SQLRETURN rc, + Function func) { + if (!handle) { + return SQL_INVALID_HANDLE; + } + if (SHOULD_LOCK) { + return reinterpret_cast(handle)->ExecuteWithLock(rc, func); + } else { + return reinterpret_cast(handle)->Execute(rc, func); + } + } + + static Derived* Of(SQLHANDLE handle) { return reinterpret_cast(handle); } + + private: + std::mutex mtx_; +}; +} // namespace ODBC diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_statement.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_statement.h new file mode 100644 index 00000000000..aca0ce8b955 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_statement.h @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +namespace driver { +namespace odbcabstraction { +class Statement; +class ResultSet; +} // namespace odbcabstraction +} // namespace driver + +namespace ODBC { +class ODBCConnection; +class ODBCDescriptor; +} // namespace ODBC + +/** + * @brief An abstraction over an ODBC connection handle. This also wraps an SPI + * Connection. + */ +namespace ODBC { +class ODBCStatement : public ODBCHandle { + public: + ODBCStatement(const ODBCStatement&) = delete; + ODBCStatement& operator=(const ODBCStatement&) = delete; + + ODBCStatement(ODBCConnection& connection, + std::shared_ptr spi_statement); + + ~ODBCStatement() = default; + + inline driver::odbcabstraction::Diagnostics& GetDiagnosticsImpl() { + return *diagnostics_; + } + + ODBCConnection& GetConnection(); + + void CopyAttributesFromConnection(ODBCConnection& connection); + void Prepare(const std::string& query); + void ExecutePrepared(); + void ExecuteDirect(const std::string& query); + + /** + * @brief Returns true if the number of rows fetch was greater than zero. + */ + bool Fetch(size_t rows); + bool IsPrepared() const; + + void GetStmtAttr(SQLINTEGER statement_attribute, SQLPOINTER output, + SQLINTEGER buffer_size, SQLINTEGER* str_len_ptr, bool is_unicode); + void SetStmtAttr(SQLINTEGER statement_attribute, SQLPOINTER value, + SQLINTEGER buffer_size, bool is_unicode); + + void RevertAppDescriptor(bool is_apd); + + inline ODBCDescriptor* GetIRD() { return ird_.get(); } + + inline ODBCDescriptor* GetARD() { return current_ard_; } + + inline SQLULEN GetRowsetSize() { return rowset_size_; } + + bool GetData(SQLSMALLINT record_number, SQLSMALLINT c_type, SQLPOINTER data_ptr, + SQLLEN buffer_length, SQLLEN* indicator_ptr); + + /** + * @brief Closes the cursor. This does _not_ un-prepare the statement or change + * bindings. 
+ */ + void CloseCursor(bool suppress_errors); + + /** + * @brief Releases this statement from memory. + */ + void ReleaseStatement(); + + void GetTables(const std::string* catalog, const std::string* schema, + const std::string* table, const std::string* table_type); + void GetColumns(const std::string* catalog, const std::string* schema, + const std::string* table, const std::string* column); + void GetTypeInfo(SQLSMALLINT data_type); + void Cancel(); + + private: + ODBCConnection& connection_; + std::shared_ptr spi_statement_; + std::shared_ptr current_result_; + driver::odbcabstraction::Diagnostics* diagnostics_; + + std::shared_ptr built_in_ard_; + std::shared_ptr built_in_apd_; + std::shared_ptr ipd_; + std::shared_ptr ird_; + ODBCDescriptor* current_ard_; + ODBCDescriptor* current_apd_; + SQLULEN row_number_; + SQLULEN max_rows_; + SQLULEN rowset_size_; // Used by SQLExtendedFetch instead of the ARD array size. + bool is_prepared_; + bool has_reached_end_of_result_; +}; +} // namespace ODBC diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/type_utilities.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/type_utilities.h new file mode 100644 index 00000000000..dc14d582320 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/type_utilities.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace ODBC { +inline SQLSMALLINT GetSqlTypeForODBCVersion(SQLSMALLINT type, bool is_odbc_2x) { + switch (type) { + case SQL_DATE: + case SQL_TYPE_DATE: + return is_odbc_2x ? SQL_DATE : SQL_TYPE_DATE; + + case SQL_TIME: + case SQL_TYPE_TIME: + return is_odbc_2x ? SQL_TIME : SQL_TYPE_TIME; + + case SQL_TIMESTAMP: + case SQL_TYPE_TIMESTAMP: + return is_odbc_2x ? SQL_TIMESTAMP : SQL_TYPE_TIMESTAMP; + + default: + return type; + } +} +} // namespace ODBC diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h new file mode 100644 index 00000000000..0a2c334dc89 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/platform.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#if defined(_WIN32) +// NOMINMAX avoids std::min/max being defined as a c macro +# ifndef NOMINMAX +# define NOMINMAX +# endif + +// Avoid including extraneous Windows headers. +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif + +// winsock2.h must be included before windows.h to avoid conflicts +# include +# include + +# include + +# include +typedef SSIZE_T ssize_t; + +#endif diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h new file mode 100644 index 00000000000..792a52c1fad --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace driver { +namespace odbcabstraction { + +/// \brief Case insensitive comparator +struct CaseInsensitiveComparator { + bool operator()(const std::string_view& s1, const std::string_view& s2) const { + return boost::lexicographical_compare(s1, s2, boost::is_iless()); + } +}; + +// PropertyMap is case-insensitive for keys. +typedef std::map PropertyMap; + +class Statement; + +/// \brief High-level representation of an ODBC connection. +class Connection { + protected: + Connection() = default; + + public: + virtual ~Connection() = default; + + /// \brief Connection attributes + enum AttributeId { + ACCESS_MODE, // uint32_t - Tells if it should support write operations + CONNECTION_DEAD, // uint32_t - Tells if connection is still alive + CONNECTION_TIMEOUT, // uint32_t - The timeout for connection functions after + // connecting. + CURRENT_CATALOG, // std::string - The current catalog + LOGIN_TIMEOUT, // uint32_t - The timeout for the initial connection + PACKET_SIZE, // uint32_t - The Packet Size + }; + + typedef boost::variant Attribute; + typedef boost::variant Info; + typedef PropertyMap ConnPropertyMap; + + /// \brief Establish the connection. + /// \param properties [in] properties used to establish the connection. + /// \param missing_properties [out] vector of missing properties (if any). + virtual void Connect(const ConnPropertyMap& properties, + std::vector& missing_properties) = 0; + + /// \brief Close the connection. + virtual void Close() = 0; + + /// \brief Create a statement. + virtual std::shared_ptr CreateStatement() = 0; + + /// \brief Set a connection attribute (may be called at any time). + /// \param attribute [in] Which attribute to set. + /// \param value The value to be set. + /// \return true if the value was set successfully or false if it was substituted with + /// a similar value. 
+ virtual bool SetAttribute(AttributeId attribute, const Attribute& value) = 0; + + /// \brief Retrieve a connection attribute + /// \param attribute [in] Attribute to be retrieved. + virtual boost::optional GetAttribute( + Connection::AttributeId attribute) = 0; + + /// \brief Retrieves info from the database (see ODBC's SQLGetInfo). + virtual Info GetInfo(uint16_t info_type) = 0; + + /// \brief Gets the diagnostics for this connection. + /// \return the diagnostics + virtual Diagnostics& GetDiagnostics() = 0; +}; + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/driver.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/driver.h new file mode 100644 index 00000000000..f13371bf2d5 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/driver.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include + +namespace driver { +namespace odbcabstraction { + +class Connection; + +/// \brief High-level representation of an ODBC driver. 
+class Driver { + protected: + Driver() = default; + + public: + virtual ~Driver() = default; + + /// \brief Create a connection using given ODBC version. + /// \param odbc_version ODBC version to be used. + virtual std::shared_ptr CreateConnection(OdbcVersion odbc_version) = 0; + + /// \brief Gets the diagnostics for this connection. + /// \return the diagnostics + virtual Diagnostics& GetDiagnostics() = 0; + + /// \brief Sets the driver version. + virtual void SetVersion(std::string version) = 0; + + /// \brief Register a log to be used by the system. + virtual void RegisterLog() = 0; +}; + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set.h new file mode 100644 index 00000000000..1b3f8eb96d8 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include + +#include + +namespace driver { +namespace odbcabstraction { + +class ResultSetMetadata; + +class ResultSet { + protected: + ResultSet() = default; + + public: + virtual ~ResultSet() = default; + + /// \brief Returns metadata for this ResultSet. + virtual std::shared_ptr GetMetadata() = 0; + + /// \brief Closes ResultSet, releasing any resources allocated by it. + virtual void Close() = 0; + + /// \brief Cancels ResultSet. + virtual void Cancel() = 0; + + /// \brief Binds a column with a result buffer. The buffer will be filled with + /// up to `GetMaxBatchSize()` values. + /// + /// \param column Column number to be bound with (starts from 1). + /// \param target_type Target data type expected by client. + /// \param precision Column's precision + /// \param scale Column's scale + /// \param buffer Target buffer to be filled with column values. + /// \param buffer_length Target buffer length. + /// \param strlen_buffer Buffer that holds the length of each value contained + /// on target buffer. + virtual void BindColumn(int column, int16_t target_type, int precision, int scale, + void* buffer, size_t buffer_length, ssize_t* strlen_buffer) = 0; + + /// \brief Fetches next rows from ResultSet and load values on buffers + /// previously bound with `BindColumn`. + /// + /// The parameters `buffer` and `strlen_buffer` passed to `BindColumn()` + /// should have capacity to accommodate the rows requested, otherwise data + /// will be truncated. + /// + /// \param rows The maximum number of rows to be fetched. + /// \param bind_offset The offset for bound columns and indicators. + /// \param bind_type The type of binding. Zero indicates columnar binding, non-zero + /// indicates + /// that this holds the size of an application row buffer. This + /// corresponds directly to SQL_DESC_BIND_TYPE in ODBC. + /// \param row_status_array The array to write statuses. + /// \returns The number of rows fetched. 
+ virtual size_t Move(size_t rows, size_t bind_offset, size_t bind_type, + uint16_t* row_status_array) = 0; + + /// \brief Populates `buffer` with the value on current row for given column. + /// If the value doesn't fit the buffer this method returns true and + /// subsequent calls will fetch the rest of data. + /// + /// \param column Column number to be fetched. + /// \param target_type Target data type expected by client. + /// \param precision Column's precision + /// \param scale Column's scale + /// \param buffer Target buffer to be populated. + /// \param buffer_length Target buffer length. + /// \param strlen_buffer Buffer that holds the length of value being fetched. + /// \returns true if there is more data to fetch from the current cell; + /// false if the whole value was already fetched. + virtual bool GetData(int column, int16_t target_type, int precision, int scale, + void* buffer, size_t buffer_length, ssize_t* strlen_buffer) = 0; +}; + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set_metadata.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set_metadata.h new file mode 100644 index 00000000000..f625a2598c1 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set_metadata.h @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace driver { +namespace odbcabstraction { + +/// \brief High Level representation of the ResultSetMetadata from ODBC. +class ResultSetMetadata { + protected: + ResultSetMetadata() = default; + + public: + virtual ~ResultSetMetadata() = default; + + /// \brief It returns the total amount of the columns in the ResultSet. + /// \return the amount of columns. + virtual size_t GetColumnCount() = 0; + + /// \brief It retrieves the name of a specific column. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the column name. + virtual std::string GetColumnName(int column_position) = 0; + + /// \brief It retrieves the size of a specific column. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the column size. + virtual size_t GetPrecision(int column_position) = 0; + + /// \brief It retrieves the total of number of decimal digits. + /// \param column_position [in] the position of the column, starting from 1. + /// \return amount of decimal digits. + virtual size_t GetScale(int column_position) = 0; + + /// \brief It retrieves the SQL_DATA_TYPE of the column. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the SQL_DATA_TYPE + virtual uint16_t GetDataType(int column_position) = 0; + + /// \brief It returns a boolean value indicating if the column can have + /// null values. + /// \param column_position [in] the position of the column, starting from 1. 
/// \return the nullability of the column.
+ /// \param column_position [in] the position of the column, starting from 1. + /// \return the concise data type. + virtual uint16_t GetConciseType(int column_position) = 0; + + /// \brief It retrieves the maximum or the actual character length + /// of a character string or binary data type. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the maximum length + virtual size_t GetLength(int column_position) = 0; + + /// \brief It retrieves the character or characters that the driver uses + /// as prefix for literal values. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the prefix character(s). + virtual std::string GetLiteralPrefix(int column_position) = 0; + + /// \brief It retrieves the character or characters that the driver uses + /// as prefix for literal values. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the suffix character(s). + virtual std::string GetLiteralSuffix(int column_position) = 0; + + /// \brief It retrieves the local type name for a specific column. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the local type name. + virtual std::string GetLocalTypeName(int column_position) = 0; + + /// \brief It returns the column name alias. If it has no alias + /// it returns the column name. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the column name alias. + virtual std::string GetName(int column_position) = 0; + + /// \brief It returns a numeric value to indicate if the data + /// is an approximate or exact numeric data type. + /// \param column_position [in] the position of the column, starting from 1. + virtual size_t GetNumPrecRadix(int column_position) = 0; + + /// \brief It returns the length in bytes from a string or binary data. + /// \param column_position [in] the position of the column, starting from 1. 
+ /// \return the length in bytes. + virtual size_t GetOctetLength(int column_position) = 0; + + /// \brief It returns the data type as a string. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the data type string. + virtual std::string GetTypeName(int column_position) = 0; + + /// \brief It returns a numeric values indicate the updatability of the + /// column. + /// \param column_position [in] the position of the column, starting from 1. + /// \return the updatability of the column. + virtual Updatability GetUpdatable(int column_position) = 0; + + /// \brief It returns a boolean value indicating if the column is + /// autoincrementing. + /// \param column_position [in] the position of the column, starting from 1. + /// \return boolean values if column is auto incremental. + virtual bool IsAutoUnique(int column_position) = 0; + + /// \brief It returns a boolean value indicating if the column is + /// case sensitive. + /// \param column_position [in] the position of the column, starting from 1. + /// \return boolean values if column is case sensitive. + virtual bool IsCaseSensitive(int column_position) = 0; + + /// \brief It returns a boolean value indicating if the column can be used + /// in where clauses. + /// \param column_position [in] the position of the column, starting from 1. + /// \return boolean values if column can be used in where clauses. + virtual Searchability IsSearchable(int column_position) = 0; + + /// \brief It checks if a numeric column is signed or unsigned. + /// \param column_position [in] the position of the column, starting from 1. + /// \return check if the column is signed or not. + virtual bool IsUnsigned(int column_position) = 0; + + /// \brief It check if the columns has fixed precision and a nonzero + /// scale. + /// \param column_position [in] the position of the column, starting from 1. + /// \return if column has a fixed precision and non zero scale. 
+ virtual bool IsFixedPrecScale(int column_position) = 0; +}; + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/statement.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/statement.h new file mode 100644 index 00000000000..2ac03405660 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/statement.h @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +namespace driver { +namespace odbcabstraction { + +using boost::optional; + +class ResultSet; + +class ResultSetMetadata; + +/// \brief High-level representation of an ODBC statement. +class Statement { + protected: + Statement() = default; + + public: + virtual ~Statement() = default; + + /// \brief Statement attributes that can be called at anytime. + ////TODO: Document attributes + enum StatementAttributeId { + MAX_LENGTH, // size_t - The maximum length when retrieving variable length data. 0 + // means no limit. + METADATA_ID, // size_t - Modifies catalog function arguments to be identifiers. + // SQL_TRUE or SQL_FALSE. 
+ NOSCAN, // size_t - Indicates that the driver does not scan for escape sequences. + // Default to SQL_NOSCAN_OFF + QUERY_TIMEOUT, // size_t - The time to wait in seconds for queries to execute. 0 to + // have no timeout. + }; + + typedef boost::variant Attribute; + + /// \brief Set a statement attribute (may be called at any time) + /// + /// NOTE: Meant to be bound with SQLSetStmtAttr. + /// + /// \param attribute Attribute identifier to set. + /// \param value Value to be associated with the attribute. + /// \return true if the value was set successfully or false if it was substituted with + /// a similar value. + virtual bool SetAttribute(StatementAttributeId attribute, const Attribute& value) = 0; + + /// \brief Retrieve a statement attribute. + /// + /// NOTE: Meant to be bound with SQLGetStmtAttr. + /// + /// \param attribute Attribute identifier to be retrieved. + /// \return Value associated with the attribute. + virtual optional GetAttribute( + Statement::StatementAttributeId attribute) = 0; + + /// \brief Prepares the statement. + /// Returns ResultSetMetadata if query returns a result set, + /// otherwise it returns `boost::none`. + /// \param query The SQL query to prepare. + virtual boost::optional> Prepare( + const std::string& query) = 0; + + /// \brief Execute the prepared statement. + /// + /// NOTE: Must call `Prepare(const std::string &query)` before, otherwise it + /// will throw an exception. + /// + /// \returns true if the first result is a ResultSet object; + /// false if it is an update count or there are no results. + virtual bool ExecutePrepared() = 0; + + /// \brief Execute the statement if it is prepared or not. + /// \param query The SQL query to execute. + /// \returns true if the first result is a ResultSet object; + /// false if it is an update count or there are no results. + virtual bool Execute(const std::string& query) = 0; + + /// \brief Returns the current result as a ResultSet object. 
+ virtual std::shared_ptr GetResultSet() = 0; + + /// \brief Retrieves the current result as an update count; + /// if the result is a ResultSet object or there are no more results, -1 is + /// returned. + virtual int64_t GetUpdateCount() = 0; + + /// \brief Returns the list of table, catalog, or schema names, and table + /// types, stored in a specific data source. The driver returns the + /// information as a result set. + /// + /// NOTE: This is meant to be used by ODBC 2.x binding. + /// + /// \param catalog_name The catalog name. + /// \param schema_name The schema name. + /// \param table_name The table name. + /// \param table_type The table type. + virtual std::shared_ptr GetTables_V2(const std::string* catalog_name, + const std::string* schema_name, + const std::string* table_name, + const std::string* table_type) = 0; + + /// \brief Returns the list of table, catalog, or schema names, and table + /// types, stored in a specific data source. The driver returns the + /// information as a result set. + /// + /// NOTE: This is meant to be used by ODBC 3.x binding. + /// + /// \param catalog_name The catalog name. + /// \param schema_name The schema name. + /// \param table_name The table name. + /// \param table_type The table type. + virtual std::shared_ptr GetTables_V3(const std::string* catalog_name, + const std::string* schema_name, + const std::string* table_name, + const std::string* table_type) = 0; + + /// \brief Returns the list of column names in specified tables. The driver + /// returns this information as a result set.. + /// + /// NOTE: This is meant to be used by ODBC 2.x binding. + /// + /// \param catalog_name The catalog name. + /// \param schema_name The schema name. + /// \param table_name The table name. + /// \param column_name The column name. 
+ virtual std::shared_ptr GetColumns_V2(const std::string* catalog_name, + const std::string* schema_name, + const std::string* table_name, + const std::string* column_name) = 0; + + /// \brief Returns the list of column names in specified tables. The driver + /// returns this information as a result set.. + /// + /// NOTE: This is meant to be used by ODBC 3.x binding. + /// + /// \param catalog_name The catalog name. + /// \param schema_name The schema name. + /// \param table_name The table name. + /// \param column_name The column name. + virtual std::shared_ptr GetColumns_V3(const std::string* catalog_name, + const std::string* schema_name, + const std::string* table_name, + const std::string* column_name) = 0; + + /// \brief Returns information about data types supported by the data source. + /// The driver returns the information in the form of an SQL result set. The + /// data types are intended for use in Data Definition Language (DDL) + /// statements. + /// + /// NOTE: This is meant to be used by ODBC 2.x binding. + /// + /// \param data_type The SQL data type. + virtual std::shared_ptr GetTypeInfo_V2(int16_t data_type) = 0; + + /// \brief Returns information about data types supported by the data source. + /// The driver returns the information in the form of an SQL result set. The + /// data types are intended for use in Data Definition Language (DDL) + /// statements. + /// + /// NOTE: This is meant to be used by ODBC 3.x binding. + /// + /// \param data_type The SQL data type. + virtual std::shared_ptr GetTypeInfo_V3(int16_t data_type) = 0; + + /// \brief Gets the diagnostics for this statement. + /// \return the diagnostics + virtual Diagnostics& GetDiagnostics() = 0; + + /// \brief Cancels the processing of this statement. 
+ virtual void Cancel() = 0; +}; + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h new file mode 100644 index 00000000000..cecc1bee9a2 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/types.h @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace driver { +namespace odbcabstraction { + +/// \brief Supported ODBC versions. +enum OdbcVersion { V_2, V_3, V_4 }; + +// Based on ODBC sql.h and sqlext.h definitions. 
+enum SqlDataType : int16_t { + SqlDataType_CHAR = 1, + SqlDataType_VARCHAR = 12, + SqlDataType_LONGVARCHAR = (-1), + SqlDataType_WCHAR = (-8), + SqlDataType_WVARCHAR = (-9), + SqlDataType_WLONGVARCHAR = (-10), + SqlDataType_DECIMAL = 3, + SqlDataType_NUMERIC = 2, + SqlDataType_SMALLINT = 5, + SqlDataType_INTEGER = 4, + SqlDataType_REAL = 7, + SqlDataType_FLOAT = 6, + SqlDataType_DOUBLE = 8, + SqlDataType_BIT = (-7), + SqlDataType_TINYINT = (-6), + SqlDataType_BIGINT = (-5), + SqlDataType_BINARY = (-2), + SqlDataType_VARBINARY = (-3), + SqlDataType_LONGVARBINARY = (-4), + SqlDataType_TYPE_DATE = 91, + SqlDataType_TYPE_TIME = 92, + SqlDataType_TYPE_TIMESTAMP = 93, + SqlDataType_INTERVAL_MONTH = (100 + 2), + SqlDataType_INTERVAL_YEAR = (100 + 1), + SqlDataType_INTERVAL_YEAR_TO_MONTH = (100 + 7), + SqlDataType_INTERVAL_DAY = (100 + 3), + SqlDataType_INTERVAL_HOUR = (100 + 4), + SqlDataType_INTERVAL_MINUTE = (100 + 5), + SqlDataType_INTERVAL_SECOND = (100 + 6), + SqlDataType_INTERVAL_DAY_TO_HOUR = (100 + 8), + SqlDataType_INTERVAL_DAY_TO_MINUTE = (100 + 9), + SqlDataType_INTERVAL_DAY_TO_SECOND = (100 + 10), + SqlDataType_INTERVAL_HOUR_TO_MINUTE = (100 + 11), + SqlDataType_INTERVAL_HOUR_TO_SECOND = (100 + 12), + SqlDataType_INTERVAL_MINUTE_TO_SECOND = (100 + 13), + SqlDataType_GUID = (-11), +}; + +enum SqlDateTimeSubCode : int16_t { + SqlDateTimeSubCode_DATE = 1, + SqlDateTimeSubCode_TIME = 2, + SqlDateTimeSubCode_TIMESTAMP = 3, + SqlDateTimeSubCode_YEAR = 1, + SqlDateTimeSubCode_MONTH = 2, + SqlDateTimeSubCode_DAY = 3, + SqlDateTimeSubCode_HOUR = 4, + SqlDateTimeSubCode_MINUTE = 5, + SqlDateTimeSubCode_SECOND = 6, + SqlDateTimeSubCode_YEAR_TO_MONTH = 7, + SqlDateTimeSubCode_DAY_TO_HOUR = 8, + SqlDateTimeSubCode_DAY_TO_MINUTE = 9, + SqlDateTimeSubCode_DAY_TO_SECOND = 10, + SqlDateTimeSubCode_HOUR_TO_MINUTE = 11, + SqlDateTimeSubCode_HOUR_TO_SECOND = 12, + SqlDateTimeSubCode_MINUTE_TO_SECOND = 13, +}; + +// Based on ODBC sql.h and sqlext.h definitions. 
+enum CDataType { + CDataType_CHAR = 1, + CDataType_WCHAR = -8, + CDataType_SSHORT = (5 + (-20)), + CDataType_USHORT = (5 + (-22)), + CDataType_SLONG = (4 + (-20)), + CDataType_ULONG = (4 + (-22)), + CDataType_FLOAT = 7, + CDataType_DOUBLE = 8, + CDataType_BIT = -7, + CDataType_DATE = 91, + CDataType_TIME = 92, + CDataType_TIMESTAMP = 93, + CDataType_STINYINT = ((-6) + (-20)), + CDataType_UTINYINT = ((-6) + (-22)), + CDataType_SBIGINT = ((-5) + (-20)), + CDataType_UBIGINT = ((-5) + (-22)), + CDataType_BINARY = (-2), + CDataType_NUMERIC = 2, + CDataType_DEFAULT = 99, +}; + +enum Nullability { + NULLABILITY_NO_NULLS = 0, + NULLABILITY_NULLABLE = 1, + NULLABILITY_UNKNOWN = 2, +}; + +enum Searchability { + SEARCHABILITY_NONE = 0, + SEARCHABILITY_LIKE_ONLY = 1, + SEARCHABILITY_ALL_EXPECT_LIKE = 2, + SEARCHABILITY_ALL = 3, +}; + +enum Updatability { + UPDATABILITY_READONLY = 0, + UPDATABILITY_WRITE = 1, + UPDATABILITY_READWRITE_UNKNOWN = 2, +}; + +constexpr ssize_t NULL_DATA = -1; +constexpr ssize_t NO_TOTAL = -4; +constexpr ssize_t ALL_TYPES = 0; +constexpr ssize_t DAYS_TO_SECONDS_MULTIPLIER = 86400; +constexpr ssize_t MILLI_TO_SECONDS_DIVISOR = 1000; +constexpr ssize_t MICRO_TO_SECONDS_DIVISOR = 1000000; +constexpr ssize_t NANO_TO_SECONDS_DIVISOR = 1000000000; + +typedef struct tagDATE_STRUCT { + int16_t year; + uint16_t month; + uint16_t day; +} DATE_STRUCT; + +typedef struct tagTIME_STRUCT { + uint16_t hour; + uint16_t minute; + uint16_t second; +} TIME_STRUCT; + +typedef struct tagTIMESTAMP_STRUCT { + int16_t year; + uint16_t month; + uint16_t day; + uint16_t hour; + uint16_t minute; + uint16_t second; + uint32_t fraction; +} TIMESTAMP_STRUCT; + +typedef struct tagNUMERIC_STRUCT { + uint8_t precision; + int8_t scale; + uint8_t sign; // The sign field is 1 if positive, 0 if negative. 
+ uint8_t val[16]; //[e], [f] +} NUMERIC_STRUCT; + +enum RowStatus : uint16_t { + RowStatus_SUCCESS = 0, // Same as SQL_ROW_SUCCESS + RowStatus_SUCCESS_WITH_INFO = 6, // Same as SQL_ROW_SUCCESS_WITH_INFO + RowStatus_ERROR = 5, // Same as SQL_ROW_ERROR + RowStatus_NOROW = 3 // Same as SQL_ROW_NOROW +}; + +struct MetadataSettings { + boost::optional string_column_length{boost::none}; + size_t chunk_buffer_capacity; + bool use_wide_char; +}; + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/utils.h b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/utils.h new file mode 100644 index 00000000000..c2c4d020edc --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/utils.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h" + +namespace driver { +namespace odbcabstraction { + +using driver::odbcabstraction::Connection; + +/// Parse a string value to a boolean. +/// \param value the value to be parsed. +/// \return the parsed valued. 
+boost::optional AsBool(const std::string& value); + +/// Looks up for a value inside the ConnPropertyMap and then try to parse it. +/// In case it does not find or it cannot parse, the default value will be returned. +/// \param conn_property_map the map with the connection properties. +/// \param property_name the name of the property that will be looked up. +/// \return the parsed valued. +boost::optional AsBool(const Connection::ConnPropertyMap& conn_property_map, + const std::string_view& property_name); + +/// Looks up for a value inside the ConnPropertyMap and then try to parse it. +/// In case it does not find or it cannot parse, the default value will be returned. +/// \param min_value the minimum value to be parsed, else the default +/// value is returned. \param conn_property_map the map with the connection +/// properties. \param property_name the name of the property that will be +/// looked up. \return the parsed valued. \exception +/// std::invalid_argument exception from std::stoi \exception +/// std::out_of_range exception from std::stoi +boost::optional AsInt32(int32_t min_value, + const Connection::ConnPropertyMap& conn_property_map, + const std::string_view& property_name); +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_connection.cc b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_connection.cc new file mode 100644 index 00000000000..95dc230f3b6 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_connection.cc @@ -0,0 +1,777 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_connection.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/attribute_utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_descriptor.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_environment.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_statement.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/connection.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/statement.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using ODBC::ODBCConnection; +using ODBC::ODBCDescriptor; +using ODBC::ODBCStatement; + +using driver::odbcabstraction::Connection; +using driver::odbcabstraction::Diagnostics; +using driver::odbcabstraction::DriverException; +using driver::odbcabstraction::Statement; + +namespace { +// Key-value pairs separated by semi-colon. +// Note that the value can be wrapped in curly braces to escape other significant +// characters such as semi-colons and equals signs. NOTE: This can be optimized to be +// built statically. +const boost::xpressive::sregex CONNECTION_STR_REGEX( + boost::xpressive::sregex::compile("([^=;]+)=({.+}|[^=;]+|[^;])")); + +// Load properties from the given DSN. 
The properties loaded do _not_ overwrite existing +// entries in the properties. +void loadPropertiesFromDSN(const std::string& dsn, + Connection::ConnPropertyMap& properties) { + const size_t BUFFER_SIZE = 1024 * 10; + std::vector outputBuffer; + outputBuffer.resize(BUFFER_SIZE, '\0'); + SQLSetConfigMode(ODBC_BOTH_DSN); + + SQLGetPrivateProfileString(dsn.c_str(), NULL, "", &outputBuffer[0], BUFFER_SIZE, + "odbc.ini"); + + // The output buffer holds the list of keys in a series of NUL-terminated strings. + // The series is terminated with an empty string (eg a NUL-terminator terminating the + // last key followed by a NUL terminator after). + std::vector keys; + size_t pos = 0; + while (pos < BUFFER_SIZE) { + std::string key(&outputBuffer[pos]); + if (key.empty()) { + break; + } + size_t len = key.size(); + + // Skip over Driver or DSN keys. + if (!boost::iequals(key, "DSN") && !boost::iequals(key, "Driver")) { + keys.emplace_back(std::move(key)); + } + pos += len + 1; + } + + for (auto& key : keys) { + outputBuffer.clear(); + outputBuffer.resize(BUFFER_SIZE, '\0'); + + std::string key_str = std::string(key); + SQLGetPrivateProfileString(dsn.c_str(), key_str.c_str(), "", &outputBuffer[0], + BUFFER_SIZE, "odbc.ini"); + + std::string value = std::string(&outputBuffer[0]); + auto propIter = properties.find(key); + if (propIter == properties.end()) { + properties.emplace(std::make_pair(std::move(key), std::move(value))); + } + } +} + +} // namespace + +// Public +// ========================================================================================= +ODBCConnection::ODBCConnection(ODBCEnvironment& environment, + std::shared_ptr spi_connection) + : environment_(environment), + spi_connection_(std::move(spi_connection)), + is_2x_connection_(environment.GetODBCVersion() == SQL_OV_ODBC2), + is_connected_(false) {} + +Diagnostics& ODBCConnection::GetDiagnosticsImpl() { + return spi_connection_->GetDiagnostics(); +} + +bool ODBCConnection::IsConnected() const { return 
is_connected_; } + +const std::string& ODBCConnection::GetDSN() const { return dsn_; } + +void ODBCConnection::Connect(std::string dsn, + const Connection::ConnPropertyMap& properties, + std::vector& missing_properties) { + if (is_connected_) { + throw DriverException("Already connected.", "HY010"); + } + + dsn_ = std::move(dsn); + spi_connection_->Connect(properties, missing_properties); + is_connected_ = true; + std::shared_ptr spi_statement = spi_connection_->CreateStatement(); + attribute_tracking_statement_ = std::make_shared(*this, spi_statement); +} + +void ODBCConnection::GetInfo(SQLUSMALLINT info_type, SQLPOINTER value, + SQLSMALLINT buffer_length, SQLSMALLINT* output_length, + bool is_unicode) { + switch (info_type) { + case SQL_ACTIVE_ENVIRONMENTS: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; +#ifdef SQL_ASYNC_DBC_FUNCTIONS + case SQL_ASYNC_DBC_FUNCTIONS: + GetAttribute(static_cast(SQL_ASYNC_DBC_NOT_CAPABLE), value, + buffer_length, output_length); + break; +#endif + case SQL_ASYNC_MODE: + GetAttribute(static_cast(SQL_AM_NONE), value, buffer_length, + output_length); + break; +#ifdef SQL_ASYNC_NOTIFICATION + case SQL_ASYNC_NOTIFICATION: + GetAttribute(static_cast(SQL_ASYNC_NOTIFICATION_NOT_CAPABLE), value, + buffer_length, output_length); + break; +#endif + case SQL_BATCH_ROW_COUNT: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_BATCH_SUPPORT: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_DATA_SOURCE_NAME: + GetStringAttribute(is_unicode, dsn_, true, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DRIVER_ODBC_VER: + GetStringAttribute(is_unicode, "03.80", true, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DYNAMIC_CURSOR_ATTRIBUTES1: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_DYNAMIC_CURSOR_ATTRIBUTES2: + 
GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_FORWARD_ONLY_CURSOR_ATTRIBUTES1: + GetAttribute(static_cast(SQL_CA1_NEXT), value, buffer_length, + output_length); + break; + case SQL_FORWARD_ONLY_CURSOR_ATTRIBUTES2: + GetAttribute(static_cast(SQL_CA2_READ_ONLY_CONCURRENCY), value, + buffer_length, output_length); + break; + case SQL_FILE_USAGE: + GetAttribute(static_cast(SQL_FILE_NOT_SUPPORTED), value, + buffer_length, output_length); + break; + case SQL_KEYSET_CURSOR_ATTRIBUTES1: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_KEYSET_CURSOR_ATTRIBUTES2: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_MAX_ASYNC_CONCURRENT_STATEMENTS: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_ODBC_INTERFACE_CONFORMANCE: + GetAttribute(static_cast(SQL_OIC_CORE), value, buffer_length, + output_length); + break; + // case SQL_ODBC_STANDARD_CLI_CONFORMANCE: - mentioned in SQLGetInfo spec with no + // description and there is no constant for this. 
+ case SQL_PARAM_ARRAY_ROW_COUNTS: + GetAttribute(static_cast(SQL_PARC_NO_BATCH), value, buffer_length, + output_length); + break; + case SQL_PARAM_ARRAY_SELECTS: + GetAttribute(static_cast(SQL_PAS_NO_SELECT), value, buffer_length, + output_length); + break; + case SQL_ROW_UPDATES: + GetStringAttribute(is_unicode, "N", true, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_SCROLL_OPTIONS: + GetAttribute(static_cast(SQL_SO_FORWARD_ONLY), value, buffer_length, + output_length); + break; + case SQL_STATIC_CURSOR_ATTRIBUTES1: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_STATIC_CURSOR_ATTRIBUTES2: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_BOOKMARK_PERSISTENCE: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_DESCRIBE_PARAMETER: + GetStringAttribute(is_unicode, "N", true, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_MULT_RESULT_SETS: + GetStringAttribute(is_unicode, "N", true, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_MULTIPLE_ACTIVE_TXN: + GetStringAttribute(is_unicode, "N", true, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_NEED_LONG_DATA_LEN: + GetStringAttribute(is_unicode, "N", true, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_TXN_CAPABLE: + GetAttribute(static_cast(SQL_TC_NONE), value, buffer_length, + output_length); + break; + case SQL_TXN_ISOLATION_OPTION: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_TABLE_TERM: + GetStringAttribute(is_unicode, "table", true, value, buffer_length, output_length, + GetDiagnostics()); + break; + // Deprecated ODBC 2.x fields required for backwards compatibility. 
+ case SQL_ODBC_API_CONFORMANCE: + GetAttribute(static_cast(SQL_OAC_LEVEL1), value, buffer_length, + output_length); + break; + case SQL_FETCH_DIRECTION: + GetAttribute(static_cast(SQL_FETCH_NEXT), value, buffer_length, + output_length); + break; + case SQL_LOCK_TYPES: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_POS_OPERATIONS: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_POSITIONED_STATEMENTS: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_SCROLL_CONCURRENCY: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + case SQL_STATIC_SENSITIVITY: + GetAttribute(static_cast(0), value, buffer_length, output_length); + break; + + // Driver-level string properties. + case SQL_USER_NAME: + case SQL_COLUMN_ALIAS: + case SQL_DBMS_NAME: + case SQL_DBMS_VER: + case SQL_DRIVER_NAME: // TODO: This should be the driver's filename and shouldn't + // come from the SPI. 
+ case SQL_DRIVER_VER: + case SQL_SEARCH_PATTERN_ESCAPE: + case SQL_SERVER_NAME: + case SQL_DATA_SOURCE_READ_ONLY: + case SQL_ACCESSIBLE_TABLES: + case SQL_ACCESSIBLE_PROCEDURES: + case SQL_CATALOG_TERM: + case SQL_COLLATION_SEQ: + case SQL_SCHEMA_TERM: + case SQL_CATALOG_NAME: + case SQL_CATALOG_NAME_SEPARATOR: + case SQL_EXPRESSIONS_IN_ORDERBY: + case SQL_IDENTIFIER_QUOTE_CHAR: + case SQL_INTEGRITY: + case SQL_KEYWORDS: + case SQL_LIKE_ESCAPE_CLAUSE: + case SQL_MAX_ROW_SIZE_INCLUDES_LONG: + case SQL_ORDER_BY_COLUMNS_IN_SELECT: + case SQL_OUTER_JOINS: // Not documented in SQLGetInfo, but other drivers return Y/N + // strings + case SQL_PROCEDURE_TERM: + case SQL_PROCEDURES: + case SQL_SPECIAL_CHARACTERS: + case SQL_XOPEN_CLI_YEAR: { + const auto& info = spi_connection_->GetInfo(info_type); + const std::string& info_value = boost::get(info); + GetStringAttribute(is_unicode, info_value, true, value, buffer_length, + output_length, GetDiagnostics()); + break; + } + + // Driver-level 32-bit integer properties. 
+ case SQL_GETDATA_EXTENSIONS: + case SQL_INFO_SCHEMA_VIEWS: + case SQL_CURSOR_SENSITIVITY: + case SQL_DEFAULT_TXN_ISOLATION: + case SQL_AGGREGATE_FUNCTIONS: + case SQL_ALTER_DOMAIN: + // case SQL_ALTER_SCHEMA: + case SQL_ALTER_TABLE: + case SQL_DATETIME_LITERALS: + case SQL_CATALOG_USAGE: + case SQL_CREATE_ASSERTION: + case SQL_CREATE_CHARACTER_SET: + case SQL_CREATE_COLLATION: + case SQL_CREATE_DOMAIN: + case SQL_CREATE_SCHEMA: + case SQL_CREATE_TABLE: + case SQL_CREATE_TRANSLATION: + case SQL_CREATE_VIEW: + case SQL_INDEX_KEYWORDS: + case SQL_INSERT_STATEMENT: + case SQL_OJ_CAPABILITIES: + case SQL_SCHEMA_USAGE: + case SQL_SQL_CONFORMANCE: + case SQL_SUBQUERIES: + case SQL_UNION: + case SQL_MAX_BINARY_LITERAL_LEN: + case SQL_MAX_CHAR_LITERAL_LEN: + case SQL_MAX_ROW_SIZE: + case SQL_MAX_STATEMENT_LEN: + case SQL_CONVERT_FUNCTIONS: + case SQL_NUMERIC_FUNCTIONS: + case SQL_STRING_FUNCTIONS: + case SQL_SYSTEM_FUNCTIONS: + case SQL_TIMEDATE_ADD_INTERVALS: + case SQL_TIMEDATE_DIFF_INTERVALS: + case SQL_TIMEDATE_FUNCTIONS: + case SQL_CONVERT_BIGINT: + case SQL_CONVERT_BINARY: + case SQL_CONVERT_BIT: + case SQL_CONVERT_CHAR: + case SQL_CONVERT_DATE: + case SQL_CONVERT_DECIMAL: + case SQL_CONVERT_DOUBLE: + case SQL_CONVERT_FLOAT: + case SQL_CONVERT_GUID: + case SQL_CONVERT_INTEGER: + case SQL_CONVERT_INTERVAL_DAY_TIME: + case SQL_CONVERT_INTERVAL_YEAR_MONTH: + case SQL_CONVERT_LONGVARBINARY: + case SQL_CONVERT_LONGVARCHAR: + case SQL_CONVERT_NUMERIC: + case SQL_CONVERT_REAL: + case SQL_CONVERT_SMALLINT: + case SQL_CONVERT_TIME: + case SQL_CONVERT_TIMESTAMP: + case SQL_CONVERT_TINYINT: + case SQL_CONVERT_VARBINARY: + case SQL_CONVERT_VARCHAR: + case SQL_CONVERT_WCHAR: + case SQL_CONVERT_WVARCHAR: + case SQL_CONVERT_WLONGVARCHAR: + case SQL_DDL_INDEX: + case SQL_DROP_ASSERTION: + case SQL_DROP_CHARACTER_SET: + case SQL_DROP_COLLATION: + case SQL_DROP_DOMAIN: + case SQL_DROP_SCHEMA: + case SQL_DROP_TABLE: + case SQL_DROP_TRANSLATION: + case SQL_DROP_VIEW: + case 
SQL_MAX_INDEX_SIZE: + case SQL_SQL92_DATETIME_FUNCTIONS: + case SQL_SQL92_FOREIGN_KEY_DELETE_RULE: + case SQL_SQL92_FOREIGN_KEY_UPDATE_RULE: + case SQL_SQL92_GRANT: + case SQL_SQL92_NUMERIC_VALUE_FUNCTIONS: + case SQL_SQL92_PREDICATES: + case SQL_SQL92_RELATIONAL_JOIN_OPERATORS: + case SQL_SQL92_REVOKE: + case SQL_SQL92_ROW_VALUE_CONSTRUCTOR: + case SQL_SQL92_STRING_FUNCTIONS: + case SQL_SQL92_VALUE_EXPRESSIONS: + case SQL_STANDARD_CLI_CONFORMANCE: { + const auto& info = spi_connection_->GetInfo(info_type); + uint32_t info_value = boost::get(info); + GetAttribute(info_value, value, buffer_length, output_length); + break; + } + + // Driver-level 16-bit integer properties. + case SQL_MAX_CONCURRENT_ACTIVITIES: + case SQL_MAX_DRIVER_CONNECTIONS: + case SQL_CONCAT_NULL_BEHAVIOR: + case SQL_CURSOR_COMMIT_BEHAVIOR: + case SQL_CURSOR_ROLLBACK_BEHAVIOR: + case SQL_NULL_COLLATION: + case SQL_CATALOG_LOCATION: + case SQL_CORRELATION_NAME: + case SQL_GROUP_BY: + case SQL_IDENTIFIER_CASE: + case SQL_NON_NULLABLE_COLUMNS: + case SQL_QUOTED_IDENTIFIER_CASE: + case SQL_MAX_CATALOG_NAME_LEN: + case SQL_MAX_COLUMN_NAME_LEN: + case SQL_MAX_COLUMNS_IN_GROUP_BY: + case SQL_MAX_COLUMNS_IN_INDEX: + case SQL_MAX_COLUMNS_IN_ORDER_BY: + case SQL_MAX_COLUMNS_IN_SELECT: + case SQL_MAX_COLUMNS_IN_TABLE: + case SQL_MAX_CURSOR_NAME_LEN: + case SQL_MAX_IDENTIFIER_LEN: + case SQL_MAX_SCHEMA_NAME_LEN: + case SQL_MAX_TABLE_NAME_LEN: + case SQL_MAX_TABLES_IN_SELECT: + case SQL_MAX_PROCEDURE_NAME_LEN: + case SQL_MAX_USER_NAME_LEN: + case SQL_ODBC_SQL_CONFORMANCE: + case SQL_ODBC_SAG_CLI_CONFORMANCE: { + const auto& info = spi_connection_->GetInfo(info_type); + uint16_t info_value = boost::get(info); + GetAttribute(info_value, value, buffer_length, output_length); + break; + } + + // Special case - SQL_DATABASE_NAME is an alias for SQL_ATTR_CURRENT_CATALOG. 
+ case SQL_DATABASE_NAME: { + const auto& attr = spi_connection_->GetAttribute(Connection::CURRENT_CATALOG); + if (!attr) { + throw DriverException("Optional feature not supported.", "HYC00"); + } + const std::string& info_value = boost::get(*attr); + GetStringAttribute(is_unicode, info_value, true, value, buffer_length, + output_length, GetDiagnostics()); + break; + } + default: + throw DriverException("Unknown SQLGetInfo type: " + std::to_string(info_type)); + } +} + +void ODBCConnection::SetConnectAttr(SQLINTEGER attribute, SQLPOINTER value, + SQLINTEGER string_length, bool is_unicode) { + uint32_t attribute_to_write = 0; + bool successfully_written = false; + switch (attribute) { + // Internal connection attributes +#ifdef SQL_ATR_ASYNC_DBC_EVENT + case SQL_ATTR_ASYNC_DBC_EVENT: + throw DriverException("Optional feature not supported.", "HYC00"); +#endif +#ifdef SQL_ATTR_ASYNC_DBC_FUNCTIONS_ENABLE + case SQL_ATTR_ASYNC_DBC_FUNCTIONS_ENABLE: + throw DriverException("Optional feature not supported.", "HYC00"); +#endif +#ifdef SQL_ATTR_ASYNC_PCALLBACK + case SQL_ATTR_ASYNC_DBC_PCALLBACK: + throw DriverException("Optional feature not supported.", "HYC00"); +#endif +#ifdef SQL_ATTR_ASYNC_DBC_PCONTEXT + case SQL_ATTR_ASYNC_DBC_PCONTEXT: + throw DriverException("Optional feature not supported.", "HYC00"); +#endif + case SQL_ATTR_AUTO_IPD: + throw DriverException("Cannot set read-only attribute", "HY092"); + case SQL_ATTR_AUTOCOMMIT: + CheckIfAttributeIsSetToOnlyValidValue(value, + static_cast(SQL_AUTOCOMMIT_ON)); + return; + case SQL_ATTR_CONNECTION_DEAD: + throw DriverException("Cannot set read-only attribute", "HY092"); +#ifdef SQL_ATTR_DBC_INFO_TOKEN + case SQL_ATTR_DBC_INFO_TOKEN: + throw DriverException("Optional feature not supported.", "HYC00"); +#endif + case SQL_ATTR_ENLIST_IN_DTC: + throw DriverException("Optional feature not supported.", "HYC00"); + case SQL_ATTR_ODBC_CURSORS: // DM-only. 
+ throw DriverException("Invalid attribute", "HY092"); + case SQL_ATTR_QUIET_MODE: + throw DriverException("Cannot set read-only attribute", "HY092"); + case SQL_ATTR_TRACE: // DM-only + throw DriverException("Cannot set read-only attribute", "HY092"); + case SQL_ATTR_TRACEFILE: + throw DriverException("Optional feature not supported.", "HYC00"); + case SQL_ATTR_TRANSLATE_LIB: + throw DriverException("Optional feature not supported.", "HYC00"); + case SQL_ATTR_TRANSLATE_OPTION: + throw DriverException("Optional feature not supported.", "HYC00"); + case SQL_ATTR_TXN_ISOLATION: + throw DriverException("Optional feature not supported.", "HYC00"); + + // ODBCAbstraction-level attributes + case SQL_ATTR_CURRENT_CATALOG: { + std::string catalog; + if (is_unicode) { + SetAttributeUTF8(value, string_length, catalog); + } else { + SetAttributeSQLWCHAR(value, string_length, catalog); + } + if (!spi_connection_->SetAttribute(Connection::CURRENT_CATALOG, catalog)) { + throw DriverException("Option value changed.", "01S02"); + } + return; + } + + // Statement attributes that can be set through the connection. + // Only applies to SQL_ATTR_METADATA_ID, SQL_ATTR_ASYNC_ENABLE, and ODBC 2.x statement + // attributes. SQL_ATTR_ROW_NUMBER is excluded because it is read-only. Note that + // SQLGetConnectAttr cannot retrieve these attributes. 
+ case SQL_ATTR_ASYNC_ENABLE: + case SQL_ATTR_METADATA_ID: + case SQL_ATTR_CONCURRENCY: + case SQL_ATTR_CURSOR_TYPE: + case SQL_ATTR_KEYSET_SIZE: + case SQL_ATTR_MAX_LENGTH: + case SQL_ATTR_MAX_ROWS: + case SQL_ATTR_NOSCAN: + case SQL_ATTR_QUERY_TIMEOUT: + case SQL_ATTR_RETRIEVE_DATA: + case SQL_ATTR_ROW_BIND_TYPE: + case SQL_ATTR_SIMULATE_CURSOR: + case SQL_ATTR_USE_BOOKMARKS: + attribute_tracking_statement_->SetStmtAttr(attribute, value, string_length, + is_unicode); + return; + + case SQL_ATTR_ACCESS_MODE: + SetAttribute(value, attribute_to_write); + successfully_written = + spi_connection_->SetAttribute(Connection::ACCESS_MODE, attribute_to_write); + break; + case SQL_ATTR_CONNECTION_TIMEOUT: + SetAttribute(value, attribute_to_write); + successfully_written = spi_connection_->SetAttribute(Connection::CONNECTION_TIMEOUT, + attribute_to_write); + break; + case SQL_ATTR_LOGIN_TIMEOUT: + SetAttribute(value, attribute_to_write); + successfully_written = + spi_connection_->SetAttribute(Connection::LOGIN_TIMEOUT, attribute_to_write); + break; + case SQL_ATTR_PACKET_SIZE: + SetAttribute(value, attribute_to_write); + successfully_written = + spi_connection_->SetAttribute(Connection::PACKET_SIZE, attribute_to_write); + break; + default: + throw DriverException("Invalid attribute: " + std::to_string(attribute), "HY092"); + } + + if (!successfully_written) { + GetDiagnostics().AddWarning("Option value changed.", "01S02", + driver::odbcabstraction::ODBCErrorCodes_GENERAL_WARNING); + } +} + +void ODBCConnection::GetConnectAttr(SQLINTEGER attribute, SQLPOINTER value, + SQLINTEGER buffer_length, SQLINTEGER* output_length, + bool is_unicode) { + using driver::odbcabstraction::Connection; + boost::optional spi_attribute; + + switch (attribute) { + // Internal connection attributes +#ifdef SQL_ATR_ASYNC_DBC_EVENT + case SQL_ATTR_ASYNC_DBC_EVENT: + GetAttribute(static_cast(NULL), value, buffer_length, output_length); + return; +#endif +#ifdef SQL_ATTR_ASYNC_DBC_FUNCTIONS_ENABLE + 
case SQL_ATTR_ASYNC_DBC_FUNCTIONS_ENABLE: + GetAttribute(static_cast(SQL_ASYNC_DBC_ENABLE_OFF), value, + buffer_length, output_length); + return; +#endif +#ifdef SQL_ATTR_ASYNC_PCALLBACK + case SQL_ATTR_ASYNC_DBC_PCALLBACK: + GetAttribute(static_cast(NULL), value, buffer_length, output_length); + return; +#endif +#ifdef SQL_ATTR_ASYNC_DBC_PCONTEXT + case SQL_ATTR_ASYNC_DBC_PCONTEXT: + GetAttribute(static_cast(NULL), value, buffer_length, output_length); + return; +#endif + case SQL_ATTR_ASYNC_ENABLE: + GetAttribute(static_cast(SQL_ASYNC_ENABLE_OFF), value, buffer_length, + output_length); + return; + case SQL_ATTR_AUTO_IPD: + GetAttribute(static_cast(SQL_FALSE), value, buffer_length, + output_length); + return; + case SQL_ATTR_AUTOCOMMIT: + GetAttribute(static_cast(SQL_AUTOCOMMIT_ON), value, buffer_length, + output_length); + return; +#ifdef SQL_ATTR_DBC_INFO_TOKEN + case SQL_ATTR_DBC_INFO_TOKEN: + throw DriverException("Cannot read set-only attribute", "HY092"); +#endif + case SQL_ATTR_ENLIST_IN_DTC: + GetAttribute(static_cast(NULL), value, buffer_length, output_length); + return; + case SQL_ATTR_ODBC_CURSORS: // DM-only. + throw DriverException("Invalid attribute", "HY092"); + case SQL_ATTR_QUIET_MODE: + GetAttribute(static_cast(NULL), value, buffer_length, output_length); + return; + case SQL_ATTR_TRACE: // DM-only + throw DriverException("Invalid attribute", "HY092"); + case SQL_ATTR_TRACEFILE: + throw DriverException("Optional feature not supported.", "HYC00"); + case SQL_ATTR_TRANSLATE_LIB: + throw DriverException("Optional feature not supported.", "HYC00"); + case SQL_ATTR_TRANSLATE_OPTION: + throw DriverException("Optional feature not supported.", "HYC00"); + case SQL_ATTR_TXN_ISOLATION: + throw DriverException("Optional feature not supported.", "HCY00"); + + // ODBCAbstraction-level connection attributes. 
+ case SQL_ATTR_CURRENT_CATALOG: { + const auto& catalog = spi_connection_->GetAttribute(Connection::CURRENT_CATALOG); + if (!catalog) { + throw DriverException("Optional feature not supported.", "HYC00"); + } + const std::string& info_value = boost::get(*catalog); + GetStringAttribute(is_unicode, info_value, true, value, buffer_length, + output_length, GetDiagnostics()); + return; + } + + // These all are uint32_t attributes. + case SQL_ATTR_ACCESS_MODE: + spi_attribute = spi_connection_->GetAttribute(Connection::ACCESS_MODE); + break; + case SQL_ATTR_CONNECTION_DEAD: + spi_attribute = spi_connection_->GetAttribute(Connection::CONNECTION_DEAD); + break; + case SQL_ATTR_CONNECTION_TIMEOUT: + spi_attribute = spi_connection_->GetAttribute(Connection::CONNECTION_TIMEOUT); + break; + case SQL_ATTR_LOGIN_TIMEOUT: + spi_attribute = spi_connection_->GetAttribute(Connection::LOGIN_TIMEOUT); + break; + case SQL_ATTR_PACKET_SIZE: + spi_attribute = spi_connection_->GetAttribute(Connection::PACKET_SIZE); + break; + default: + throw DriverException("Invalid attribute", "HY092"); + } + + if (!spi_attribute) { + throw DriverException("Invalid attribute", "HY092"); + } + + GetAttribute(static_cast(boost::get(*spi_attribute)), value, + buffer_length, output_length); +} + +void ODBCConnection::Disconnect() { + if (is_connected_) { + // Ensure that all statements (and corresponding SPI statements) get cleaned + // up before terminating the SPI connection in case they need to be de-allocated in + // the reverse of the allocation order. 
+ statements_.clear(); + spi_connection_->Close(); + is_connected_ = false; + } +} + +void ODBCConnection::ReleaseConnection() { + Disconnect(); + environment_.DropConnection(this); +} + +std::shared_ptr ODBCConnection::CreateStatement() { + std::shared_ptr spi_statement = spi_connection_->CreateStatement(); + std::shared_ptr statement = + std::make_shared(*this, spi_statement); + statements_.push_back(statement); + statement->CopyAttributesFromConnection(*this); + return statement; +} + +void ODBCConnection::DropStatement(ODBCStatement* stmt) { + auto it = std::find_if(statements_.begin(), statements_.end(), + [&stmt](const std::shared_ptr& statement) { + return statement.get() == stmt; + }); + if (statements_.end() != it) { + statements_.erase(it); + } +} + +std::shared_ptr ODBCConnection::CreateDescriptor() { + std::shared_ptr desc = std::make_shared( + spi_connection_->GetDiagnostics(), this, nullptr, true, true, false); + descriptors_.push_back(desc); + return desc; +} + +void ODBCConnection::DropDescriptor(ODBCDescriptor* desc) { + auto it = std::find_if(descriptors_.begin(), descriptors_.end(), + [&desc](const std::shared_ptr& descriptor) { + return descriptor.get() == desc; + }); + if (descriptors_.end() != it) { + descriptors_.erase(it); + } +} + +// Public Static +// =================================================================================== +std::string ODBCConnection::GetPropertiesFromConnString( + const std::string& conn_str, Connection::ConnPropertyMap& properties) { + const int groups[] = {1, 2}; // CONNECTION_STR_REGEX has two groups. 
key: 1, value: 2 + boost::xpressive::sregex_token_iterator regex_iter(conn_str.begin(), conn_str.end(), + CONNECTION_STR_REGEX, groups), + end; + + bool is_dsn_first = false; + bool is_driver_first = false; + std::string dsn; + for (auto it = regex_iter; end != regex_iter; ++regex_iter) { + std::string key = *regex_iter; + std::string value = *++regex_iter; + + // If the DSN shows up before driver key, load settings from the DSN. + // Only load values from the DSN once regardless of how many times the DSN + // key shows up. + if (boost::iequals(key, "DSN")) { + if (!is_driver_first) { + if (!is_dsn_first) { + is_dsn_first = true; + loadPropertiesFromDSN(value, properties); + dsn.swap(value); + } + } + continue; + } else if (boost::iequals(key, "Driver")) { + if (!is_dsn_first) { + is_driver_first = true; + } + continue; + } + + // Strip wrapping curly braces. + if (value.size() >= 2 && value[0] == '{' && value[value.size() - 1] == '}') { + value = value.substr(1, value.size() - 2); + } + + // Overwrite the existing value. Later copies of the key take precedence, + // including over entries in the DSN. + properties[key] = std::move(value); + } + return dsn; +} diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_descriptor.cc b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_descriptor.cc new file mode 100644 index 00000000000..990209bdcc3 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_descriptor.cc @@ -0,0 +1,578 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_descriptor.h" + +#include +#include +#include +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/exceptions.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/attribute_utils.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_connection.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/odbc_statement.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/odbc_impl/type_utilities.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/result_set_metadata.h" +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/spi/statement.h" + +using ODBC::DescriptorRecord; +using ODBC::ODBCConnection; +using ODBC::ODBCDescriptor; +using ODBC::ODBCStatement; + +using driver::odbcabstraction::Diagnostics; +using driver::odbcabstraction::DriverException; +using driver::odbcabstraction::ResultSetMetadata; + +namespace { +SQLSMALLINT CalculateHighestBoundRecord(const std::vector& records) { + // Most applications will bind every column, so optimistically assume that we'll + // find the next bound record fastest by counting backwards. 
+ for (size_t i = records.size(); i > 0; --i) { + if (records[i - 1].is_bound) { + return i; + } + } + return 0; +} +} // namespace + +// Public +// ========================================================================================= +ODBCDescriptor::ODBCDescriptor(Diagnostics& base_diagnostics, ODBCConnection* conn, + ODBCStatement* stmt, bool is_app_descriptor, + bool is_writable, bool is_2x_connection) + : diagnostics_(base_diagnostics.GetVendor(), + base_diagnostics.GetDataSourceComponent(), + driver::odbcabstraction::V_3), + owning_connection_(conn), + parent_statement_(stmt), + array_status_ptr_(nullptr), + bind_offset_ptr_(nullptr), + rows_processed_ptr_(nullptr), + array_size_(1), + bind_type_(SQL_BIND_BY_COLUMN), + highest_one_based_bound_record_(0), + is_2x_connection_(is_2x_connection), + is_app_descriptor_(is_app_descriptor), + is_writable_(is_writable), + has_bindings_changed_(true) {} + +Diagnostics& ODBCDescriptor::GetDiagnosticsImpl() { return diagnostics_; } + +ODBCConnection& ODBCDescriptor::GetConnection() { + if (owning_connection_) { + return *owning_connection_; + } + assert(parent_statement_); + return parent_statement_->GetConnection(); +} + +void ODBCDescriptor::SetHeaderField(SQLSMALLINT field_identifier, SQLPOINTER value, + SQLINTEGER buffer_length) { + // Only these two fields can be set on the IRD. 
+ if (!is_writable_ && field_identifier != SQL_DESC_ARRAY_STATUS_PTR && + field_identifier != SQL_DESC_ROWS_PROCESSED_PTR) { + throw DriverException("Cannot modify read-only descriptor", "HY016"); + } + + switch (field_identifier) { + case SQL_DESC_ALLOC_TYPE: + throw DriverException("Invalid descriptor field", "HY091"); + case SQL_DESC_ARRAY_SIZE: + SetAttribute(value, array_size_); + has_bindings_changed_ = true; + break; + case SQL_DESC_ARRAY_STATUS_PTR: + SetPointerAttribute(value, array_status_ptr_); + has_bindings_changed_ = true; + break; + case SQL_DESC_BIND_OFFSET_PTR: + SetPointerAttribute(value, bind_offset_ptr_); + has_bindings_changed_ = true; + break; + case SQL_DESC_BIND_TYPE: + SetAttribute(value, bind_type_); + has_bindings_changed_ = true; + break; + case SQL_DESC_ROWS_PROCESSED_PTR: + SetPointerAttribute(value, rows_processed_ptr_); + has_bindings_changed_ = true; + break; + case SQL_DESC_COUNT: { + SQLSMALLINT new_count; + SetAttribute(value, new_count); + records_.resize(new_count); + + if (is_app_descriptor_ && new_count <= highest_one_based_bound_record_) { + highest_one_based_bound_record_ = CalculateHighestBoundRecord(records_); + } else { + highest_one_based_bound_record_ = new_count; + } + has_bindings_changed_ = true; + break; + } + default: + throw DriverException("Invalid descriptor field", "HY091"); + } +} + +void ODBCDescriptor::SetField(SQLSMALLINT record_number, SQLSMALLINT field_identifier, + SQLPOINTER value, SQLINTEGER buffer_length) { + if (!is_writable_) { + throw DriverException("Cannot modify read-only descriptor", "HY016"); + } + + // Handle header fields before validating the record number. 
+ switch (field_identifier) { + case SQL_DESC_ALLOC_TYPE: + case SQL_DESC_ARRAY_SIZE: + case SQL_DESC_ARRAY_STATUS_PTR: + case SQL_DESC_BIND_OFFSET_PTR: + case SQL_DESC_BIND_TYPE: + case SQL_DESC_ROWS_PROCESSED_PTR: + case SQL_DESC_COUNT: + SetHeaderField(field_identifier, value, buffer_length); + return; + default: + break; + } + + if (record_number == 0) { + throw DriverException("Bookmarks are unsupported.", "07009"); + } + + if (record_number > records_.size()) { + throw DriverException("Invalid descriptor index", "HY009"); + } + + SQLSMALLINT zero_based_record = record_number - 1; + DescriptorRecord& record = records_[zero_based_record]; + switch (field_identifier) { + case SQL_DESC_AUTO_UNIQUE_VALUE: + case SQL_DESC_BASE_COLUMN_NAME: + case SQL_DESC_BASE_TABLE_NAME: + case SQL_DESC_CASE_SENSITIVE: + case SQL_DESC_CATALOG_NAME: + case SQL_DESC_DISPLAY_SIZE: + case SQL_DESC_FIXED_PREC_SCALE: + case SQL_DESC_LABEL: + case SQL_DESC_LITERAL_PREFIX: + case SQL_DESC_LITERAL_SUFFIX: + case SQL_DESC_LOCAL_TYPE_NAME: + case SQL_DESC_NULLABLE: + case SQL_DESC_NUM_PREC_RADIX: + case SQL_DESC_ROWVER: + case SQL_DESC_SCHEMA_NAME: + case SQL_DESC_SEARCHABLE: + case SQL_DESC_TABLE_NAME: + case SQL_DESC_TYPE_NAME: + case SQL_DESC_UNNAMED: + case SQL_DESC_UNSIGNED: + case SQL_DESC_UPDATABLE: + throw DriverException("Cannot modify read-only field.", "HY092"); + case SQL_DESC_CONCISE_TYPE: + SetAttribute(value, record.concise_type); + record.is_bound = false; + has_bindings_changed_ = true; + break; + case SQL_DESC_DATA_PTR: + SetDataPtrOnRecord(value, record_number); + break; + case SQL_DESC_DATETIME_INTERVAL_CODE: + SetAttribute(value, record.datetime_interval_code); + record.is_bound = false; + has_bindings_changed_ = true; + break; + case SQL_DESC_DATETIME_INTERVAL_PRECISION: + SetAttribute(value, record.datetime_interval_precision); + record.is_bound = false; + has_bindings_changed_ = true; + break; + case SQL_DESC_INDICATOR_PTR: + case SQL_DESC_OCTET_LENGTH_PTR: + 
SetPointerAttribute(value, record.indicator_ptr); + has_bindings_changed_ = true; + break; + case SQL_DESC_LENGTH: + SetAttribute(value, record.length); + record.is_bound = false; + has_bindings_changed_ = true; + break; + case SQL_DESC_NAME: + SetAttributeUTF8(value, buffer_length, record.name); + has_bindings_changed_ = true; + break; + case SQL_DESC_OCTET_LENGTH: + SetAttribute(value, record.octet_length); + record.is_bound = false; + has_bindings_changed_ = true; + break; + case SQL_DESC_PARAMETER_TYPE: + SetAttribute(value, record.param_type); + record.is_bound = false; + has_bindings_changed_ = true; + break; + case SQL_DESC_PRECISION: + SetAttribute(value, record.precision); + record.is_bound = false; + has_bindings_changed_ = true; + break; + case SQL_DESC_SCALE: + SetAttribute(value, record.scale); + record.is_bound = false; + has_bindings_changed_ = true; + break; + case SQL_DESC_TYPE: + SetAttribute(value, record.type); + record.is_bound = false; + has_bindings_changed_ = true; + break; + default: + throw DriverException("Invalid descriptor field", "HY091"); + } +} + +void ODBCDescriptor::GetHeaderField(SQLSMALLINT field_identifier, SQLPOINTER value, + SQLINTEGER buffer_length, + SQLINTEGER* output_length) const { + switch (field_identifier) { + case SQL_DESC_ALLOC_TYPE: { + SQLSMALLINT result; + if (owning_connection_) { + result = SQL_DESC_ALLOC_USER; + } else { + result = SQL_DESC_ALLOC_AUTO; + } + GetAttribute(result, value, buffer_length, output_length); + break; + } + case SQL_DESC_ARRAY_SIZE: + GetAttribute(array_size_, value, buffer_length, output_length); + break; + case SQL_DESC_ARRAY_STATUS_PTR: + GetAttribute(array_status_ptr_, value, buffer_length, output_length); + break; + case SQL_DESC_BIND_OFFSET_PTR: + GetAttribute(bind_offset_ptr_, value, buffer_length, output_length); + break; + case SQL_DESC_BIND_TYPE: + GetAttribute(bind_type_, value, buffer_length, output_length); + break; + case SQL_DESC_ROWS_PROCESSED_PTR: + 
GetAttribute(rows_processed_ptr_, value, buffer_length, output_length); + break; + case SQL_DESC_COUNT: { + GetAttribute(highest_one_based_bound_record_, value, buffer_length, output_length); + break; + } + default: + throw DriverException("Invalid descriptor field", "HY091"); + } +} + +void ODBCDescriptor::GetField(SQLSMALLINT record_number, SQLSMALLINT field_identifier, + SQLPOINTER value, SQLINTEGER buffer_length, + SQLINTEGER* output_length) { + // Handle header fields before validating the record number. + switch (field_identifier) { + case SQL_DESC_ALLOC_TYPE: + case SQL_DESC_ARRAY_SIZE: + case SQL_DESC_ARRAY_STATUS_PTR: + case SQL_DESC_BIND_OFFSET_PTR: + case SQL_DESC_BIND_TYPE: + case SQL_DESC_ROWS_PROCESSED_PTR: + case SQL_DESC_COUNT: + GetHeaderField(field_identifier, value, buffer_length, output_length); + return; + default: + break; + } + + if (record_number == 0) { + throw DriverException("Bookmarks are unsupported.", "07009"); + } + + if (record_number > records_.size()) { + throw DriverException("Invalid descriptor index", "07009"); + } + + // TODO: Restrict fields based on AppDescriptor IPD, and IRD. 
+ + SQLSMALLINT zero_based_record = record_number - 1; + const DescriptorRecord& record = records_[zero_based_record]; + switch (field_identifier) { + case SQL_DESC_BASE_COLUMN_NAME: + GetAttributeUTF8(record.base_column_name, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_BASE_TABLE_NAME: + GetAttributeUTF8(record.base_table_name, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_CATALOG_NAME: + GetAttributeUTF8(record.catalog_name, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_LABEL: + GetAttributeUTF8(record.label, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_LITERAL_PREFIX: + GetAttributeUTF8(record.literal_prefix, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_LITERAL_SUFFIX: + GetAttributeUTF8(record.literal_suffix, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_LOCAL_TYPE_NAME: + GetAttributeUTF8(record.local_type_name, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_NAME: + GetAttributeUTF8(record.name, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_SCHEMA_NAME: + GetAttributeUTF8(record.schema_name, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_TABLE_NAME: + GetAttributeUTF8(record.table_name, value, buffer_length, output_length, + GetDiagnostics()); + break; + case SQL_DESC_TYPE_NAME: + GetAttributeUTF8(record.type_name, value, buffer_length, output_length, + GetDiagnostics()); + break; + + case SQL_DESC_DATA_PTR: + GetAttribute(record.data_ptr, value, buffer_length, output_length); + break; + case SQL_DESC_INDICATOR_PTR: + case SQL_DESC_OCTET_LENGTH_PTR: + GetAttribute(record.indicator_ptr, value, buffer_length, output_length); + break; + + case SQL_DESC_LENGTH: + GetAttribute(record.length, value, buffer_length, output_length); + 
break; + case SQL_DESC_OCTET_LENGTH: + GetAttribute(record.octet_length, value, buffer_length, output_length); + break; + + case SQL_DESC_AUTO_UNIQUE_VALUE: + GetAttribute(record.auto_unique_value, value, buffer_length, output_length); + break; + case SQL_DESC_CASE_SENSITIVE: + GetAttribute(record.case_sensitive, value, buffer_length, output_length); + break; + case SQL_DESC_DATETIME_INTERVAL_PRECISION: + GetAttribute(record.datetime_interval_precision, value, buffer_length, + output_length); + break; + case SQL_DESC_NUM_PREC_RADIX: + GetAttribute(record.num_prec_radix, value, buffer_length, output_length); + break; + + case SQL_DESC_CONCISE_TYPE: + GetAttribute(record.concise_type, value, buffer_length, output_length); + break; + case SQL_DESC_DATETIME_INTERVAL_CODE: + GetAttribute(record.datetime_interval_code, value, buffer_length, output_length); + break; + case SQL_DESC_DISPLAY_SIZE: + GetAttribute(record.display_size, value, buffer_length, output_length); + break; + case SQL_DESC_FIXED_PREC_SCALE: + GetAttribute(record.fixed_prec_scale, value, buffer_length, output_length); + break; + case SQL_DESC_NULLABLE: + GetAttribute(record.nullable, value, buffer_length, output_length); + break; + case SQL_DESC_PARAMETER_TYPE: + GetAttribute(record.param_type, value, buffer_length, output_length); + break; + case SQL_DESC_PRECISION: + GetAttribute(record.precision, value, buffer_length, output_length); + break; + case SQL_DESC_ROWVER: + GetAttribute(record.row_ver, value, buffer_length, output_length); + break; + case SQL_DESC_SCALE: + GetAttribute(record.scale, value, buffer_length, output_length); + break; + case SQL_DESC_SEARCHABLE: + GetAttribute(record.searchable, value, buffer_length, output_length); + break; + case SQL_DESC_TYPE: + GetAttribute(record.type, value, buffer_length, output_length); + break; + case SQL_DESC_UNNAMED: + GetAttribute(record.unnamed, value, buffer_length, output_length); + break; + case SQL_DESC_UNSIGNED: + 
GetAttribute(record.is_unsigned, value, buffer_length, output_length); + break; + case SQL_DESC_UPDATABLE: + GetAttribute(record.updatable, value, buffer_length, output_length); + break; + default: + throw DriverException("Invalid descriptor field", "HY091"); + } +} + +SQLSMALLINT ODBCDescriptor::GetAllocType() const { + return owning_connection_ != nullptr ? SQL_DESC_ALLOC_USER : SQL_DESC_ALLOC_AUTO; +} + +bool ODBCDescriptor::IsAppDescriptor() const { return is_app_descriptor_; } + +void ODBCDescriptor::RegisterToStatement(ODBCStatement* statement, bool is_apd) { + if (is_apd) { + registered_on_statements_as_apd_.push_back(statement); + } else { + registered_on_statements_as_ard_.push_back(statement); + } +} + +void ODBCDescriptor::DetachFromStatement(ODBCStatement* statement, bool is_apd) { + auto& vector_to_update = + is_apd ? registered_on_statements_as_apd_ : registered_on_statements_as_ard_; + auto it = std::find(vector_to_update.begin(), vector_to_update.end(), statement); + if (it != vector_to_update.end()) { + vector_to_update.erase(it); + } +} + +void ODBCDescriptor::ReleaseDescriptor() { + for (ODBCStatement* stmt : registered_on_statements_as_apd_) { + stmt->RevertAppDescriptor(true); + } + + for (ODBCStatement* stmt : registered_on_statements_as_ard_) { + stmt->RevertAppDescriptor(false); + } + + if (owning_connection_) { + owning_connection_->DropDescriptor(this); + } +} + +void ODBCDescriptor::PopulateFromResultSetMetadata(ResultSetMetadata* rsmd) { + records_.assign(rsmd->GetColumnCount(), DescriptorRecord()); + highest_one_based_bound_record_ = records_.size() + 1; + + for (size_t i = 0; i < records_.size(); ++i) { + size_t one_based_index = i + 1; + int16_t concise_type = rsmd->GetConciseType(one_based_index); + + records_[i].base_column_name = rsmd->GetBaseColumnName(one_based_index); + records_[i].base_table_name = rsmd->GetBaseTableName(one_based_index); + records_[i].catalog_name = rsmd->GetCatalogName(one_based_index); + records_[i].label = 
rsmd->GetColumnLabel(one_based_index); + records_[i].literal_prefix = rsmd->GetLiteralPrefix(one_based_index); + records_[i].literal_suffix = rsmd->GetLiteralSuffix(one_based_index); + records_[i].local_type_name = rsmd->GetLocalTypeName(one_based_index); + records_[i].name = rsmd->GetName(one_based_index); + records_[i].schema_name = rsmd->GetSchemaName(one_based_index); + records_[i].table_name = rsmd->GetTableName(one_based_index); + records_[i].type_name = rsmd->GetTypeName(one_based_index, concise_type); + records_[i].concise_type = GetSqlTypeForODBCVersion(concise_type, is_2x_connection_); + records_[i].data_ptr = nullptr; + records_[i].indicator_ptr = nullptr; + records_[i].display_size = rsmd->GetColumnDisplaySize(one_based_index); + records_[i].octet_length = rsmd->GetOctetLength(one_based_index); + records_[i].length = rsmd->GetLength(one_based_index); + records_[i].auto_unique_value = + rsmd->IsAutoUnique(one_based_index) ? SQL_TRUE : SQL_FALSE; + records_[i].case_sensitive = + rsmd->IsCaseSensitive(one_based_index) ? SQL_TRUE : SQL_FALSE; + records_[i].datetime_interval_precision; // TODO - update when rsmd adds this + SQLINTEGER num_prec_radix = rsmd->GetNumPrecRadix(one_based_index); + records_[i].num_prec_radix = num_prec_radix > 0 ? num_prec_radix : 0; + records_[i].datetime_interval_code; // TODO + records_[i].fixed_prec_scale = + rsmd->IsFixedPrecScale(one_based_index) ? SQL_TRUE : SQL_FALSE; + records_[i].nullable = rsmd->IsNullable(one_based_index); + records_[i].param_type = SQL_PARAM_INPUT; + records_[i].precision = rsmd->GetPrecision(one_based_index); + records_[i].row_ver = SQL_FALSE; + records_[i].scale = rsmd->GetScale(one_based_index); + records_[i].searchable = rsmd->IsSearchable(one_based_index); + records_[i].type = rsmd->GetDataType(one_based_index); + records_[i].unnamed = records_[i].name.empty() ? SQL_TRUE : SQL_FALSE; + records_[i].is_unsigned = rsmd->IsUnsigned(one_based_index) ? 
SQL_TRUE : SQL_FALSE; + records_[i].updatable = rsmd->GetUpdatable(one_based_index); + } +} + +const std::vector& ODBCDescriptor::GetRecords() const { + return records_; +} + +std::vector& ODBCDescriptor::GetRecords() { return records_; } + +void ODBCDescriptor::BindCol(SQLSMALLINT record_number, SQLSMALLINT c_type, + SQLPOINTER data_ptr, SQLLEN buffer_length, + SQLLEN* indicator_ptr) { + assert(is_app_descriptor_); + assert(is_writable_); + + // The set of records auto-expands to the supplied record number. + if (records_.size() < record_number) { + records_.resize(record_number); + } + + SQLSMALLINT zero_based_record_index = record_number - 1; + DescriptorRecord& record = records_[zero_based_record_index]; + + record.type = c_type; + record.indicator_ptr = indicator_ptr; + record.length = buffer_length; + + // Initialize default precision and scale for SQL_C_NUMERIC. + if (record.type == SQL_C_NUMERIC) { + record.precision = 38; + record.scale = 0; + } + SetDataPtrOnRecord(data_ptr, record_number); +} + +void ODBCDescriptor::SetDataPtrOnRecord(SQLPOINTER data_ptr, SQLSMALLINT record_number) { + assert(record_number <= records_.size()); + DescriptorRecord& record = records_[record_number - 1]; + if (data_ptr) { + record.CheckConsistency(); + record.is_bound = true; + } else { + record.is_bound = false; + } + record.data_ptr = data_ptr; + + // Bookkeeping on the highest bound record (used for returning SQL_DESC_COUNT) + if (highest_one_based_bound_record_ < record_number && data_ptr) { + highest_one_based_bound_record_ = record_number; + } else if (highest_one_based_bound_record_ == record_number && !data_ptr) { + highest_one_based_bound_record_ = CalculateHighestBoundRecord(records_); + } + has_bindings_changed_ = true; +} + +void DescriptorRecord::CheckConsistency() { + // TODO +} diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_environment.cc b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_environment.cc new file mode 
100644 index 00000000000..d0f93813e8f --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_environment.cc @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include + +using ODBC::ODBCConnection; +using ODBC::ODBCEnvironment; + +using driver::odbcabstraction::Connection; +using driver::odbcabstraction::Diagnostics; +using driver::odbcabstraction::Driver; + +// Public +// ========================================================================================= +ODBCEnvironment::ODBCEnvironment(std::shared_ptr driver) + : driver_(std::move(driver)), + diagnostics_(new Diagnostics(driver_->GetDiagnostics().GetVendor(), + driver_->GetDiagnostics().GetDataSourceComponent(), + driver::odbcabstraction::V_2)), + version_(SQL_OV_ODBC2), + connection_pooling_(SQL_CP_OFF) {} + +Diagnostics& ODBCEnvironment::GetDiagnosticsImpl() { return *diagnostics_; } + +SQLINTEGER ODBCEnvironment::GetODBCVersion() const { return version_; } + +void ODBCEnvironment::SetODBCVersion(SQLINTEGER version) { + if (version != version_) { + version_ = version; + diagnostics_.reset( + new Diagnostics(diagnostics_->GetVendor(), 
diagnostics_->GetDataSourceComponent(), + version == SQL_OV_ODBC2 ? driver::odbcabstraction::V_2 + : driver::odbcabstraction::V_3)); + } +} + +SQLINTEGER ODBCEnvironment::GetConnectionPooling() const { return connection_pooling_; } + +void ODBCEnvironment::SetConnectionPooling(SQLINTEGER connection_pooling) { + connection_pooling_ = connection_pooling; +} + +std::shared_ptr ODBCEnvironment::CreateConnection() { + std::shared_ptr spi_connection = + driver_->CreateConnection(version_ == SQL_OV_ODBC2 ? driver::odbcabstraction::V_2 + : driver::odbcabstraction::V_3); + std::shared_ptr new_conn = + std::make_shared(*this, spi_connection); + connections_.push_back(new_conn); + return new_conn; +} + +void ODBCEnvironment::DropConnection(ODBCConnection* conn) { + auto it = std::find_if(connections_.begin(), connections_.end(), + [&conn](const std::shared_ptr& connection) { + return connection.get() == conn; + }); + if (connections_.end() != it) { + connections_.erase(it); + } +} diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_statement.cc b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_statement.cc new file mode 100644 index 00000000000..231653acb25 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/odbc_impl/odbc_statement.cc @@ -0,0 +1,788 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using ODBC::DescriptorRecord; +using ODBC::ODBCConnection; +using ODBC::ODBCDescriptor; +using ODBC::ODBCStatement; + +using driver::odbcabstraction::DriverException; +using driver::odbcabstraction::ResultSetMetadata; +using driver::odbcabstraction::Statement; + +namespace { +void DescriptorToHandle(SQLPOINTER output, ODBCDescriptor* descriptor, + SQLINTEGER* len_ptr) { + if (output) { + SQLHANDLE* output_handle = static_cast(output); + *output_handle = reinterpret_cast(descriptor); + } + if (len_ptr) { + *len_ptr = sizeof(SQLHANDLE); + } +} + +size_t GetLength(const DescriptorRecord& record) { + switch (record.type) { + case SQL_C_CHAR: + case SQL_C_WCHAR: + case SQL_C_BINARY: + return record.length; + + case SQL_C_BIT: + case SQL_C_TINYINT: + case SQL_C_STINYINT: + case SQL_C_UTINYINT: + return sizeof(SQLSCHAR); + + case SQL_C_SHORT: + case SQL_C_SSHORT: + case SQL_C_USHORT: + return sizeof(SQLSMALLINT); + + case SQL_C_LONG: + case SQL_C_SLONG: + case SQL_C_ULONG: + case SQL_C_FLOAT: + return sizeof(SQLINTEGER); + + case SQL_C_SBIGINT: + case SQL_C_UBIGINT: + case SQL_C_DOUBLE: + return sizeof(SQLBIGINT); + + case SQL_C_NUMERIC: + return sizeof(SQL_NUMERIC_STRUCT); + + case SQL_C_DATE: + case SQL_C_TYPE_DATE: + return sizeof(SQL_DATE_STRUCT); + + case SQL_C_TIME: + case SQL_C_TYPE_TIME: + return sizeof(SQL_TIME_STRUCT); + + case SQL_C_TIMESTAMP: + case SQL_C_TYPE_TIMESTAMP: + return 
sizeof(SQL_TIMESTAMP_STRUCT); + + case SQL_C_INTERVAL_DAY: + case SQL_C_INTERVAL_DAY_TO_HOUR: + case SQL_C_INTERVAL_DAY_TO_MINUTE: + case SQL_C_INTERVAL_DAY_TO_SECOND: + case SQL_C_INTERVAL_HOUR: + case SQL_C_INTERVAL_HOUR_TO_MINUTE: + case SQL_C_INTERVAL_HOUR_TO_SECOND: + case SQL_C_INTERVAL_MINUTE: + case SQL_C_INTERVAL_MINUTE_TO_SECOND: + case SQL_C_INTERVAL_SECOND: + case SQL_C_INTERVAL_YEAR: + case SQL_C_INTERVAL_YEAR_TO_MONTH: + case SQL_C_INTERVAL_MONTH: + return sizeof(SQL_INTERVAL_STRUCT); + default: + return record.length; + } +} + +SQLSMALLINT getc_typeForSQLType(const DescriptorRecord& record) { + switch (record.concise_type) { + case SQL_CHAR: + case SQL_VARCHAR: + case SQL_LONGVARCHAR: + return SQL_C_CHAR; + + case SQL_WCHAR: + case SQL_WVARCHAR: + case SQL_WLONGVARCHAR: + return SQL_C_WCHAR; + + case SQL_BINARY: + case SQL_VARBINARY: + case SQL_LONGVARBINARY: + return SQL_C_BINARY; + + case SQL_TINYINT: + return record.is_unsigned ? SQL_C_UTINYINT : SQL_C_STINYINT; + + case SQL_SMALLINT: + return record.is_unsigned ? SQL_C_USHORT : SQL_C_SSHORT; + + case SQL_INTEGER: + return record.is_unsigned ? SQL_C_ULONG : SQL_C_SLONG; + + case SQL_BIGINT: + return record.is_unsigned ? 
SQL_C_UBIGINT : SQL_C_SBIGINT; + + case SQL_REAL: + return SQL_C_FLOAT; + + case SQL_FLOAT: + case SQL_DOUBLE: + return SQL_C_DOUBLE; + + case SQL_DATE: + case SQL_TYPE_DATE: + return SQL_C_TYPE_DATE; + + case SQL_TIME: + case SQL_TYPE_TIME: + return SQL_C_TYPE_TIME; + + case SQL_TIMESTAMP: + case SQL_TYPE_TIMESTAMP: + return SQL_C_TYPE_TIMESTAMP; + + case SQL_C_INTERVAL_DAY: + return SQL_INTERVAL_DAY; + case SQL_C_INTERVAL_DAY_TO_HOUR: + return SQL_INTERVAL_DAY_TO_HOUR; + case SQL_C_INTERVAL_DAY_TO_MINUTE: + return SQL_INTERVAL_DAY_TO_MINUTE; + case SQL_C_INTERVAL_DAY_TO_SECOND: + return SQL_INTERVAL_DAY_TO_SECOND; + case SQL_C_INTERVAL_HOUR: + return SQL_INTERVAL_HOUR; + case SQL_C_INTERVAL_HOUR_TO_MINUTE: + return SQL_INTERVAL_HOUR_TO_MINUTE; + case SQL_C_INTERVAL_HOUR_TO_SECOND: + return SQL_INTERVAL_HOUR_TO_SECOND; + case SQL_C_INTERVAL_MINUTE: + return SQL_INTERVAL_MINUTE; + case SQL_C_INTERVAL_MINUTE_TO_SECOND: + return SQL_INTERVAL_MINUTE_TO_SECOND; + case SQL_C_INTERVAL_SECOND: + return SQL_INTERVAL_SECOND; + case SQL_C_INTERVAL_YEAR: + return SQL_INTERVAL_YEAR; + case SQL_C_INTERVAL_YEAR_TO_MONTH: + return SQL_INTERVAL_YEAR_TO_MONTH; + case SQL_C_INTERVAL_MONTH: + return SQL_INTERVAL_MONTH; + + default: + throw DriverException("Unknown SQL type: " + std::to_string(record.concise_type), + "HY003"); + } +} + +void CopyAttribute(Statement& source, Statement& target, + Statement::StatementAttributeId attribute_id) { + auto optional_value = source.GetAttribute(attribute_id); + if (optional_value) { + target.SetAttribute(attribute_id, *optional_value); + } +} +} // namespace + +// Public +// ========================================================================================= +ODBCStatement::ODBCStatement( + ODBCConnection& connection, + std::shared_ptr spi_statement) + : connection_(connection), + spi_statement_(std::move(spi_statement)), + diagnostics_(&spi_statement_->GetDiagnostics()), + built_in_ard_(std::make_shared(spi_statement_->GetDiagnostics(), + 
nullptr, this, true, true, + connection.IsOdbc2Connection())), + built_in_apd_(std::make_shared(spi_statement_->GetDiagnostics(), + nullptr, this, true, true, + connection.IsOdbc2Connection())), + ipd_(std::make_shared(spi_statement_->GetDiagnostics(), nullptr, + this, false, true, + connection.IsOdbc2Connection())), + ird_(std::make_shared(spi_statement_->GetDiagnostics(), nullptr, + this, false, false, + connection.IsOdbc2Connection())), + current_ard_(built_in_ard_.get()), /* was built_in_apd_.get(): copy-paste bug — the current ARD must default to the built-in ARD, matching RevertAppDescriptor(false) */ + current_apd_(built_in_apd_.get()), + row_number_(0), + max_rows_(0), + rowset_size_(1), + is_prepared_(false), + has_reached_end_of_result_(false) {} + +ODBCConnection& ODBCStatement::GetConnection() { return connection_; } + +void ODBCStatement::CopyAttributesFromConnection(ODBCConnection& connection) { + ODBCStatement& tracking_statement = connection.GetTrackingStatement(); + + // Get abstraction attributes and copy to this spi_statement_. + // Possible ODBC attributes are below, but many of these are not supported by warpdrive + // or ODBCAbstraction: + // SQL_ATTR_ASYNC_ENABLE: + // SQL_ATTR_METADATA_ID: + // SQL_ATTR_CONCURRENCY: + // SQL_ATTR_CURSOR_TYPE: + // SQL_ATTR_KEYSET_SIZE: + // SQL_ATTR_MAX_LENGTH: + // SQL_ATTR_MAX_ROWS: + // SQL_ATTR_NOSCAN: + // SQL_ATTR_QUERY_TIMEOUT: + // SQL_ATTR_RETRIEVE_DATA: + // SQL_ATTR_SIMULATE_CURSOR: + // SQL_ATTR_USE_BOOKMARKS: + CopyAttribute(*tracking_statement.spi_statement_, *spi_statement_, + Statement::METADATA_ID); + CopyAttribute(*tracking_statement.spi_statement_, *spi_statement_, + Statement::MAX_LENGTH); + CopyAttribute(*tracking_statement.spi_statement_, *spi_statement_, Statement::NOSCAN); + CopyAttribute(*tracking_statement.spi_statement_, *spi_statement_, + Statement::QUERY_TIMEOUT); + + // SQL_ATTR_ROW_BIND_TYPE: + current_ard_->SetHeaderField( + SQL_DESC_BIND_TYPE, + reinterpret_cast( + static_cast(tracking_statement.current_ard_->GetBoundStructOffset())), + 0); +} + +bool ODBCStatement::IsPrepared() const { return is_prepared_;
} + +void ODBCStatement::Prepare(const std::string& query) { + boost::optional > metadata = + spi_statement_->Prepare(query); + + if (metadata) { + ird_->PopulateFromResultSetMetadata(metadata->get()); + } + is_prepared_ = true; +} + +void ODBCStatement::ExecutePrepared() { + if (!is_prepared_) { + throw DriverException("Function sequence error", "HY010"); + } + + if (spi_statement_->ExecutePrepared()) { + current_result_ = spi_statement_->GetResultSet(); + ird_->PopulateFromResultSetMetadata( + spi_statement_->GetResultSet()->GetMetadata().get()); + has_reached_end_of_result_ = false; + } +} + +void ODBCStatement::ExecuteDirect(const std::string& query) { + if (spi_statement_->Execute(query)) { + current_result_ = spi_statement_->GetResultSet(); + ird_->PopulateFromResultSetMetadata(current_result_->GetMetadata().get()); + has_reached_end_of_result_ = false; + } + + // Direct execution wipes out the prepared state. + is_prepared_ = false; +} + +bool ODBCStatement::Fetch(size_t rows) { + if (has_reached_end_of_result_) { + ird_->SetRowsProcessed(0); + return false; + } + + if (max_rows_) { + rows = std::min(rows, max_rows_ - row_number_); + } + + if (current_ard_->HaveBindingsChanged()) { + // TODO: Handle the case when offset != buffer_length. + + // Wipe out all bindings in the ResultSet. + // Note that the number of ARD records can both be more or less + // than the number of columns.
+ for (size_t i = 0; i < ird_->GetRecords().size(); i++) { + if (i < current_ard_->GetRecords().size() && + current_ard_->GetRecords()[i].is_bound) { + const DescriptorRecord& ard_record = current_ard_->GetRecords()[i]; + current_result_->BindColumn(i + 1, ard_record.type, ard_record.precision, + ard_record.scale, ard_record.data_ptr, + GetLength(ard_record), ard_record.indicator_ptr); + } else { + current_result_->BindColumn(i + 1, + driver::odbcabstraction::CDataType_CHAR + /* arbitrary type, not used */, + 0, 0, nullptr, 0, nullptr); + } + } + current_ard_->NotifyBindingsHavePropagated(); + } + + size_t rows_fetched = current_result_->Move(rows, current_ard_->GetBindOffset(), + current_ard_->GetBoundStructOffset(), + ird_->GetArrayStatusPtr()); + ird_->SetRowsProcessed(static_cast(rows_fetched)); + + row_number_ += rows_fetched; + has_reached_end_of_result_ = rows_fetched != rows; + return rows_fetched != 0; +} + +void ODBCStatement::GetStmtAttr(SQLINTEGER statement_attribute, SQLPOINTER output, + SQLINTEGER buffer_size, SQLINTEGER* str_len_ptr, + bool is_unicode) { + using driver::odbcabstraction::Statement; + boost::optional spi_attribute; + switch (statement_attribute) { + // Descriptor accessor attributes + case SQL_ATTR_APP_PARAM_DESC: + DescriptorToHandle(output, current_apd_, str_len_ptr); + return; + case SQL_ATTR_APP_ROW_DESC: + DescriptorToHandle(output, current_ard_, str_len_ptr); + return; + case SQL_ATTR_IMP_PARAM_DESC: + DescriptorToHandle(output, ipd_.get(), str_len_ptr); + return; + case SQL_ATTR_IMP_ROW_DESC: + DescriptorToHandle(output, ird_.get(), str_len_ptr); + return; + + // Attributes that are descriptor fields + case SQL_ATTR_PARAM_BIND_OFFSET_PTR: + current_apd_->GetHeaderField(SQL_DESC_BIND_OFFSET_PTR, output, buffer_size, + str_len_ptr); + return; + case SQL_ATTR_PARAM_BIND_TYPE: + current_apd_->GetHeaderField(SQL_DESC_BIND_TYPE, output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_PARAM_OPERATION_PTR: + 
current_apd_->GetHeaderField(SQL_DESC_ARRAY_STATUS_PTR, output, buffer_size, + str_len_ptr); + return; + case SQL_ATTR_PARAM_STATUS_PTR: + ipd_->GetHeaderField(SQL_DESC_ARRAY_STATUS_PTR, output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_PARAMS_PROCESSED_PTR: + ipd_->GetHeaderField(SQL_DESC_ROWS_PROCESSED_PTR, output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_PARAMSET_SIZE: + current_apd_->GetHeaderField(SQL_DESC_ARRAY_SIZE, output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_ROW_ARRAY_SIZE: + current_ard_->GetHeaderField(SQL_DESC_ARRAY_SIZE, output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_ROW_BIND_OFFSET_PTR: + current_ard_->GetHeaderField(SQL_DESC_BIND_OFFSET_PTR, output, buffer_size, + str_len_ptr); + return; + case SQL_ATTR_ROW_BIND_TYPE: + current_ard_->GetHeaderField(SQL_DESC_BIND_TYPE, output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_ROW_OPERATION_PTR: + current_ard_->GetHeaderField(SQL_DESC_ARRAY_STATUS_PTR, output, buffer_size, + str_len_ptr); + return; + case SQL_ATTR_ROW_STATUS_PTR: + ird_->GetHeaderField(SQL_DESC_ARRAY_STATUS_PTR, output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_ROWS_FETCHED_PTR: + ird_->GetHeaderField(SQL_DESC_ROWS_PROCESSED_PTR, output, buffer_size, str_len_ptr); + return; + + case SQL_ATTR_ASYNC_ENABLE: + GetAttribute(static_cast(SQL_ASYNC_ENABLE_OFF), output, buffer_size, + str_len_ptr); + return; + +#ifdef SQL_ATTR_ASYNC_STMT_EVENT + case SQL_ATTR_ASYNC_STMT_EVENT: + throw DriverException("Unsupported attribute", "HYC00"); +#endif +#ifdef SQL_ATTR_ASYNC_STMT_PCALLBACK + case SQL_ATTR_ASYNC_STMT_PCALLBACK: + throw DriverException("Unsupported attribute", "HYC00"); +#endif +#ifdef SQL_ATTR_ASYNC_STMT_PCONTEXT + case SQL_ATTR_ASYNC_STMT_PCONTEXT: + throw DriverException("Unsupported attribute", "HYC00"); +#endif + case SQL_ATTR_CURSOR_SCROLLABLE: + GetAttribute(static_cast(SQL_NONSCROLLABLE), output, buffer_size, + str_len_ptr); + return; + + case 
SQL_ATTR_CURSOR_SENSITIVITY: + GetAttribute(static_cast(SQL_UNSPECIFIED), output, buffer_size, + str_len_ptr); + return; + + case SQL_ATTR_CURSOR_TYPE: + GetAttribute(static_cast(SQL_CURSOR_FORWARD_ONLY), output, buffer_size, + str_len_ptr); + return; + + case SQL_ATTR_ENABLE_AUTO_IPD: + GetAttribute(static_cast(SQL_FALSE), output, buffer_size, str_len_ptr); + return; + + case SQL_ATTR_FETCH_BOOKMARK_PTR: + GetAttribute(static_cast(NULL), output, buffer_size, str_len_ptr); + return; + + case SQL_ATTR_KEYSET_SIZE: + GetAttribute(static_cast(0), output, buffer_size, str_len_ptr); + return; + + case SQL_ATTR_ROW_NUMBER: + GetAttribute(static_cast(row_number_), output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_SIMULATE_CURSOR: + GetAttribute(static_cast(SQL_SC_UNIQUE), output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_USE_BOOKMARKS: + GetAttribute(static_cast(SQL_UB_OFF), output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_CONCURRENCY: + GetAttribute(static_cast(SQL_CONCUR_READ_ONLY), output, buffer_size, + str_len_ptr); + return; + case SQL_ATTR_MAX_ROWS: + GetAttribute(static_cast(max_rows_), output, buffer_size, str_len_ptr); + return; + case SQL_ATTR_RETRIEVE_DATA: + GetAttribute(static_cast(SQL_RD_ON), output, buffer_size, str_len_ptr); + return; + case SQL_ROWSET_SIZE: + GetAttribute(static_cast(rowset_size_), output, buffer_size, str_len_ptr); + return; + + // Driver-level statement attributes. These are all SQLULEN attributes. 
+ case SQL_ATTR_MAX_LENGTH: + spi_attribute = spi_statement_->GetAttribute(Statement::MAX_LENGTH); + break; + case SQL_ATTR_METADATA_ID: + spi_attribute = spi_statement_->GetAttribute(Statement::METADATA_ID); + break; + case SQL_ATTR_NOSCAN: + spi_attribute = spi_statement_->GetAttribute(Statement::NOSCAN); + break; + case SQL_ATTR_QUERY_TIMEOUT: + spi_attribute = spi_statement_->GetAttribute(Statement::QUERY_TIMEOUT); + break; + default: + throw DriverException( + "Invalid statement attribute: " + std::to_string(statement_attribute), "HY092"); + } + + if (spi_attribute) { + GetAttribute(static_cast(boost::get(*spi_attribute)), output, + buffer_size, str_len_ptr); + return; + } + + throw DriverException( + "Invalid statement attribute: " + std::to_string(statement_attribute), "HY092"); +} + +void ODBCStatement::SetStmtAttr(SQLINTEGER statement_attribute, SQLPOINTER value, + SQLINTEGER buffer_size, bool is_unicode) { + size_t attribute_to_write = 0; + bool successfully_written = false; + + switch (statement_attribute) { + case SQL_ATTR_APP_PARAM_DESC: { + ODBCDescriptor* desc = static_cast(value); + if (current_apd_ != desc) { + if (current_apd_ != built_in_apd_.get()) { + current_apd_->DetachFromStatement(this, true); + } + current_apd_ = desc; + if (current_apd_ != built_in_apd_.get()) { + desc->RegisterToStatement(this, true); + } + } + return; + } + case SQL_ATTR_APP_ROW_DESC: { + ODBCDescriptor* desc = static_cast(value); + if (current_ard_ != desc) { + if (current_ard_ != built_in_ard_.get()) { + current_ard_->DetachFromStatement(this, false); + } + current_ard_ = desc; + if (current_ard_ != built_in_ard_.get()) { + desc->RegisterToStatement(this, false); + } + } + return; + } + case SQL_ATTR_IMP_PARAM_DESC: + throw DriverException("Cannot assign implementation descriptor.", "HY017"); + case SQL_ATTR_IMP_ROW_DESC: + throw DriverException("Cannot assign implementation descriptor.", "HY017"); + // Attributes that are descriptor fields + case 
SQL_ATTR_PARAM_BIND_OFFSET_PTR: + current_apd_->SetHeaderField(SQL_DESC_BIND_OFFSET_PTR, value, buffer_size); + return; + case SQL_ATTR_PARAM_BIND_TYPE: + current_apd_->SetHeaderField(SQL_DESC_BIND_TYPE, value, buffer_size); + return; + case SQL_ATTR_PARAM_OPERATION_PTR: + current_apd_->SetHeaderField(SQL_DESC_ARRAY_STATUS_PTR, value, buffer_size); + return; + case SQL_ATTR_PARAM_STATUS_PTR: + ipd_->SetHeaderField(SQL_DESC_ARRAY_STATUS_PTR, value, buffer_size); + return; + case SQL_ATTR_PARAMS_PROCESSED_PTR: + ipd_->SetHeaderField(SQL_DESC_ROWS_PROCESSED_PTR, value, buffer_size); + return; + case SQL_ATTR_PARAMSET_SIZE: + current_apd_->SetHeaderField(SQL_DESC_ARRAY_SIZE, value, buffer_size); + return; + case SQL_ATTR_ROW_ARRAY_SIZE: + current_ard_->SetHeaderField(SQL_DESC_ARRAY_SIZE, value, buffer_size); + return; + case SQL_ATTR_ROW_BIND_OFFSET_PTR: + current_ard_->SetHeaderField(SQL_DESC_BIND_OFFSET_PTR, value, buffer_size); + return; + case SQL_ATTR_ROW_BIND_TYPE: + current_ard_->SetHeaderField(SQL_DESC_BIND_TYPE, value, buffer_size); + return; + case SQL_ATTR_ROW_OPERATION_PTR: + current_ard_->SetHeaderField(SQL_DESC_ARRAY_STATUS_PTR, value, buffer_size); + return; + case SQL_ATTR_ROW_STATUS_PTR: + ird_->SetHeaderField(SQL_DESC_ARRAY_STATUS_PTR, value, buffer_size); + return; + case SQL_ATTR_ROWS_FETCHED_PTR: + ird_->SetHeaderField(SQL_DESC_ROWS_PROCESSED_PTR, value, buffer_size); + return; + + case SQL_ATTR_ASYNC_ENABLE: +#ifdef SQL_ATTR_ASYNC_STMT_EVENT + case SQL_ATTR_ASYNC_STMT_EVENT: + throw DriverException("Unsupported attribute", "HYC00"); +#endif +#ifdef SQL_ATTR_ASYNC_STMT_PCALLBACK + case SQL_ATTR_ASYNC_STMT_PCALLBACK: + throw DriverException("Unsupported attribute", "HYC00"); +#endif +#ifdef SQL_ATTR_ASYNC_STMT_PCONTEXT + case SQL_ATTR_ASYNC_STMT_PCONTEXT: + throw DriverException("Unsupported attribute", "HYC00"); +#endif + case SQL_ATTR_CONCURRENCY: + CheckIfAttributeIsSetToOnlyValidValue(value, + static_cast(SQL_CONCUR_READ_ONLY)); + return; + case 
SQL_ATTR_CURSOR_SCROLLABLE: + CheckIfAttributeIsSetToOnlyValidValue(value, + static_cast(SQL_NONSCROLLABLE)); + return; + case SQL_ATTR_CURSOR_SENSITIVITY: + CheckIfAttributeIsSetToOnlyValidValue(value, static_cast(SQL_UNSPECIFIED)); + return; + case SQL_ATTR_CURSOR_TYPE: + CheckIfAttributeIsSetToOnlyValidValue( + value, static_cast(SQL_CURSOR_FORWARD_ONLY)); + return; + case SQL_ATTR_ENABLE_AUTO_IPD: + CheckIfAttributeIsSetToOnlyValidValue(value, static_cast(SQL_FALSE)); + return; + case SQL_ATTR_FETCH_BOOKMARK_PTR: + if (value != NULL) { + throw DriverException("Optional feature not implemented", "HYC00"); + } + return; + case SQL_ATTR_KEYSET_SIZE: + CheckIfAttributeIsSetToOnlyValidValue(value, static_cast(0)); + return; + case SQL_ATTR_ROW_NUMBER: + throw DriverException("Cannot set read-only attribute", "HY092"); + case SQL_ATTR_SIMULATE_CURSOR: + CheckIfAttributeIsSetToOnlyValidValue(value, static_cast(SQL_SC_UNIQUE)); + return; + case SQL_ATTR_USE_BOOKMARKS: + CheckIfAttributeIsSetToOnlyValidValue(value, static_cast(SQL_UB_OFF)); + return; + case SQL_ATTR_RETRIEVE_DATA: + CheckIfAttributeIsSetToOnlyValidValue(value, static_cast(SQL_TRUE)); + return; + case SQL_ROWSET_SIZE: + SetAttribute(value, rowset_size_); + return; + + case SQL_ATTR_MAX_ROWS: + throw DriverException("Cannot set read-only attribute", "HY092"); + + // Driver-level statement attributes.
These are all size_t attributes + case SQL_ATTR_MAX_LENGTH: + SetAttribute(value, attribute_to_write); + successfully_written = + spi_statement_->SetAttribute(Statement::MAX_LENGTH, attribute_to_write); + break; + case SQL_ATTR_METADATA_ID: + SetAttribute(value, attribute_to_write); + successfully_written = + spi_statement_->SetAttribute(Statement::METADATA_ID, attribute_to_write); + break; + case SQL_ATTR_NOSCAN: + SetAttribute(value, attribute_to_write); + successfully_written = + spi_statement_->SetAttribute(Statement::NOSCAN, attribute_to_write); + break; + case SQL_ATTR_QUERY_TIMEOUT: + SetAttribute(value, attribute_to_write); + successfully_written = + spi_statement_->SetAttribute(Statement::QUERY_TIMEOUT, attribute_to_write); + break; + default: + throw DriverException("Invalid attribute: " + std::to_string(statement_attribute), /* was attribute_to_write, which is always 0 when the default branch is taken; report the rejected attribute id instead */ + "HY092"); + } + if (!successfully_written) { + GetDiagnostics().AddWarning("Optional value changed.", "01S02", + driver::odbcabstraction::ODBCErrorCodes_GENERAL_WARNING); + } +} + +void ODBCStatement::RevertAppDescriptor(bool isApd) { + if (isApd) { + current_apd_ = built_in_apd_.get(); + } else { + current_ard_ = built_in_ard_.get(); + } +} + +void ODBCStatement::CloseCursor(bool suppress_errors) { + if (!suppress_errors && !current_result_) { + throw DriverException("Invalid cursor state", "28000"); + } + + if (current_result_) { + current_result_->Close(); + current_result_ = nullptr; + } + + // Reset the fetching state of this statement.
+ current_ard_->NotifyBindingsHaveChanged(); + row_number_ = 0; + has_reached_end_of_result_ = false; +} + +bool ODBCStatement::GetData(SQLSMALLINT record_number, SQLSMALLINT c_type, + SQLPOINTER data_ptr, SQLLEN buffer_length, + SQLLEN* indicator_ptr) { + if (record_number == 0) { + throw DriverException("Bookmarks are not supported", "07009"); + } else if (record_number > ird_->GetRecords().size()) { + throw DriverException("Invalid column index: " + std::to_string(record_number), + "07009"); + } + + SQLSMALLINT evaluated_c_type = c_type; + + // TODO: Get proper default precision and scale from abstraction. + int precision = 38; // arrow::Decimal128Type::kMaxPrecision; + int scale = 0; + + if (c_type == SQL_ARD_TYPE) { + if (record_number > current_ard_->GetRecords().size()) { + throw DriverException("Invalid column index: " + std::to_string(record_number), + "07009"); + } + const DescriptorRecord& record = current_ard_->GetRecords()[record_number - 1]; + evaluated_c_type = record.concise_type; + precision = record.precision; + scale = record.scale; + } + + // Note: this is intentionally not an else if, since the type can be SQL_C_DEFAULT in + // the ARD. 
+ if (evaluated_c_type == SQL_C_DEFAULT) { + if (record_number <= current_ard_->GetRecords().size()) { + const DescriptorRecord& ard_record = current_ard_->GetRecords()[record_number - 1]; + precision = ard_record.precision; + scale = ard_record.scale; + } + + const DescriptorRecord& ird_record = ird_->GetRecords()[record_number - 1]; + evaluated_c_type = getc_typeForSQLType(ird_record); + } + + return current_result_->GetData(record_number, evaluated_c_type, precision, scale, + data_ptr, buffer_length, indicator_ptr); +} + +void ODBCStatement::ReleaseStatement() { + CloseCursor(true); + connection_.DropStatement(this); +} + +void ODBCStatement::GetTables(const std::string* catalog, const std::string* schema, + const std::string* table, const std::string* tableType) { + CloseCursor(true); + if (connection_.IsOdbc2Connection()) { + current_result_ = spi_statement_->GetTables_V2(catalog, schema, table, tableType); + } else { + current_result_ = spi_statement_->GetTables_V3(catalog, schema, table, tableType); + } + ird_->PopulateFromResultSetMetadata(current_result_->GetMetadata().get()); + has_reached_end_of_result_ = false; + + // Direct execution wipes out the prepared state. + is_prepared_ = false; +} + +void ODBCStatement::GetColumns(const std::string* catalog, const std::string* schema, + const std::string* table, const std::string* column) { + CloseCursor(true); + if (connection_.IsOdbc2Connection()) { + current_result_ = spi_statement_->GetColumns_V2(catalog, schema, table, column); + } else { + current_result_ = spi_statement_->GetColumns_V3(catalog, schema, table, column); + } + ird_->PopulateFromResultSetMetadata(current_result_->GetMetadata().get()); + has_reached_end_of_result_ = false; + + // Direct execution wipes out the prepared state. 
+ is_prepared_ = false; +} + +void ODBCStatement::GetTypeInfo(SQLSMALLINT data_type) { + CloseCursor(true); + if (connection_.IsOdbc2Connection()) { + current_result_ = spi_statement_->GetTypeInfo_V2(data_type); + } else { + current_result_ = spi_statement_->GetTypeInfo_V3(data_type); + } + ird_->PopulateFromResultSetMetadata(current_result_->GetMetadata().get()); + has_reached_end_of_result_ = false; + + // Direct execution wipes out the prepared state. + is_prepared_ = false; +} + +void ODBCStatement::Cancel() { spi_statement_->Cancel(); } diff --git a/cpp/src/arrow/flight/sql/odbc/odbcabstraction/utils.cc b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/utils.cc new file mode 100644 index 00000000000..074e542bc34 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/odbcabstraction/utils.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/vendored/whereami/whereami.h" + +#include "arrow/flight/sql/odbc/odbcabstraction/include/odbcabstraction/utils.h" + +#include + +namespace driver { +namespace odbcabstraction { + +boost::optional AsBool(const std::string& value) { + if (boost::iequals(value, "true") || boost::iequals(value, "1")) { + return true; + } else if (boost::iequals(value, "false") || boost::iequals(value, "0")) { + return false; + } else { + return boost::none; + } +} + +boost::optional AsBool(const Connection::ConnPropertyMap& conn_property_map, + const std::string_view& property_name) { + auto extracted_property = conn_property_map.find(std::string(property_name)); + + if (extracted_property != conn_property_map.end()) { + return AsBool(extracted_property->second); + } + + return boost::none; +} + +boost::optional AsInt32(int32_t min_value, + const Connection::ConnPropertyMap& conn_property_map, + const std::string_view& property_name) { + auto extracted_property = conn_property_map.find(std::string(property_name)); + + if (extracted_property != conn_property_map.end()) { + const int32_t string_column_length = std::stoi(extracted_property->second); + + if (string_column_length >= min_value && string_column_length <= INT32_MAX) { + return string_column_length; + } + } + return boost::none; +} + +std::string GetModulePath() { + std::vector path; + int length, dirname_length; + length = wai_getModulePath(NULL, 0, &dirname_length); + + if (length != 0) { + path.resize(length); + wai_getModulePath(path.data(), length, &dirname_length); + } else { + throw DriverException("Could not find module path."); + } + + return std::string(path.begin(), path.begin() + dirname_length); +} + +} // namespace odbcabstraction +} // namespace driver diff --git a/cpp/src/arrow/flight/sql/odbc/visibility.h b/cpp/src/arrow/flight/sql/odbc/visibility.h new file mode 100644 index 00000000000..416dfecc864 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/visibility.h @@ -0,0 +1,48 @@ +// Licensed 
to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#if defined(_WIN32) || defined(__CYGWIN__) +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4251) +# else +# pragma GCC diagnostic ignored "-Wattributes" +# endif + +# ifdef ARROW_FLIGHT_SQL_ODBC_STATIC +# define ARROW_FLIGHT_SQL_ODBC_EXPORT +# elif defined(ARROW_FLIGHT_SQL_ODBC_EXPORTING) +# define ARROW_FLIGHT_SQL_ODBC_EXPORT __declspec(dllexport) +# else +# define ARROW_FLIGHT_SQL_ODBC_EXPORT __declspec(dllimport) +# endif + +# define ARROW_FLIGHT_SQL_ODBC_NO_EXPORT +#else // Not Windows +# ifndef ARROW_FLIGHT_SQL_ODBC_EXPORT +# define ARROW_FLIGHT_SQL_ODBC_EXPORT __attribute__((visibility("default"))) +# endif +# ifndef ARROW_FLIGHT_SQL_ODBC_NO_EXPORT +# define ARROW_FLIGHT_SQL_ODBC_NO_EXPORT __attribute__((visibility("hidden"))) +# endif +#endif // Non-Windows + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/cpp/src/arrow/flight/sql/protocol_internal.cc b/cpp/src/arrow/flight/sql/protocol_internal.cc index 984e7822233..fdf8a119f7d 100644 --- a/cpp/src/arrow/flight/sql/protocol_internal.cc +++ b/cpp/src/arrow/flight/sql/protocol_internal.cc @@ -20,9 +20,13 @@ ARROW_SUPPRESS_DEPRECATION_WARNING #include 
"arrow/flight/sql/protocol_internal.h" +ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING + // NOTE(lidavidm): Normally this is forbidden, but on Windows to get // the dllexport/dllimport macro in the right places, we need to // ensure our header gets included (and Protobuf will not insert the // include for you) #include "arrow/flight/sql/FlightSql.pb.cc" // NOLINT + +ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING ARROW_UNSUPPRESS_DEPRECATION_WARNING diff --git a/cpp/src/arrow/flight/sql/server.cc b/cpp/src/arrow/flight/sql/server.cc index f68d884c621..8471fa8a2b1 100644 --- a/cpp/src/arrow/flight/sql/server.cc +++ b/cpp/src/arrow/flight/sql/server.cc @@ -1314,7 +1314,7 @@ const std::shared_ptr& SqlSchema::GetPrimaryKeysSchema() { return kSchema; } -const std::shared_ptr& GetImportedExportedKeysAndCrossReferenceSchema() { +static const std::shared_ptr& GetImportedExportedKeysAndCrossReferenceSchema() { static std::shared_ptr kSchema = arrow::schema( {field("pk_catalog_name", utf8(), true), field("pk_db_schema_name", utf8(), true), field("pk_table_name", utf8(), false), field("pk_column_name", utf8(), false), diff --git a/cpp/src/arrow/flight/sql/server_session_middleware.h b/cpp/src/arrow/flight/sql/server_session_middleware.h index 6eb11041a08..e7e71294f39 100644 --- a/cpp/src/arrow/flight/sql/server_session_middleware.h +++ b/cpp/src/arrow/flight/sql/server_session_middleware.h @@ -33,7 +33,7 @@ namespace arrow { namespace flight { namespace sql { -static constexpr char const kSessionCookieName[] = "arrow_flight_session_id"; +static constexpr const char kSessionCookieName[] = "arrow_flight_session_id"; class ARROW_FLIGHT_SQL_EXPORT FlightSession { protected: @@ -59,7 +59,7 @@ class ARROW_FLIGHT_SQL_EXPORT FlightSession { /// transport bug. 
class ARROW_FLIGHT_SQL_EXPORT ServerSessionMiddleware : public ServerMiddleware { public: - static constexpr char const kMiddlewareName[] = + static constexpr const char kMiddlewareName[] = "arrow::flight::sql::ServerSessionMiddleware"; std::string name() const override { return kMiddlewareName; } diff --git a/cpp/src/arrow/flight/test_definitions.cc b/cpp/src/arrow/flight/test_definitions.cc index ea6576088f2..c6b8e2b422c 100644 --- a/cpp/src/arrow/flight/test_definitions.cc +++ b/cpp/src/arrow/flight/test_definitions.cc @@ -1194,6 +1194,8 @@ void IpcOptionsTest::TestDoExchangeServerWriteOptions() { #if defined(ARROW_CUDA) +namespace { + Status CheckBuffersOnDevice(const Array& array, const Device& device) { if (array.num_fields() != 0) { return Status::NotImplemented("Nested arrays"); @@ -1284,6 +1286,8 @@ class CudaTestServer : public FlightServerBase { std::shared_ptr context_; }; +} // namespace + // Store CUDA objects without exposing them in the public header class CudaDataTest::Impl { public: @@ -1750,7 +1754,7 @@ void ErrorHandlingTest::TestGetFlightInfoMetadata() { })); } -void CheckErrorDetail(const Status& status) { +static void CheckErrorDetail(const Status& status) { auto detail = FlightStatusDetail::UnwrapStatus(status); ASSERT_NE(detail, nullptr) << status.ToString(); ASSERT_EQ(detail->code(), FlightStatusCode::Unauthorized); diff --git a/cpp/src/arrow/flight/test_util.cc b/cpp/src/arrow/flight/test_util.cc index aa10d9a7da8..46c8f4c984d 100644 --- a/cpp/src/arrow/flight/test_util.cc +++ b/cpp/src/arrow/flight/test_util.cc @@ -25,6 +25,7 @@ // We need Windows fixes before including Boost #include "arrow/util/windows_compatibility.h" +#include #include #include "arrow/array.h" @@ -77,6 +78,16 @@ FlightInfo MakeFlightInfo(const Schema& schema, const FlightDescriptor& descript return info; } +FlightInfo MakeFlightInfo(const FlightDescriptor& descriptor, + const std::vector& endpoints, + int64_t total_records, int64_t total_bytes, bool ordered, + 
std::string app_metadata) { + EXPECT_OK_AND_ASSIGN(auto info, + FlightInfo::Make(nullptr, descriptor, endpoints, total_records, + total_bytes, ordered, std::move(app_metadata))); + return info; +} + NumberingStream::NumberingStream(std::unique_ptr stream) : counter_(0), stream_(std::move(stream)) {} @@ -95,6 +106,20 @@ arrow::Result NumberingStream::Next() { return payload; } +void AssertEqual(const FlightInfo& expected, const FlightInfo& actual) { + ipc::DictionaryMemo expected_memo; + ipc::DictionaryMemo actual_memo; + ASSERT_OK_AND_ASSIGN(auto ex_schema, expected.GetSchema(&expected_memo)); + ASSERT_OK_AND_ASSIGN(auto actual_schema, actual.GetSchema(&actual_memo)); + + AssertSchemaEqual(*ex_schema, *actual_schema); + ASSERT_EQ(expected.total_records(), actual.total_records()); + ASSERT_EQ(expected.total_bytes(), actual.total_bytes()); + + ASSERT_EQ(expected.descriptor(), actual.descriptor()); + ASSERT_THAT(actual.endpoints(), ::testing::ContainerEq(expected.endpoints())); +} + std::shared_ptr ExampleIntSchema() { auto f0 = field("f0", int8()); auto f1 = field("f1", uint8()); diff --git a/cpp/src/arrow/flight/test_util.h b/cpp/src/arrow/flight/test_util.h index 946caebcc2b..59eb9c0c1a0 100644 --- a/cpp/src/arrow/flight/test_util.h +++ b/cpp/src/arrow/flight/test_util.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include @@ -42,20 +41,8 @@ namespace flight { // ---------------------------------------------------------------------- // Helpers to compare values for equality - -inline void AssertEqual(const FlightInfo& expected, const FlightInfo& actual) { - ipc::DictionaryMemo expected_memo; - ipc::DictionaryMemo actual_memo; - ASSERT_OK_AND_ASSIGN(auto ex_schema, expected.GetSchema(&expected_memo)); - ASSERT_OK_AND_ASSIGN(auto actual_schema, actual.GetSchema(&actual_memo)); - - AssertSchemaEqual(*ex_schema, *actual_schema); - ASSERT_EQ(expected.total_records(), actual.total_records()); - ASSERT_EQ(expected.total_bytes(), actual.total_bytes()); - - 
ASSERT_EQ(expected.descriptor(), actual.descriptor()); - ASSERT_THAT(actual.endpoints(), ::testing::ContainerEq(expected.endpoints())); -} +ARROW_FLIGHT_EXPORT +void AssertEqual(const FlightInfo& expected, const FlightInfo& actual); // ---------------------------------------------------------------------- // Fixture to use for running test servers @@ -143,6 +130,9 @@ class ARROW_FLIGHT_EXPORT NumberingStream : public FlightDataStream { ARROW_FLIGHT_EXPORT std::shared_ptr ExampleIntSchema(); +ARROW_FLIGHT_EXPORT +std::shared_ptr ExampleFloatSchema(); + ARROW_FLIGHT_EXPORT std::shared_ptr ExampleStringSchema(); @@ -182,6 +172,12 @@ FlightInfo MakeFlightInfo(const Schema& schema, const FlightDescriptor& descript int64_t total_records, int64_t total_bytes, bool ordered, std::string app_metadata); +ARROW_FLIGHT_EXPORT +FlightInfo MakeFlightInfo(const FlightDescriptor& descriptor, + const std::vector& endpoints, + int64_t total_records, int64_t total_bytes, bool ordered, + std::string app_metadata); + ARROW_FLIGHT_EXPORT Status ExampleTlsCertificates(std::vector* out); diff --git a/cpp/src/arrow/flight/transport/grpc/customize_grpc.h b/cpp/src/arrow/flight/transport/grpc/customize_grpc.h index b6680220875..153aa5ae1da 100644 --- a/cpp/src/arrow/flight/transport/grpc/customize_grpc.h +++ b/cpp/src/arrow/flight/transport/grpc/customize_grpc.h @@ -22,6 +22,7 @@ #include "arrow/flight/platform.h" #include "arrow/flight/type_fwd.h" +#include "arrow/flight/visibility.h" #include "arrow/util/config.h" // Silence protobuf warnings @@ -63,8 +64,8 @@ ::grpc::Status FlightDataSerialize(const arrow::flight::FlightPayload& msg, // Read internal::FlightData from grpc::ByteBuffer containing FlightData // protobuf without copying -::grpc::Status FlightDataDeserialize(::grpc::ByteBuffer* buffer, - arrow::flight::internal::FlightData* out); +ARROW_FLIGHT_EXPORT ::grpc::Status FlightDataDeserialize( + ::grpc::ByteBuffer* buffer, arrow::flight::internal::FlightData* out); } // namespace 
grpc } // namespace transport } // namespace flight diff --git a/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc b/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc index dbd37780668..0b8c90a08eb 100644 --- a/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc +++ b/cpp/src/arrow/flight/transport/grpc/serialization_internal.cc @@ -69,6 +69,8 @@ using google::protobuf::io::CodedOutputStream; using ::grpc::ByteBuffer; +namespace { + bool ReadBytesZeroCopy(const std::shared_ptr& source_data, CodedInputStream* input, std::shared_ptr* out) { uint32_t length; @@ -151,7 +153,7 @@ class GrpcBuffer : public MutableBuffer { }; // Destructor callback for grpc::Slice -static void ReleaseBuffer(void* buf_ptr) { +void ReleaseBuffer(void* buf_ptr) { delete reinterpret_cast*>(buf_ptr); } @@ -174,7 +176,7 @@ arrow::Result<::grpc::Slice> SliceFromBuffer(const std::shared_ptr& buf) return slice; } -static const uint8_t kPaddingBytes[8] = {0, 0, 0, 0, 0, 0, 0, 0}; +const uint8_t kPaddingBytes[8] = {0, 0, 0, 0, 0, 0, 0, 0}; // Update the sizes of our Protobuf fields based on the given IPC payload. 
::grpc::Status IpcMessageHeaderSize(const arrow::ipc::IpcPayload& ipc_msg, bool has_body, @@ -195,6 +197,8 @@ ::grpc::Status IpcMessageHeaderSize(const arrow::ipc::IpcPayload& ipc_msg, bool return ::grpc::Status::OK; } +} // namespace + ::grpc::Status FlightDataSerialize(const FlightPayload& msg, ByteBuffer* out, bool* own_buffer) { // Size of the IPC body (protobuf: data_body) diff --git a/cpp/src/arrow/flight/transport/grpc/util_internal.cc b/cpp/src/arrow/flight/transport/grpc/util_internal.cc index 88ec15bc66e..d29f1ce4178 100644 --- a/cpp/src/arrow/flight/transport/grpc/util_internal.cc +++ b/cpp/src/arrow/flight/transport/grpc/util_internal.cc @@ -29,7 +29,7 @@ #include "arrow/flight/types.h" #include "arrow/status.h" #include "arrow/util/string.h" -#include "arrow/util/string_builder.h" +#include "arrow/util/string_util.h" namespace arrow { @@ -134,9 +134,10 @@ static TransportStatus TransportStatusFromGrpc(const ::grpc::Status& grpc_status return TransportStatus{TransportStatusCode::kUnauthenticated, grpc_status.error_message()}; default: - return TransportStatus{TransportStatusCode::kUnknown, - util::StringBuilder("(", grpc_status.error_code(), ")", - grpc_status.error_message())}; + return TransportStatus{ + TransportStatusCode::kUnknown, + arrow::internal::JoinToString("(", grpc_status.error_code(), ")", + grpc_status.error_message())}; } } diff --git a/cpp/src/arrow/flight/transport_server.cc b/cpp/src/arrow/flight/transport_server.cc index cdc7e06e12e..197def7cabd 100644 --- a/cpp/src/arrow/flight/transport_server.cc +++ b/cpp/src/arrow/flight/transport_server.cc @@ -26,6 +26,7 @@ #include "arrow/ipc/reader.h" #include "arrow/result.h" #include "arrow/status.h" +#include "arrow/util/logging_internal.h" namespace arrow { namespace flight { @@ -136,6 +137,13 @@ class TransportMessageReader final : public FlightMessageReader { return out; } + ipc::ReadStats stats() const override { + if (batch_reader_ == nullptr) { + return ipc::ReadStats{}; + } + return 
batch_reader_->stats(); + } + private: /// Ensure we are set up to read data. Status EnsureDataStarted() { @@ -156,29 +164,67 @@ class TransportMessageReader final : public FlightMessageReader { FlightDescriptor descriptor_; std::shared_ptr peekable_reader_; std::shared_ptr memory_manager_; - std::shared_ptr batch_reader_; + std::shared_ptr batch_reader_; std::shared_ptr app_metadata_; }; -// TODO(ARROW-10787): this should use the same writer/ipc trick as client +/// \brief An IpcPayloadWriter for ServerDataStream. +/// +/// To support app_metadata and reuse the existing IPC infrastructure, +/// this takes a pointer to a buffer to be combined with the IPC +/// payload when writing a Flight payload. +class TransportMessagePayloadWriter : public ipc::internal::IpcPayloadWriter { + public: + TransportMessagePayloadWriter(ServerDataStream* stream, + std::shared_ptr* app_metadata) + : stream_(stream), app_metadata_(app_metadata) {} + + Status Start() override { return Status::OK(); } + Status WritePayload(const ipc::IpcPayload& ipc_payload) override { + FlightPayload payload; + payload.ipc_message = ipc_payload; + + if (ipc_payload.type == ipc::MessageType::RECORD_BATCH && *app_metadata_) { + payload.app_metadata = std::move(*app_metadata_); + } + ARROW_ASSIGN_OR_RAISE(auto success, stream_->WriteData(payload)); + if (!success) { + return MakeFlightError( + FlightStatusCode::Internal, + "Could not write record batch to stream (client disconnect?)"); + } + return arrow::Status::OK(); + } + Status Close() override { + // Closing is handled one layer up in TransportMessageWriter::Close + return Status::OK(); + } + + private: + ServerDataStream* stream_; + std::shared_ptr* app_metadata_; +}; + class TransportMessageWriter final : public FlightMessageWriter { public: explicit TransportMessageWriter(ServerDataStream* stream) - : stream_(stream), ipc_options_(::arrow::ipc::IpcWriteOptions::Defaults()) {} + : stream_(stream), + app_metadata_(nullptr), + 
ipc_options_(::arrow::ipc::IpcWriteOptions::Defaults()) {} Status Begin(const std::shared_ptr& schema, const ipc::IpcWriteOptions& options) override { - if (started_) { + if (batch_writer_) { return Status::Invalid("This writer has already been started."); } - started_ = true; ipc_options_ = options; + std::unique_ptr payload_writer( + new TransportMessagePayloadWriter(stream_, &app_metadata_)); - RETURN_NOT_OK(mapper_.AddSchemaFields(*schema)); - FlightPayload schema_payload; - RETURN_NOT_OK(ipc::GetSchemaPayload(*schema, ipc_options_, mapper_, - &schema_payload.ipc_message)); - return WritePayload(schema_payload); + ARROW_ASSIGN_OR_RAISE(batch_writer_, + ipc::internal::OpenRecordBatchWriter(std::move(payload_writer), + schema, ipc_options_)); + return Status::OK(); } Status WriteRecordBatch(const RecordBatch& batch) override { @@ -188,71 +234,50 @@ class TransportMessageWriter final : public FlightMessageWriter { Status WriteMetadata(std::shared_ptr app_metadata) override { FlightPayload payload{}; payload.app_metadata = app_metadata; - return WritePayload(payload); + ARROW_ASSIGN_OR_RAISE(auto success, stream_->WriteData(payload)); + if (!success) { + ARROW_RETURN_NOT_OK(Close()); + return MakeFlightError(FlightStatusCode::Internal, + "Could not write metadata to stream (client disconnect?)"); + } + return Status::OK(); } Status WriteWithMetadata(const RecordBatch& batch, std::shared_ptr app_metadata) override { RETURN_NOT_OK(CheckStarted()); - RETURN_NOT_OK(EnsureDictionariesWritten(batch)); - FlightPayload payload{}; - if (app_metadata) { - payload.app_metadata = app_metadata; + app_metadata_ = app_metadata; + auto status = batch_writer_->WriteRecordBatch(batch); + if (!status.ok()) { + ARROW_RETURN_NOT_OK(Close()); } - RETURN_NOT_OK(ipc::GetRecordBatchPayload(batch, ipc_options_, &payload.ipc_message)); - RETURN_NOT_OK(WritePayload(payload)); - ++stats_.num_record_batches; - return Status::OK(); + return status; } Status Close() override { - // It's fine to 
Close() without writing data + if (batch_writer_) { + RETURN_NOT_OK(batch_writer_->Close()); + } return Status::OK(); } - ipc::WriteStats stats() const override { return stats_; } - - private: - Status WritePayload(const FlightPayload& payload) { - ARROW_ASSIGN_OR_RAISE(auto success, stream_->WriteData(payload)); - if (!success) { - return MakeFlightError(FlightStatusCode::Internal, - "Could not write metadata to stream (client disconnect?)"); - } - ++stats_.num_messages; - return Status::OK(); + ipc::WriteStats stats() const override { + ARROW_CHECK_NE(batch_writer_, nullptr); + return batch_writer_->stats(); } + private: Status CheckStarted() { - if (!started_) { + if (!batch_writer_) { return Status::Invalid("This writer is not started. Call Begin() with a schema"); } return Status::OK(); } - Status EnsureDictionariesWritten(const RecordBatch& batch) { - if (dictionaries_written_) { - return Status::OK(); - } - dictionaries_written_ = true; - ARROW_ASSIGN_OR_RAISE(const auto dictionaries, - ipc::CollectDictionaries(batch, mapper_)); - for (const auto& pair : dictionaries) { - FlightPayload payload{}; - RETURN_NOT_OK(ipc::GetDictionaryPayload(pair.first, pair.second, ipc_options_, - &payload.ipc_message)); - RETURN_NOT_OK(WritePayload(payload)); - ++stats_.num_dictionary_batches; - } - return Status::OK(); - } - ServerDataStream* stream_; + std::unique_ptr batch_writer_; + std::shared_ptr app_metadata_; ::arrow::ipc::IpcWriteOptions ipc_options_; - ipc::DictionaryFieldMapper mapper_; - ipc::WriteStats stats_; - bool started_ = false; - bool dictionaries_written_ = false; }; /// \brief Adapt TransportDataStream to the FlightMetadataWriter diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index 65beec97d64..759b1410bda 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -37,7 +37,7 @@ #include "arrow/util/formatting.h" #include "arrow/util/logging.h" #include "arrow/util/string.h" -#include 
"arrow/util/string_builder.h" +#include "arrow/util/string_util.h" #include "arrow/util/uri.h" namespace arrow { @@ -189,8 +189,8 @@ static std::ostream& operator<<(std::ostream& os, std::map m) { // Wrapper types for Flight RPC protobuf messages std::string BasicAuth::ToString() const { - return arrow::util::StringBuilder(""); + return arrow::internal::JoinToString(""); } bool BasicAuth::Equals(const BasicAuth& other) const { @@ -280,10 +280,31 @@ arrow::Result FlightInfo::Make(const Schema& schema, return FlightInfo(std::move(data)); } +arrow::Result FlightInfo::Make(const std::shared_ptr& schema, + const FlightDescriptor& descriptor, + const std::vector& endpoints, + int64_t total_records, int64_t total_bytes, + bool ordered, std::string app_metadata) { + FlightInfo::Data data; + data.descriptor = descriptor; + data.endpoints = endpoints; + data.total_records = total_records; + data.total_bytes = total_bytes; + data.ordered = ordered; + data.app_metadata = std::move(app_metadata); + if (schema) { + RETURN_NOT_OK(internal::SchemaToString(*schema, &data.schema)); + } + return FlightInfo(std::move(data)); +} + arrow::Result> FlightInfo::GetSchema( ipc::DictionaryMemo* dictionary_memo) const { if (reconstructed_schema_) { return schema_; + } else if (data_.schema.empty()) { + reconstructed_schema_ = true; + return schema_; } // Create a non-owned Buffer to avoid copying io::BufferReader schema_reader(std::make_shared(data_.schema)); @@ -305,7 +326,9 @@ arrow::Status FlightInfo::Deserialize(std::string_view serialized, std::string FlightInfo::ToString() const { std::stringstream ss; ss << "ToString(); } else { ss << "(serialized)"; @@ -556,7 +579,8 @@ arrow::Status SetSessionOptionsRequest::Deserialize(std::string_view serialized, // SetSessionOptionsResult -std::ostream& operator<<(std::ostream& os, const SetSessionOptionsResult::Error& e) { +static std::ostream& operator<<(std::ostream& os, + const SetSessionOptionsResult::Error& e) { os << '{' << e.value << '}'; 
return os; } @@ -863,8 +887,8 @@ Status FlightPayload::Validate() const { } std::string ActionType::ToString() const { - return arrow::util::StringBuilder(""); + return arrow::internal::JoinToString(""); } const ActionType ActionType::kCancelFlightInfo = @@ -907,7 +931,7 @@ arrow::Status ActionType::Deserialize(std::string_view serialized, ActionType* o } std::string Criteria::ToString() const { - return arrow::util::StringBuilder(""); + return arrow::internal::JoinToString(""); } bool Criteria::Equals(const Criteria& other) const { diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index 8b612bd55ce..d498ac67f7a 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -47,6 +47,7 @@ class Table; namespace ipc { class DictionaryMemo; +struct ReadStats; } // namespace ipc namespace util { @@ -468,6 +469,151 @@ struct ARROW_FLIGHT_EXPORT FlightDescriptor } }; +/// \brief Data structure providing an opaque identifier or credential to use +/// when requesting a data stream with the DoGet RPC +struct ARROW_FLIGHT_EXPORT Ticket : public internal::BaseType { + std::string ticket; + + Ticket() = default; + Ticket(std::string ticket) // NOLINT runtime/explicit + : ticket(std::move(ticket)) {} + + std::string ToString() const; + bool Equals(const Ticket& other) const; + + using SuperT::Deserialize; + using SuperT::SerializeToString; + + /// \brief Get the wire-format representation of this type. + /// + /// Useful when interoperating with non-Flight systems (e.g. REST + /// services) that may want to return Flight types. + /// + /// Use `SerializeToString()` if you want a Result-returning version. + arrow::Status SerializeToString(std::string* out) const; + + /// \brief Parse the wire-format representation of this type. + /// + /// Useful when interoperating with non-Flight systems (e.g. REST + /// services) that may want to return Flight types. + /// + /// Use `Deserialize(serialized)` if you want a Result-returning version. 
+ static arrow::Status Deserialize(std::string_view serialized, Ticket* out); +}; + +/// \brief A host location (a URI) +struct ARROW_FLIGHT_EXPORT Location : public internal::BaseType { + public: + /// \brief Initialize a blank location. + Location(); + + ~Location(); + + /// \brief Initialize a location by parsing a URI string + static arrow::Result Parse(const std::string& uri_string); + + /// \brief Get the fallback URI. + /// + /// arrow-flight-reuse-connection://? means that a client may attempt to + /// reuse an existing connection to a Flight service to fetch data instead + /// of creating a new connection to one of the other locations listed in a + /// FlightEndpoint response. + static const Location& ReuseConnection(); + + /// \brief Initialize a location for a non-TLS, gRPC-based Flight + /// service from a host and port + /// \param[in] host The hostname to connect to + /// \param[in] port The port + /// \return Arrow result with the resulting location + static arrow::Result ForGrpcTcp(const std::string& host, const int port); + + /// \brief Initialize a location for a TLS-enabled, gRPC-based Flight + /// service from a host and port + /// \param[in] host The hostname to connect to + /// \param[in] port The port + /// \return Arrow result with the resulting location + static arrow::Result ForGrpcTls(const std::string& host, const int port); + + /// \brief Initialize a location for a domain socket-based Flight + /// service + /// \param[in] path The path to the domain socket + /// \return Arrow result with the resulting location + static arrow::Result ForGrpcUnix(const std::string& path); + + /// \brief Initialize a location based on a URI scheme + static arrow::Result ForScheme(const std::string& scheme, + const std::string& host, const int port); + + /// \brief Get the scheme of this URI. + std::string scheme() const; + + /// \brief Get a representation of this URI as a string. 
+ std::string ToString() const; + bool Equals(const Location& other) const; + + using SuperT::Deserialize; + using SuperT::SerializeToString; + + /// \brief Serialize this message to its wire-format representation. + /// + /// Use `SerializeToString()` if you want a Result-returning version. + arrow::Status SerializeToString(std::string* out) const; + + /// \brief Deserialize this message from its wire-format representation. + /// + /// Use `Deserialize(serialized)` if you want a Result-returning version. + static arrow::Status Deserialize(std::string_view serialized, Location* out); + + private: + friend class FlightClient; + friend class FlightServerBase; + std::shared_ptr uri_; +}; + +/// \brief A flight ticket and list of locations where the ticket can be +/// redeemed +struct ARROW_FLIGHT_EXPORT FlightEndpoint : public internal::BaseType { + /// Opaque ticket identify; use with DoGet RPC + Ticket ticket; + + /// List of locations where ticket can be redeemed. If the list is empty, the + /// ticket can only be redeemed on the current service where the ticket was + /// generated + std::vector locations; + + /// Expiration time of this stream. If present, clients may assume + /// they can retry DoGet requests. Otherwise, clients should avoid + /// retrying DoGet requests. + std::optional expiration_time; + + /// Opaque Application-defined metadata + std::string app_metadata; + + FlightEndpoint() = default; + FlightEndpoint(Ticket ticket, std::vector locations, + std::optional expiration_time, std::string app_metadata) + : ticket(std::move(ticket)), + locations(std::move(locations)), + expiration_time(expiration_time), + app_metadata(std::move(app_metadata)) {} + + std::string ToString() const; + bool Equals(const FlightEndpoint& other) const; + + using SuperT::Deserialize; + using SuperT::SerializeToString; + + /// \brief Serialize this message to its wire-format representation. + /// + /// Use `SerializeToString()` if you want a Result-returning version. 
+ arrow::Status SerializeToString(std::string* out) const; + + /// \brief Deserialize this message from its wire-format representation. + /// + /// Use `Deserialize(serialized)` if you want a Result-returning version. + static arrow::Status Deserialize(std::string_view serialized, FlightEndpoint* out); +}; + /// \brief The access coordinates for retrieval of a dataset, returned by /// GetFlightInfo class ARROW_FLIGHT_EXPORT FlightInfo @@ -493,12 +639,21 @@ class ARROW_FLIGHT_EXPORT FlightInfo bool ordered = false, std::string app_metadata = ""); + /// \brief Factory method to construct a FlightInfo. + static arrow::Result Make(const std::shared_ptr& schema, + const FlightDescriptor& descriptor, + const std::vector& endpoints, + int64_t total_records, int64_t total_bytes, + bool ordered = false, + std::string app_metadata = ""); + /// \brief Deserialize the Arrow schema of the dataset. Populate any /// dictionary encoded fields into a DictionaryMemo for /// bookkeeping /// \param[in,out] dictionary_memo for dictionary bookkeeping, will /// be modified - /// \return Arrow result with the reconstructed Schema + /// \return Arrow result with the reconstructed Schema. Note that the schema + /// may be nullptr, as the schema is optional. 
arrow::Result> GetSchema( ipc::DictionaryMemo* dictionary_memo) const; @@ -703,151 +858,6 @@ struct ARROW_FLIGHT_EXPORT CancelFlightInfoResult ARROW_FLIGHT_EXPORT std::ostream& operator<<(std::ostream& os, CancelStatus status); -/// \brief Data structure providing an opaque identifier or credential to use -/// when requesting a data stream with the DoGet RPC -struct ARROW_FLIGHT_EXPORT Ticket : public internal::BaseType { - std::string ticket; - - Ticket() = default; - Ticket(std::string ticket) // NOLINT runtime/explicit - : ticket(std::move(ticket)) {} - - std::string ToString() const; - bool Equals(const Ticket& other) const; - - using SuperT::Deserialize; - using SuperT::SerializeToString; - - /// \brief Get the wire-format representation of this type. - /// - /// Useful when interoperating with non-Flight systems (e.g. REST - /// services) that may want to return Flight types. - /// - /// Use `SerializeToString()` if you want a Result-returning version. - arrow::Status SerializeToString(std::string* out) const; - - /// \brief Parse the wire-format representation of this type. - /// - /// Useful when interoperating with non-Flight systems (e.g. REST - /// services) that may want to return Flight types. - /// - /// Use `Deserialize(serialized)` if you want a Result-returning version. - static arrow::Status Deserialize(std::string_view serialized, Ticket* out); -}; - -/// \brief A host location (a URI) -struct ARROW_FLIGHT_EXPORT Location : public internal::BaseType { - public: - /// \brief Initialize a blank location. - Location(); - - ~Location(); - - /// \brief Initialize a location by parsing a URI string - static arrow::Result Parse(const std::string& uri_string); - - /// \brief Get the fallback URI. - /// - /// arrow-flight-reuse-connection://? 
means that a client may attempt to - /// reuse an existing connection to a Flight service to fetch data instead - /// of creating a new connection to one of the other locations listed in a - /// FlightEndpoint response. - static const Location& ReuseConnection(); - - /// \brief Initialize a location for a non-TLS, gRPC-based Flight - /// service from a host and port - /// \param[in] host The hostname to connect to - /// \param[in] port The port - /// \return Arrow result with the resulting location - static arrow::Result ForGrpcTcp(const std::string& host, const int port); - - /// \brief Initialize a location for a TLS-enabled, gRPC-based Flight - /// service from a host and port - /// \param[in] host The hostname to connect to - /// \param[in] port The port - /// \return Arrow result with the resulting location - static arrow::Result ForGrpcTls(const std::string& host, const int port); - - /// \brief Initialize a location for a domain socket-based Flight - /// service - /// \param[in] path The path to the domain socket - /// \return Arrow result with the resulting location - static arrow::Result ForGrpcUnix(const std::string& path); - - /// \brief Initialize a location based on a URI scheme - static arrow::Result ForScheme(const std::string& scheme, - const std::string& host, const int port); - - /// \brief Get the scheme of this URI. - std::string scheme() const; - - /// \brief Get a representation of this URI as a string. - std::string ToString() const; - bool Equals(const Location& other) const; - - using SuperT::Deserialize; - using SuperT::SerializeToString; - - /// \brief Serialize this message to its wire-format representation. - /// - /// Use `SerializeToString()` if you want a Result-returning version. - arrow::Status SerializeToString(std::string* out) const; - - /// \brief Deserialize this message from its wire-format representation. - /// - /// Use `Deserialize(serialized)` if you want a Result-returning version. 
- static arrow::Status Deserialize(std::string_view serialized, Location* out); - - private: - friend class FlightClient; - friend class FlightServerBase; - std::shared_ptr uri_; -}; - -/// \brief A flight ticket and list of locations where the ticket can be -/// redeemed -struct ARROW_FLIGHT_EXPORT FlightEndpoint : public internal::BaseType { - /// Opaque ticket identify; use with DoGet RPC - Ticket ticket; - - /// List of locations where ticket can be redeemed. If the list is empty, the - /// ticket can only be redeemed on the current service where the ticket was - /// generated - std::vector locations; - - /// Expiration time of this stream. If present, clients may assume - /// they can retry DoGet requests. Otherwise, clients should avoid - /// retrying DoGet requests. - std::optional expiration_time; - - /// Opaque Application-defined metadata - std::string app_metadata; - - FlightEndpoint() = default; - FlightEndpoint(Ticket ticket, std::vector locations, - std::optional expiration_time, std::string app_metadata) - : ticket(std::move(ticket)), - locations(std::move(locations)), - expiration_time(expiration_time), - app_metadata(std::move(app_metadata)) {} - - std::string ToString() const; - bool Equals(const FlightEndpoint& other) const; - - using SuperT::Deserialize; - using SuperT::SerializeToString; - - /// \brief Serialize this message to its wire-format representation. - /// - /// Use `SerializeToString()` if you want a Result-returning version. - arrow::Status SerializeToString(std::string* out) const; - - /// \brief Deserialize this message from its wire-format representation. - /// - /// Use `Deserialize(serialized)` if you want a Result-returning version. - static arrow::Status Deserialize(std::string_view serialized, FlightEndpoint* out); -}; - /// \brief The request of the RenewFlightEndpoint action. 
struct ARROW_FLIGHT_EXPORT RenewFlightEndpointRequest : public internal::BaseType { @@ -1170,6 +1180,9 @@ class ARROW_FLIGHT_EXPORT MetadataRecordBatchReader { /// \brief Consume entire stream as a Table virtual arrow::Result> ToTable(); + + /// \brief Return current read statistics + virtual arrow::ipc::ReadStats stats() const = 0; }; /// \brief Convert a MetadataRecordBatchReader to a regular RecordBatchReader. diff --git a/cpp/src/arrow/gpu/ArrowCUDAConfig.cmake.in b/cpp/src/arrow/gpu/ArrowCUDAConfig.cmake.in index 626c536a08d..5336b770161 100644 --- a/cpp/src/arrow/gpu/ArrowCUDAConfig.cmake.in +++ b/cpp/src/arrow/gpu/ArrowCUDAConfig.cmake.in @@ -27,7 +27,7 @@ @PACKAGE_INIT@ include(CMakeFindDependencyMacro) -find_dependency(Arrow) +find_dependency(Arrow CONFIG) if(CMAKE_VERSION VERSION_LESS 3.17) find_package(CUDA REQUIRED) add_library(ArrowCUDA::cuda_driver SHARED IMPORTED) diff --git a/cpp/src/arrow/gpu/cuda_memory.cc b/cpp/src/arrow/gpu/cuda_memory.cc index 9d71678ff3b..735a5dacc2b 100644 --- a/cpp/src/arrow/gpu/cuda_memory.cc +++ b/cpp/src/arrow/gpu/cuda_memory.cc @@ -206,7 +206,7 @@ CudaHostBuffer::CudaHostBuffer(uint8_t* data, const int64_t size) CudaHostBuffer::~CudaHostBuffer() { auto maybe_manager = CudaDeviceManager::Instance(); - ARROW_CHECK_OK(maybe_manager.status()); + ARROW_CHECK_OK(maybe_manager); ARROW_CHECK_OK((*maybe_manager)->FreeHost(const_cast(data_), size_)); } diff --git a/cpp/src/arrow/integration/meson.build b/cpp/src/arrow/integration/meson.build index f63c6e0745d..edc9fdf724f 100644 --- a/cpp/src/arrow/integration/meson.build +++ b/cpp/src/arrow/integration/meson.build @@ -20,8 +20,7 @@ install_headers(['json_integration.h']) exc = executable( 'arrow-json-integration-test', sources: ['json_integration_test.cc'], - dependencies: [arrow_dep, rapidjson_dep, gflags_dep, gtest_dep], - link_with: [arrow_test_lib], + dependencies: [arrow_test_dep_no_main, rapidjson_dep, gflags_dep], ) if needs_tests diff --git 
a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index f7afbca5580..623fcde413d 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -23,14 +23,22 @@ add_arrow_test(compressed_test PREFIX "arrow-io") add_arrow_test(file_test PREFIX "arrow-io") if(ARROW_HDFS) + set(HDFS_TEST_EXTRA_LINK_LIBS arrow::hadoop) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9") + list(APPEND HDFS_TEST_EXTRA_LINK_LIBS stdc++fs) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8") + list(APPEND HDFS_TEST_EXTRA_LINK_LIBS c++fs) + endif() + endif() add_arrow_test(hdfs_test NO_VALGRIND PREFIX "arrow-io" EXTRA_LINK_LIBS - arrow::hadoop - Boost::filesystem - Boost::system) + ${HDFS_TEST_EXTRA_LINK_LIBS}) endif() add_arrow_test(memory_test PREFIX "arrow-io") diff --git a/cpp/src/arrow/io/caching.cc b/cpp/src/arrow/io/caching.cc index be20abdf91d..74e98170ad0 100644 --- a/cpp/src/arrow/io/caching.cc +++ b/cpp/src/arrow/io/caching.cc @@ -188,7 +188,12 @@ struct ReadRangeCache::Impl { entries = std::move(new_entries); } // Prefetch immediately, regardless of executor availability, if possible - return file->WillNeed(ranges); + auto st = file->WillNeed(ranges); + // As this is optimisation only, I/O failures should not be treated as fatal + if (st.IsIOError()) { + return Status::OK(); + } + return st; } // Read the given range from the cache, blocking if needed. 
Cannot read a range diff --git a/cpp/src/arrow/io/file_test.cc b/cpp/src/arrow/io/file_test.cc index 44a63e9fdfa..81ae716ef67 100644 --- a/cpp/src/arrow/io/file_test.cc +++ b/cpp/src/arrow/io/file_test.cc @@ -434,7 +434,7 @@ TEST_F(TestReadableFile, NonexistentFile) { auto maybe_file = ReadableFile::Open(path); ASSERT_RAISES(IOError, maybe_file); std::string message = maybe_file.status().message(); - ASSERT_NE(std::string::npos, message.find(path)); + ASSERT_NE(std::string::npos, message.find(path)) << message; } class MyMemoryPool : public MemoryPool { diff --git a/cpp/src/arrow/io/hdfs_internal.cc b/cpp/src/arrow/io/hdfs_internal.cc index 4a88b9a6be6..ab9f8e35026 100644 --- a/cpp/src/arrow/io/hdfs_internal.cc +++ b/cpp/src/arrow/io/hdfs_internal.cc @@ -58,7 +58,7 @@ namespace io::internal { namespace { template -Status SetSymbol(void* handle, char const* name, T** symbol) { +Status SetSymbol(void* handle, const char* name, T** symbol) { if (*symbol != nullptr) return Status::OK(); auto maybe_symbol = ::arrow::internal::GetSymbolAs(handle, name); diff --git a/cpp/src/arrow/io/hdfs_test.cc b/cpp/src/arrow/io/hdfs_test.cc index 6b989378c12..4f0a280b768 100644 --- a/cpp/src/arrow/io/hdfs_test.cc +++ b/cpp/src/arrow/io/hdfs_test.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include // IWYU pragma: keep @@ -36,11 +37,6 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" -// boost/filesystem.hpp should be included after -// arrow/util/windows_compatibility.h because boost/filesystem.hpp -// includes windows.h implicitly. 
-#include // NOLINT - namespace arrow { namespace io { @@ -90,9 +86,12 @@ class TestHadoopFileSystem : public ::testing::Test { client_ = nullptr; scratch_dir_ = - boost::filesystem::unique_path(boost::filesystem::temp_directory_path() / - "arrow-hdfs/scratch-%%%%") - .string(); + (std::filesystem::temp_directory_path() / "arrow-hdfs/scratch-").string(); + int random_size = 4; + scratch_dir_.resize(scratch_dir_.size() + random_size, '%'); + random_alnum( + random_size, 0, + reinterpret_cast(&scratch_dir_[scratch_dir_.size() - random_size])); loaded_driver_ = false; diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index 1ae03aeb143..d7b118b3982 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -31,7 +31,7 @@ #include "arrow/util/io_util.h" #include "arrow/util/logging_internal.h" #include "arrow/util/macros.h" -#include "arrow/util/memory.h" +#include "arrow/util/memory_internal.h" namespace arrow { namespace io { diff --git a/cpp/src/arrow/io/memory_test.cc b/cpp/src/arrow/io/memory_test.cc index 03d0e65daee..eabee87146d 100644 --- a/cpp/src/arrow/io/memory_test.cc +++ b/cpp/src/arrow/io/memory_test.cc @@ -681,7 +681,7 @@ TEST(TestInputStreamIterator, Closed) { AssertBufferEqual(*buf, "dat"); // Close stream and read from iterator ASSERT_OK(reader->Close()); - ASSERT_RAISES(Invalid, it.Next().status()); + ASSERT_RAISES(Invalid, it.Next()); } TEST(CoalesceReadRanges, Basics) { diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 9e0b1d723b9..6e73c71d897 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -38,7 +38,6 @@ function(ADD_ARROW_IPC_TEST REL_TEST_NAME) endfunction() add_arrow_test(feather_test) -add_arrow_ipc_test(json_simple_test) add_arrow_ipc_test(message_internal_test) add_arrow_ipc_test(read_write_test) add_arrow_ipc_test(tensor_test) @@ -71,12 +70,7 @@ endif() add_arrow_benchmark(read_write_benchmark PREFIX "arrow-ipc") -if(ARROW_FUZZING 
- OR (ARROW_BUILD_UTILITIES - AND ARROW_TESTING - AND ARROW_WITH_LZ4 - AND ARROW_WITH_ZSTD - )) +if(ARROW_BUILD_FUZZING_UTILITIES) add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc) target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB} ${ARROW_TEST_LINK_LIBS}) diff --git a/cpp/src/arrow/ipc/api.h b/cpp/src/arrow/ipc/api.h index b5690aed8da..3047180fb1a 100644 --- a/cpp/src/arrow/ipc/api.h +++ b/cpp/src/arrow/ipc/api.h @@ -19,7 +19,6 @@ #include "arrow/ipc/dictionary.h" #include "arrow/ipc/feather.h" -#include "arrow/ipc/json_simple.h" #include "arrow/ipc/message.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index ba3f4d828c3..e1dc6046a11 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -319,6 +319,24 @@ TEST_P(TestFeather, SliceBooleanRoundTrip) { CheckSlices(batch); } +TEST_P(TestFeather, SliceListRoundTrip) { + if (GetParam().version == kFeatherV1Version) { + GTEST_SKIP() << "Feather V1 does not support list types"; + } + std::shared_ptr batch; + ASSERT_OK(ipc::test::MakeListRecordBatchSized(600, &batch)); + CheckSlices(batch); +} + +TEST_P(TestFeather, SliceListViewRoundTrip) { + if (GetParam().version == kFeatherV1Version) { + GTEST_SKIP() << "Feather V1 does not support list view types"; + } + std::shared_ptr batch; + ASSERT_OK(ipc::test::MakeListViewRecordBatchSized(600, &batch)); + CheckSlices(batch); +} + INSTANTIATE_TEST_SUITE_P( FeatherTests, TestFeather, ::testing::Values(TestParam(kFeatherV1Version), TestParam(kFeatherV2Version), diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc index 6ccf1155d12..123b6981b28 100644 --- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc @@ -27,9 +27,9 @@ #include "arrow/io/file.h" #include "arrow/io/memory.h" -#include "arrow/ipc/json_simple.h" #include 
"arrow/ipc/test_common.h" #include "arrow/ipc/writer.h" +#include "arrow/json/from_string.h" #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/testing/extension_type.h" @@ -41,7 +41,7 @@ namespace arrow::ipc { using ::arrow::internal::CreateDir; using ::arrow::internal::PlatformFilename; -using internal::json::ArrayFromJSON; +using ::arrow::json::ArrayFromJSONString; Result> MakeExtensionBatch() { auto array = ExampleUuid(); @@ -60,7 +60,7 @@ Result> MakeMapBatch() { [] ] )"; - ARROW_ASSIGN_OR_RAISE(array, ArrayFromJSON(map(int16(), int32()), json_input)); + ARROW_ASSIGN_OR_RAISE(array, ArrayFromJSONString(map(int16(), int32()), json_input)); auto schema = ::arrow::schema({field("f0", array->type())}); return RecordBatch::Make(schema, array->length(), {array}); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc deleted file mode 100644 index 19f0a6ae1e1..00000000000 --- a/cpp/src/arrow/ipc/json_simple.cc +++ /dev/null @@ -1,1080 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include -#include -#include -#include -#include - -#include "arrow/array/array_dict.h" -#include "arrow/array/builder_binary.h" -#include "arrow/array/builder_decimal.h" -#include "arrow/array/builder_dict.h" -#include "arrow/array/builder_nested.h" -#include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_time.h" -#include "arrow/array/builder_union.h" -#include "arrow/chunked_array.h" -#include "arrow/ipc/json_simple.h" -#include "arrow/scalar.h" -#include "arrow/type_traits.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/decimal.h" -#include "arrow/util/float16.h" -#include "arrow/util/logging_internal.h" -#include "arrow/util/value_parsing.h" - -#include "arrow/json/rapidjson_defs.h" - -#include -#include -#include -#include -#include - -namespace rj = arrow::rapidjson; - -namespace arrow { - -using internal::ParseValue; -using util::Float16; - -namespace ipc { -namespace internal { -namespace json { - -using ::arrow::internal::checked_cast; -using ::arrow::internal::checked_pointer_cast; - -namespace { - -constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; - -const char* JsonTypeName(rj::Type json_type) { - switch (json_type) { - case rapidjson::kNullType: - return "null"; - case rapidjson::kFalseType: - return "false"; - case rapidjson::kTrueType: - return "true"; - case rapidjson::kObjectType: - return "object"; - case rapidjson::kArrayType: - return "array"; - case rapidjson::kStringType: - return "string"; - case rapidjson::kNumberType: - return "number"; - default: - return "unknown"; - } -} - -Status JSONTypeError(const char* expected_type, rj::Type json_type) { - return Status::Invalid("Expected ", expected_type, " or null, got JSON type ", - JsonTypeName(json_type)); -} - -class Converter { - public: - virtual ~Converter() = default; - - virtual Status Init() { return Status::OK(); } - - virtual Status AppendValue(const rj::Value& json_obj) = 0; - - Status AppendNull() { 
return this->builder()->AppendNull(); } - - virtual Status AppendValues(const rj::Value& json_array) = 0; - - virtual std::shared_ptr builder() = 0; - - virtual Status Finish(std::shared_ptr* out) { - auto builder = this->builder(); - if (builder->length() == 0) { - // Make sure the builder was initialized - RETURN_NOT_OK(builder->Resize(1)); - } - return builder->Finish(out); - } - - protected: - std::shared_ptr type_; -}; - -Status GetConverter(const std::shared_ptr&, std::shared_ptr* out); - -// CRTP -template -class ConcreteConverter : public Converter { - public: - Result SizeOfJSONArray(const rj::Value& json_obj) { - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - return json_obj.Size(); - } - - Status AppendValues(const rj::Value& json_array) final { - auto self = static_cast(this); - ARROW_ASSIGN_OR_RAISE(auto size, SizeOfJSONArray(json_array)); - for (uint32_t i = 0; i < size; ++i) { - RETURN_NOT_OK(self->AppendValue(json_array[i])); - } - return Status::OK(); - } - - const std::shared_ptr& value_type() { - if (type_->id() != Type::DICTIONARY) { - return type_; - } - return checked_cast(*type_).value_type(); - } - - template - Status MakeConcreteBuilder(std::shared_ptr* out) { - std::unique_ptr builder; - RETURN_NOT_OK(MakeBuilder(default_memory_pool(), this->type_, &builder)); - *out = checked_pointer_cast(std::move(builder)); - DCHECK(*out); - return Status::OK(); - } -}; - -// ------------------------------------------------------------------------ -// Converter for null arrays - -class NullConverter final : public ConcreteConverter { - public: - explicit NullConverter(const std::shared_ptr& type) { - type_ = type; - builder_ = std::make_shared(); - } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return AppendNull(); - } - return JSONTypeError("null", json_obj.GetType()); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr 
builder_; -}; - -// ------------------------------------------------------------------------ -// Converter for boolean arrays - -class BooleanConverter final : public ConcreteConverter { - public: - explicit BooleanConverter(const std::shared_ptr& type) { - type_ = type; - builder_ = std::make_shared(); - } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return AppendNull(); - } - if (json_obj.IsBool()) { - return builder_->Append(json_obj.GetBool()); - } - if (json_obj.IsInt()) { - return builder_->Append(json_obj.GetInt() != 0); - } - return JSONTypeError("boolean", json_obj.GetType()); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; -}; - -// ------------------------------------------------------------------------ -// Helpers for numeric converters - -// Convert single signed integer value (also {Date,Time}{32,64} and Timestamp) -template -enable_if_physical_signed_integer ConvertNumber(const rj::Value& json_obj, - const DataType& type, - typename T::c_type* out) { - if (json_obj.IsInt64()) { - int64_t v64 = json_obj.GetInt64(); - *out = static_cast(v64); - if (*out == v64) { - return Status::OK(); - } else { - return Status::Invalid("Value ", v64, " out of bounds for ", type); - } - } else { - *out = static_cast(0); - return JSONTypeError("signed int", json_obj.GetType()); - } -} - -// Convert single unsigned integer value -template -enable_if_unsigned_integer ConvertNumber(const rj::Value& json_obj, - const DataType& type, - typename T::c_type* out) { - if (json_obj.IsUint64()) { - uint64_t v64 = json_obj.GetUint64(); - *out = static_cast(v64); - if (*out == v64) { - return Status::OK(); - } else { - return Status::Invalid("Value ", v64, " out of bounds for ", type); - } - } else { - *out = static_cast(0); - return JSONTypeError("unsigned int", json_obj.GetType()); - } -} - -// Convert float16/HalfFloatType -template -enable_if_half_float ConvertNumber(const 
rj::Value& json_obj, - const DataType& type, uint16_t* out) { - if (json_obj.IsDouble()) { - double f64 = json_obj.GetDouble(); - *out = Float16(f64).bits(); - return Status::OK(); - } else if (json_obj.IsUint()) { - uint32_t u32t = json_obj.GetUint(); - double f64 = static_cast(u32t); - *out = Float16(f64).bits(); - return Status::OK(); - } else if (json_obj.IsInt()) { - int32_t i32t = json_obj.GetInt(); - double f64 = static_cast(i32t); - *out = Float16(f64).bits(); - return Status::OK(); - } else { - *out = static_cast(0); - return JSONTypeError("unsigned int", json_obj.GetType()); - } -} - -// Convert single floating point value -template -enable_if_physical_floating_point ConvertNumber(const rj::Value& json_obj, - const DataType& type, - typename T::c_type* out) { - if (json_obj.IsNumber()) { - *out = static_cast(json_obj.GetDouble()); - return Status::OK(); - } else { - *out = static_cast(0); - return JSONTypeError("number", json_obj.GetType()); - } -} - -// ------------------------------------------------------------------------ -// Converter for int arrays - -template ::BuilderType> -class IntegerConverter final - : public ConcreteConverter> { - using c_type = typename Type::c_type; - - static constexpr auto is_signed = std::is_signed::value; - - public: - explicit IntegerConverter(const std::shared_ptr& type) { this->type_ = type; } - - Status Init() override { return this->MakeConcreteBuilder(&builder_); } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - c_type value; - RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); - return builder_->Append(value); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; -}; - -// ------------------------------------------------------------------------ -// Converter for float arrays - -template ::BuilderType> -class FloatConverter final : public ConcreteConverter> { - using c_type = 
typename Type::c_type; - - public: - explicit FloatConverter(const std::shared_ptr& type) { this->type_ = type; } - - Status Init() override { return this->MakeConcreteBuilder(&builder_); } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - c_type value; - RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); - return builder_->Append(value); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; -}; - -// ------------------------------------------------------------------------ -// Converter for decimal arrays - -template -class DecimalConverter final - : public ConcreteConverter< - DecimalConverter> { - public: - explicit DecimalConverter(const std::shared_ptr& type) { - this->type_ = type; - decimal_type_ = &checked_cast(*this->value_type()); - } - - Status Init() override { return this->MakeConcreteBuilder(&builder_); } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - if (json_obj.IsString()) { - int32_t precision, scale; - DecimalValue d; - auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); - RETURN_NOT_OK(DecimalValue::FromString(view, &d, &precision, &scale)); - if (scale != decimal_type_->scale()) { - return Status::Invalid("Invalid scale for decimal: expected ", - decimal_type_->scale(), ", got ", scale); - } - return builder_->Append(d); - } - return JSONTypeError("decimal string", json_obj.GetType()); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; - const DecimalSubtype* decimal_type_; -}; - -template ::BuilderType> -using Decimal32Converter = DecimalConverter; -template ::BuilderType> -using Decimal64Converter = DecimalConverter; -template ::BuilderType> -using Decimal128Converter = DecimalConverter; -template ::BuilderType> -using Decimal256Converter = 
DecimalConverter; - -// ------------------------------------------------------------------------ -// Converter for timestamp arrays - -class TimestampConverter final : public ConcreteConverter { - public: - explicit TimestampConverter(const std::shared_ptr& type) - : timestamp_type_{checked_cast(type.get())} { - this->type_ = type; - builder_ = std::make_shared(type, default_memory_pool()); - } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - int64_t value; - if (json_obj.IsNumber()) { - RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); - } else if (json_obj.IsString()) { - std::string_view view(json_obj.GetString(), json_obj.GetStringLength()); - if (!ParseValue(*timestamp_type_, view.data(), view.size(), &value)) { - return Status::Invalid("couldn't parse timestamp from ", view); - } - } else { - return JSONTypeError("timestamp", json_obj.GetType()); - } - return builder_->Append(value); - } - - std::shared_ptr builder() override { return builder_; } - - private: - const TimestampType* timestamp_type_; - std::shared_ptr builder_; -}; - -// ------------------------------------------------------------------------ -// Converter for day-time interval arrays - -class DayTimeIntervalConverter final - : public ConcreteConverter { - public: - explicit DayTimeIntervalConverter(const std::shared_ptr& type) { - this->type_ = type; - builder_ = std::make_shared(default_memory_pool()); - } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - DayTimeIntervalType::DayMilliseconds value; - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - if (json_obj.Size() != 2) { - return Status::Invalid( - "day time interval pair must have exactly two elements, had ", json_obj.Size()); - } - RETURN_NOT_OK(ConvertNumber(json_obj[0], *this->type_, &value.days)); - RETURN_NOT_OK( - 
ConvertNumber(json_obj[1], *this->type_, &value.milliseconds)); - return builder_->Append(value); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; -}; - -class MonthDayNanoIntervalConverter final - : public ConcreteConverter { - public: - explicit MonthDayNanoIntervalConverter(const std::shared_ptr& type) { - this->type_ = type; - builder_ = std::make_shared(default_memory_pool()); - } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - MonthDayNanoIntervalType::MonthDayNanos value; - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - if (json_obj.Size() != 3) { - return Status::Invalid( - "month_day_nano_interval must have exactly 3 elements, had ", json_obj.Size()); - } - RETURN_NOT_OK(ConvertNumber(json_obj[0], *this->type_, &value.months)); - RETURN_NOT_OK(ConvertNumber(json_obj[1], *this->type_, &value.days)); - RETURN_NOT_OK( - ConvertNumber(json_obj[2], *this->type_, &value.nanoseconds)); - - return builder_->Append(value); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; -}; - -// ------------------------------------------------------------------------ -// Converter for binary and string arrays - -template ::BuilderType> -class StringConverter final - : public ConcreteConverter> { - public: - explicit StringConverter(const std::shared_ptr& type) { this->type_ = type; } - - Status Init() override { return this->MakeConcreteBuilder(&builder_); } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - if (json_obj.IsString()) { - auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); - return builder_->Append(view); - } else { - return JSONTypeError("string", json_obj.GetType()); - } - } - - std::shared_ptr builder() override { return builder_; } - - 
private: - std::shared_ptr builder_; -}; - -// ------------------------------------------------------------------------ -// Converter for fixed-size binary arrays - -template ::BuilderType> -class FixedSizeBinaryConverter final - : public ConcreteConverter> { - public: - explicit FixedSizeBinaryConverter(const std::shared_ptr& type) { - this->type_ = type; - } - - Status Init() override { return this->MakeConcreteBuilder(&builder_); } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - if (json_obj.IsString()) { - auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); - if (view.length() != static_cast(builder_->byte_width())) { - std::stringstream ss; - ss << "Invalid string length " << view.length() << " in JSON input for " - << this->type_->ToString(); - return Status::Invalid(ss.str()); - } - return builder_->Append(view); - } else { - return JSONTypeError("string", json_obj.GetType()); - } - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; -}; - -// ------------------------------------------------------------------------ -// Converter for list arrays - -template -class VarLengthListLikeConverter final - : public ConcreteConverter> { - public: - using BuilderType = typename TypeTraits::BuilderType; - - explicit VarLengthListLikeConverter(const std::shared_ptr& type) { - this->type_ = type; - } - - Status Init() override { - const auto& var_length_list_like_type = checked_cast(*this->type_); - RETURN_NOT_OK( - GetConverter(var_length_list_like_type.value_type(), &child_converter_)); - auto child_builder = child_converter_->builder(); - builder_ = - std::make_shared(default_memory_pool(), child_builder, this->type_); - return Status::OK(); - } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - // Extend the child converter with this JSON array - 
ARROW_ASSIGN_OR_RAISE(auto size, this->SizeOfJSONArray(json_obj)); - RETURN_NOT_OK(builder_->Append(true, size)); - return child_converter_->AppendValues(json_obj); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; - std::shared_ptr child_converter_; -}; - -// ------------------------------------------------------------------------ -// Converter for map arrays - -class MapConverter final : public ConcreteConverter { - public: - explicit MapConverter(const std::shared_ptr& type) { type_ = type; } - - Status Init() override { - const auto& map_type = checked_cast(*type_); - RETURN_NOT_OK(GetConverter(map_type.key_type(), &key_converter_)); - RETURN_NOT_OK(GetConverter(map_type.item_type(), &item_converter_)); - auto key_builder = key_converter_->builder(); - auto item_builder = item_converter_->builder(); - builder_ = std::make_shared(default_memory_pool(), key_builder, - item_builder, type_); - return Status::OK(); - } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - RETURN_NOT_OK(builder_->Append()); - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - auto size = json_obj.Size(); - for (uint32_t i = 0; i < size; ++i) { - const auto& json_pair = json_obj[i]; - if (!json_pair.IsArray()) { - return JSONTypeError("array", json_pair.GetType()); - } - if (json_pair.Size() != 2) { - return Status::Invalid("key item pair must have exactly two elements, had ", - json_pair.Size()); - } - if (json_pair[0].IsNull()) { - return Status::Invalid("null key is invalid"); - } - RETURN_NOT_OK(key_converter_->AppendValue(json_pair[0])); - RETURN_NOT_OK(item_converter_->AppendValue(json_pair[1])); - } - return Status::OK(); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; - std::shared_ptr key_converter_, item_converter_; -}; - -// 
------------------------------------------------------------------------ -// Converter for fixed size list arrays - -class FixedSizeListConverter final : public ConcreteConverter { - public: - explicit FixedSizeListConverter(const std::shared_ptr& type) { type_ = type; } - - Status Init() override { - const auto& list_type = checked_cast(*type_); - list_size_ = list_type.list_size(); - RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); - auto child_builder = child_converter_->builder(); - builder_ = std::make_shared(default_memory_pool(), - child_builder, type_); - return Status::OK(); - } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - RETURN_NOT_OK(builder_->Append()); - // Extend the child converter with this JSON array - RETURN_NOT_OK(child_converter_->AppendValues(json_obj)); - if (json_obj.GetArray().Size() != static_cast(list_size_)) { - return Status::Invalid("incorrect list size ", json_obj.GetArray().Size()); - } - return Status::OK(); - } - - std::shared_ptr builder() override { return builder_; } - - private: - int32_t list_size_; - std::shared_ptr builder_; - std::shared_ptr child_converter_; -}; - -// ------------------------------------------------------------------------ -// Converter for struct arrays - -class StructConverter final : public ConcreteConverter { - public: - explicit StructConverter(const std::shared_ptr& type) { type_ = type; } - - Status Init() override { - std::vector> child_builders; - for (const auto& field : type_->fields()) { - std::shared_ptr child_converter; - RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); - child_converters_.push_back(child_converter); - child_builders.push_back(child_converter->builder()); - } - builder_ = std::make_shared(type_, default_memory_pool(), - std::move(child_builders)); - return Status::OK(); - } - - // Append a JSON value that is either an array of N elements in order - // or an object 
mapping struct names to values (omitted struct members - // are mapped to null). - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - if (json_obj.IsArray()) { - auto size = json_obj.Size(); - auto expected_size = static_cast(type_->num_fields()); - if (size != expected_size) { - return Status::Invalid("Expected array of size ", expected_size, - ", got array of size ", size); - } - for (uint32_t i = 0; i < size; ++i) { - RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); - } - return builder_->Append(); - } - if (json_obj.IsObject()) { - auto remaining = json_obj.MemberCount(); - auto num_children = type_->num_fields(); - for (int32_t i = 0; i < num_children; ++i) { - const auto& field = type_->field(i); - auto it = json_obj.FindMember(field->name()); - if (it != json_obj.MemberEnd()) { - --remaining; - RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value)); - } else { - RETURN_NOT_OK(child_converters_[i]->AppendNull()); - } - } - if (remaining > 0) { - rj::StringBuffer sb; - rj::Writer writer(sb); - json_obj.Accept(writer); - return Status::Invalid("Unexpected members in JSON object for type ", - type_->ToString(), " Object: ", sb.GetString()); - } - return builder_->Append(); - } - return JSONTypeError("array or object", json_obj.GetType()); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; - std::vector> child_converters_; -}; - -// ------------------------------------------------------------------------ -// Converter for union arrays - -class UnionConverter final : public ConcreteConverter { - public: - explicit UnionConverter(const std::shared_ptr& type) { type_ = type; } - - Status Init() override { - auto union_type = checked_cast(type_.get()); - mode_ = union_type->mode(); - type_id_to_child_num_.clear(); - type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); - int child_i = 0; - for (auto type_id : 
union_type->type_codes()) { - type_id_to_child_num_[type_id] = child_i++; - } - std::vector> child_builders; - for (const auto& field : type_->fields()) { - std::shared_ptr child_converter; - RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); - child_converters_.push_back(child_converter); - child_builders.push_back(child_converter->builder()); - } - if (mode_ == UnionMode::DENSE) { - builder_ = std::make_shared(default_memory_pool(), - std::move(child_builders), type_); - } else { - builder_ = std::make_shared(default_memory_pool(), - std::move(child_builders), type_); - } - return Status::OK(); - } - - // Append a JSON value that must be a 2-long array, containing the type_id - // and value of the UnionArray's slot. - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return this->AppendNull(); - } - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - if (json_obj.Size() != 2) { - return Status::Invalid("Expected [type_id, value] pair, got array of size ", - json_obj.Size()); - } - const auto& id_obj = json_obj[0]; - if (!id_obj.IsInt()) { - return JSONTypeError("int", id_obj.GetType()); - } - - auto id = static_cast(id_obj.GetInt()); - auto child_num = type_id_to_child_num_[id]; - if (child_num == -1) { - return Status::Invalid("type_id ", id, " not found in ", *type_); - } - - auto child_converter = child_converters_[child_num]; - if (mode_ == UnionMode::SPARSE) { - RETURN_NOT_OK(checked_cast(*builder_).Append(id)); - for (auto&& other_converter : child_converters_) { - if (other_converter != child_converter) { - RETURN_NOT_OK(other_converter->AppendNull()); - } - } - } else { - RETURN_NOT_OK(checked_cast(*builder_).Append(id)); - } - return child_converter->AppendValue(json_obj[1]); - } - - std::shared_ptr builder() override { return builder_; } - - private: - UnionMode::type mode_; - std::shared_ptr builder_; - std::vector> child_converters_; - std::vector type_id_to_child_num_; 
-}; - -// ------------------------------------------------------------------------ -// General conversion functions - -Status ConversionNotImplemented(const std::shared_ptr& type) { - return Status::NotImplemented("JSON conversion to ", type->ToString(), - " not implemented"); -} - -Status GetDictConverter(const std::shared_ptr& type, - std::shared_ptr* out) { - std::shared_ptr res; - - const auto value_type = checked_cast(*type).value_type(); - -#define SIMPLE_CONVERTER_CASE(ID, CLASS, TYPE) \ - case ID: \ - res = std::make_shared>>(type); \ - break; - -#define PARAM_CONVERTER_CASE(ID, CLASS, TYPE) \ - case ID: \ - res = std::make_shared>>(type); \ - break; - - switch (value_type->id()) { - PARAM_CONVERTER_CASE(Type::INT8, IntegerConverter, Int8Type) - PARAM_CONVERTER_CASE(Type::INT16, IntegerConverter, Int16Type) - PARAM_CONVERTER_CASE(Type::INT32, IntegerConverter, Int32Type) - PARAM_CONVERTER_CASE(Type::INT64, IntegerConverter, Int64Type) - PARAM_CONVERTER_CASE(Type::UINT8, IntegerConverter, UInt8Type) - PARAM_CONVERTER_CASE(Type::UINT16, IntegerConverter, UInt16Type) - PARAM_CONVERTER_CASE(Type::UINT32, IntegerConverter, UInt32Type) - PARAM_CONVERTER_CASE(Type::UINT64, IntegerConverter, UInt64Type) - PARAM_CONVERTER_CASE(Type::FLOAT, FloatConverter, FloatType) - PARAM_CONVERTER_CASE(Type::DOUBLE, FloatConverter, DoubleType) - PARAM_CONVERTER_CASE(Type::STRING, StringConverter, StringType) - PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) - PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) - PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) - PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType) - PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType) - SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, - FixedSizeBinaryType) - SIMPLE_CONVERTER_CASE(Type::DECIMAL32, Decimal32Converter, Decimal32Type) - 
SIMPLE_CONVERTER_CASE(Type::DECIMAL64, Decimal64Converter, Decimal64Type) - SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) - SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter, Decimal256Type) - default: - return ConversionNotImplemented(type); - } - -#undef SIMPLE_CONVERTER_CASE -#undef PARAM_CONVERTER_CASE - - RETURN_NOT_OK(res->Init()); - *out = res; - return Status::OK(); -} - -Status GetConverter(const std::shared_ptr& type, - std::shared_ptr* out) { - if (type->id() == Type::DICTIONARY) { - return GetDictConverter(type, out); - } - - std::shared_ptr res; - -#define SIMPLE_CONVERTER_CASE(ID, CLASS) \ - case ID: \ - res = std::make_shared(type); \ - break; - - switch (type->id()) { - SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::UINT64, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) - SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::DURATION, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::NA, NullConverter) - SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter) - SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) - SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) - SIMPLE_CONVERTER_CASE(Type::LIST, VarLengthListLikeConverter) - SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, VarLengthListLikeConverter) - SIMPLE_CONVERTER_CASE(Type::LIST_VIEW, VarLengthListLikeConverter) - 
SIMPLE_CONVERTER_CASE(Type::LARGE_LIST_VIEW, - VarLengthListLikeConverter) - SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) - SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) - SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) - SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) - SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) - SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) - SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) - SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter) - SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter) - SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) - SIMPLE_CONVERTER_CASE(Type::DECIMAL32, Decimal32Converter<>) - SIMPLE_CONVERTER_CASE(Type::DECIMAL64, Decimal64Converter<>) - SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) - SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) - SIMPLE_CONVERTER_CASE(Type::SPARSE_UNION, UnionConverter) - SIMPLE_CONVERTER_CASE(Type::DENSE_UNION, UnionConverter) - SIMPLE_CONVERTER_CASE(Type::INTERVAL_MONTHS, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::INTERVAL_DAY_TIME, DayTimeIntervalConverter) - SIMPLE_CONVERTER_CASE(Type::INTERVAL_MONTH_DAY_NANO, MonthDayNanoIntervalConverter) - default: - return ConversionNotImplemented(type); - } - -#undef SIMPLE_CONVERTER_CASE - - RETURN_NOT_OK(res->Init()); - *out = res; - return Status::OK(); -} - -} // namespace - -Result> ArrayFromJSON(const std::shared_ptr& type, - std::string_view json_string) { - std::shared_ptr converter; - RETURN_NOT_OK(GetConverter(type, &converter)); - - rj::Document json_doc; - json_doc.Parse(json_string.data(), json_string.length()); - if (json_doc.HasParseError()) { - return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", - GetParseError_En(json_doc.GetParseError())); - } - - // The JSON document should be an array, append it - RETURN_NOT_OK(converter->AppendValues(json_doc)); - 
std::shared_ptr out; - RETURN_NOT_OK(converter->Finish(&out)); - return out; -} - -Result> ArrayFromJSON(const std::shared_ptr& type, - const std::string& json_string) { - return ArrayFromJSON(type, std::string_view(json_string)); -} - -Result> ArrayFromJSON(const std::shared_ptr& type, - const char* json_string) { - return ArrayFromJSON(type, std::string_view(json_string)); -} - -Status ChunkedArrayFromJSON(const std::shared_ptr& type, - const std::vector& json_strings, - std::shared_ptr* out) { - ArrayVector out_chunks; - out_chunks.reserve(json_strings.size()); - for (const std::string& chunk_json : json_strings) { - out_chunks.emplace_back(); - ARROW_ASSIGN_OR_RAISE(out_chunks.back(), ArrayFromJSON(type, chunk_json)); - } - *out = std::make_shared(std::move(out_chunks), type); - return Status::OK(); -} - -Status DictArrayFromJSON(const std::shared_ptr& type, - std::string_view indices_json, std::string_view dictionary_json, - std::shared_ptr* out) { - if (type->id() != Type::DICTIONARY) { - return Status::TypeError("DictArrayFromJSON requires dictionary type, got ", *type); - } - - const auto& dictionary_type = checked_cast(*type); - - ARROW_ASSIGN_OR_RAISE(auto indices, - ArrayFromJSON(dictionary_type.index_type(), indices_json)); - ARROW_ASSIGN_OR_RAISE(auto dictionary, - ArrayFromJSON(dictionary_type.value_type(), dictionary_json)); - - return DictionaryArray::FromArrays(type, std::move(indices), std::move(dictionary)) - .Value(out); -} - -Status ScalarFromJSON(const std::shared_ptr& type, std::string_view json_string, - std::shared_ptr* out) { - std::shared_ptr converter; - RETURN_NOT_OK(GetConverter(type, &converter)); - - rj::Document json_doc; - json_doc.Parse(json_string.data(), json_string.length()); - if (json_doc.HasParseError()) { - return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", - GetParseError_En(json_doc.GetParseError())); - } - - std::shared_ptr array; - RETURN_NOT_OK(converter->AppendValue(json_doc)); - 
RETURN_NOT_OK(converter->Finish(&array)); - DCHECK_EQ(array->length(), 1); - return array->GetScalar(0).Value(out); -} - -Status DictScalarFromJSON(const std::shared_ptr& type, - std::string_view index_json, std::string_view dictionary_json, - std::shared_ptr* out) { - if (type->id() != Type::DICTIONARY) { - return Status::TypeError("DictScalarFromJSON requires dictionary type, got ", *type); - } - - const auto& dictionary_type = checked_cast(*type); - - std::shared_ptr index; - std::shared_ptr dictionary; - RETURN_NOT_OK(ScalarFromJSON(dictionary_type.index_type(), index_json, &index)); - ARROW_ASSIGN_OR_RAISE(dictionary, - ArrayFromJSON(dictionary_type.value_type(), dictionary_json)); - - *out = DictionaryScalar::Make(std::move(index), std::move(dictionary)); - return Status::OK(); -} - -} // namespace json -} // namespace internal -} // namespace ipc -} // namespace arrow diff --git a/cpp/src/arrow/ipc/json_simple.h b/cpp/src/arrow/ipc/json_simple.h deleted file mode 100644 index 3a730ee6a3f..00000000000 --- a/cpp/src/arrow/ipc/json_simple.h +++ /dev/null @@ -1,71 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Implement a simple JSON representation format for arrays - -#pragma once - -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/type_fwd.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; - -namespace ipc { -namespace internal { -namespace json { - -ARROW_EXPORT -Result> ArrayFromJSON(const std::shared_ptr&, - const std::string& json); - -ARROW_EXPORT -Result> ArrayFromJSON(const std::shared_ptr&, - std::string_view json); - -ARROW_EXPORT -Result> ArrayFromJSON(const std::shared_ptr&, - const char* json); - -ARROW_EXPORT -Status ChunkedArrayFromJSON(const std::shared_ptr& type, - const std::vector& json_strings, - std::shared_ptr* out); - -ARROW_EXPORT -Status DictArrayFromJSON(const std::shared_ptr&, std::string_view indices_json, - std::string_view dictionary_json, std::shared_ptr* out); - -ARROW_EXPORT -Status ScalarFromJSON(const std::shared_ptr&, std::string_view json, - std::shared_ptr* out); - -ARROW_EXPORT -Status DictScalarFromJSON(const std::shared_ptr&, std::string_view index_json, - std::string_view dictionary_json, std::shared_ptr* out); - -} // namespace json -} // namespace internal -} // namespace ipc -} // namespace arrow diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc deleted file mode 100644 index 31312f1ac69..00000000000 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ /dev/null @@ -1,1541 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "arrow/array.h" -#include "arrow/array/builder_decimal.h" -#include "arrow/array/builder_nested.h" -#include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_time.h" -#include "arrow/chunked_array.h" -#include "arrow/ipc/json_simple.h" -#include "arrow/scalar.h" -#include "arrow/testing/builder.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bitmap_builders.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/decimal.h" -#include "arrow/util/float16.h" - -#if defined(_MSC_VER) -// "warning C4307: '+': integral constant overflow" -# pragma warning(disable : 4307) -#endif - -namespace arrow { - -using util::Float16; - -namespace ipc { -namespace internal { -namespace json { - -using ::arrow::internal::BytesToBits; -using ::arrow::internal::checked_cast; -using ::arrow::internal::checked_pointer_cast; - -using ListAndListViewTypes = - ::testing::Types; - -// Avoid undefined behaviour on signed overflow -template -Signed SafeSignedAdd(Signed u, Signed v) { - using Unsigned = typename std::make_unsigned::type; - return static_cast(static_cast(u) + static_cast(v)); -} - -// Special case for 8-bit ints (must output their decimal value, not the -// corresponding ASCII character) -void JSONArrayInternal(std::ostream* ss, int8_t value) { - *ss << static_cast(value); -} - -void JSONArrayInternal(std::ostream* ss, uint8_t value) 
{ - *ss << static_cast(value); -} - -template -void JSONArrayInternal(std::ostream* ss, Value&& value) { - *ss << value; -} - -template -void JSONArrayInternal(std::ostream* ss, Value&& value, Tail&&... tail) { - JSONArrayInternal(ss, std::forward(value)); - *ss << ", "; - JSONArrayInternal(ss, std::forward(tail)...); -} - -template -std::string JSONArray(Args&&... args) { - std::stringstream ss; - ss << "["; - JSONArrayInternal(&ss, std::forward(args)...); - ss << "]"; - return ss.str(); -} - -template -void AssertJSONArray(const std::shared_ptr& type, const std::string& json, - const std::vector& values) { - std::shared_ptr expected; - - ASSERT_OK_AND_ASSIGN(auto actual, ArrayFromJSON(type, json)); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector(type, values, &expected); - AssertArraysEqual(*expected, *actual); -} - -template -void AssertJSONArray(const std::shared_ptr& type, const std::string& json, - const std::vector& is_valid, - const std::vector& values) { - std::shared_ptr expected; - - ASSERT_OK_AND_ASSIGN(auto actual, ArrayFromJSON(type, json)); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector(type, is_valid, values, &expected); - AssertArraysEqual(*expected, *actual); -} - -void AssertJSONDictArray(const std::shared_ptr& index_type, - const std::shared_ptr& value_type, - const std::string& json, - const std::string& expected_indices_json, - const std::string& expected_values_json) { - auto type = dictionary(index_type, value_type); - - ASSERT_OK_AND_ASSIGN(auto expected_indices, - ArrayFromJSON(index_type, expected_indices_json)); - ASSERT_OK_AND_ASSIGN(auto expected_values, - ArrayFromJSON(value_type, expected_values_json)); - - ASSERT_OK_AND_ASSIGN(auto actual, ArrayFromJSON(type, json)); - ASSERT_OK(actual->ValidateFull()); - - const auto& dict_array = checked_cast(*actual); - AssertArraysEqual(*expected_indices, *dict_array.indices()); - AssertArraysEqual(*expected_values, *dict_array.dictionary()); -} - -template -void 
AssertJSONScalar(const std::shared_ptr& type, const std::string& json, - const bool is_valid, const C_TYPE value) { - SCOPED_TRACE(json); - std::shared_ptr actual, expected; - - ASSERT_OK(ScalarFromJSON(type, json, &actual)); - if (is_valid) { - ASSERT_OK_AND_ASSIGN(expected, MakeScalar(type, value)); - } else { - expected = MakeNullScalar(type); - } - AssertScalarsEqual(*expected, *actual, /*verbose=*/true); -} - -TEST(TestHelper, JSONArray) { - // Test the JSONArray helper func - std::string s = - JSONArray(123, -4.5, static_cast(-12), static_cast(34)); - ASSERT_EQ(s, "[123, -4.5, -12, 34]"); - s = JSONArray(9223372036854775807LL, 9223372036854775808ULL, -9223372036854775807LL - 1, - 18446744073709551615ULL); - ASSERT_EQ(s, - "[9223372036854775807, 9223372036854775808, -9223372036854775808, " - "18446744073709551615]"); -} - -TEST(TestHelper, SafeSignedAdd) { - ASSERT_EQ(0, SafeSignedAdd(-128, -128)); - ASSERT_EQ(1, SafeSignedAdd(-128, -127)); - ASSERT_EQ(-128, SafeSignedAdd(1, 127)); - ASSERT_EQ(-2147483648LL, SafeSignedAdd(1, 2147483647)); -} - -template -class TestIntegers : public ::testing::Test { - public: - std::shared_ptr type() { return TypeTraits::type_singleton(); } -}; - -TYPED_TEST_SUITE_P(TestIntegers); - -TYPED_TEST_P(TestIntegers, Basics) { - using T = TypeParam; - using c_type = typename T::c_type; - - std::shared_ptr expected, actual; - auto type = this->type(); - - AssertJSONArray(type, "[]", {}); - AssertJSONArray(type, "[4, 0, 5]", {4, 0, 5}); - AssertJSONArray(type, "[4, null, 5]", {true, false, true}, {4, 0, 5}); - - // Test limits - const auto min_val = std::numeric_limits::min(); - const auto max_val = std::numeric_limits::max(); - std::string json_string = JSONArray(0, 1, min_val); - AssertJSONArray(type, json_string, {0, 1, min_val}); - json_string = JSONArray(0, 1, max_val); - AssertJSONArray(type, json_string, {0, 1, max_val}); -} - -TYPED_TEST_P(TestIntegers, Errors) { - std::shared_ptr array; - auto type = this->type(); - 
ASSERT_RAISES(Invalid, ArrayFromJSON(type, "")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "0")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "{}")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0.0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"0\"]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]")); -} - -TYPED_TEST_P(TestIntegers, OutOfBounds) { - using T = TypeParam; - using c_type = typename T::c_type; - - std::shared_ptr array; - auto type = this->type(); - - if (type->id() == Type::UINT64) { - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[18446744073709551616]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-1]")); - } else if (type->id() == Type::INT64) { - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[9223372036854775808]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-9223372036854775809]")); - } else if (std::is_signed::value) { - const auto lower = SafeSignedAdd(std::numeric_limits::min(), -1); - const auto upper = SafeSignedAdd(std::numeric_limits::max(), +1); - auto json_string = JSONArray(lower); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string)); - json_string = JSONArray(upper); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string)); - } else { - const auto upper = static_cast(std::numeric_limits::max()) + 1; - auto json_string = JSONArray(upper); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, json_string)); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[-1]")); - } -} - -TYPED_TEST_P(TestIntegers, Dictionary) { - std::shared_ptr array; - std::shared_ptr value_type = this->type(); - - if (value_type->id() == Type::HALF_FLOAT) { - // Unsupported, skip - return; - } - - AssertJSONDictArray(int8(), value_type, "[1, 2, 3, null, 3, 1]", - /*indices=*/"[0, 1, 2, null, 2, 0]", - /*values=*/"[1, 2, 3]"); -} - -REGISTER_TYPED_TEST_SUITE_P(TestIntegers, Basics, Errors, OutOfBounds, Dictionary); - -INSTANTIATE_TYPED_TEST_SUITE_P(TestInt8, TestIntegers, Int8Type); 
-INSTANTIATE_TYPED_TEST_SUITE_P(TestInt16, TestIntegers, Int16Type); -INSTANTIATE_TYPED_TEST_SUITE_P(TestInt32, TestIntegers, Int32Type); -INSTANTIATE_TYPED_TEST_SUITE_P(TestInt64, TestIntegers, Int64Type); -INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt8, TestIntegers, UInt8Type); -INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt16, TestIntegers, UInt16Type); -INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt32, TestIntegers, UInt32Type); -INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt64, TestIntegers, UInt64Type); - -template -class TestStrings : public ::testing::Test { - public: - std::shared_ptr type() const { - if constexpr (is_binary_view_like_type::value) { - return T::is_utf8 ? utf8_view() : binary_view(); - } else { - return TypeTraits::type_singleton(); - } - } -}; - -TYPED_TEST_SUITE_P(TestStrings); - -TYPED_TEST_P(TestStrings, Basics) { - using T = TypeParam; - auto type = this->type(); - - std::shared_ptr expected, actual; - - AssertJSONArray(type, "[]", {}); - AssertJSONArray(type, "[\"\", \"foo\"]", {"", "foo"}); - AssertJSONArray(type, "[\"\", null]", {true, false}, {"", ""}); - // NUL character in string - std::string s = "some"; - s += '\x00'; - s += "char"; - AssertJSONArray(type, "[\"\", \"some\\u0000char\"]", {"", s}); - // UTF8 sequence in string - AssertJSONArray(type, "[\"\xc3\xa9\"]", {"\xc3\xa9"}); - - if (!T::is_utf8) { - // Arbitrary binary (non-UTF8) sequence in string - s = "\xff\x9f"; - AssertJSONArray(type, "[\"" + s + "\"]", {s}); - } - - // Bytes < 0x20 can be represented as JSON unicode escapes - s = '\x00'; - s += "\x1f"; - AssertJSONArray(type, "[\"\\u0000\\u001f\"]", {s}); -} - -TYPED_TEST_P(TestStrings, Errors) { - auto type = this->type(); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]")); -} - -TYPED_TEST_P(TestStrings, Dictionary) { - auto value_type = this->type(); - - AssertJSONDictArray(int16(), value_type, R"(["foo", "bar", null, "bar", "foo"])", - /*indices=*/"[0, 1, 
null, 1, 0]", - /*values=*/R"(["foo", "bar"])"); -} - -REGISTER_TYPED_TEST_SUITE_P(TestStrings, Basics, Errors, Dictionary); - -INSTANTIATE_TYPED_TEST_SUITE_P(TestString, TestStrings, StringType); -INSTANTIATE_TYPED_TEST_SUITE_P(TestBinary, TestStrings, BinaryType); -INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeString, TestStrings, LargeStringType); -INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeBinary, TestStrings, LargeBinaryType); -INSTANTIATE_TYPED_TEST_SUITE_P(TestStringView, TestStrings, StringViewType); -INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryView, TestStrings, BinaryViewType); - -TEST(TestNull, Basics) { - std::shared_ptr type = null(); - std::shared_ptr expected, actual; - - AssertJSONArray(type, "[]", {}); - AssertJSONArray(type, "[null, null]", {nullptr, nullptr}); -} - -TEST(TestNull, Errors) { - std::shared_ptr type = null(); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[NaN]")); -} - -TEST(TestBoolean, Basics) { - std::shared_ptr type = boolean(); - - AssertJSONArray(type, "[]", {}); - AssertJSONArray(type, "[false, true, false]", {false, true, false}); - AssertJSONArray(type, "[false, true, null]", {true, true, false}, - {false, true, false}); - // Supports integer literal casting - AssertJSONArray(type, "[0, 1, 0]", {false, true, false}); - AssertJSONArray(type, "[0, 1, null]", {true, true, false}, - {false, true, false}); -} - -TEST(TestBoolean, Errors) { - std::shared_ptr type = boolean(); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0.0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"true\"]")); -} - -TEST(TestFloat, Basics) { - std::shared_ptr type = float32(); - std::shared_ptr expected, actual; - - AssertJSONArray(type, "[]", {}); - AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0f, 2.5f, -3.0e4f}); - AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, - {-0.0f, 
INFINITY, -INFINITY, 0.0f}); - - // Check NaN separately as AssertArraysEqual simply memcmp's array contents - // and NaNs can have many bit representations. - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[NaN]")); - ASSERT_OK(actual->ValidateFull()); - float value = checked_cast(*actual).Value(0); - ASSERT_TRUE(std::isnan(value)); -} - -TEST(TestFloat, Errors) { - std::shared_ptr type = float32(); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[true]")); -} - -TEST(TestDouble, Basics) { - std::shared_ptr type = float64(); - std::shared_ptr expected, actual; - - AssertJSONArray(type, "[]", {}); - AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0, 2.5, -3.0e4}); - AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, - {-0.0, INFINITY, -INFINITY, 0.0}); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[NaN]")); - ASSERT_OK(actual->ValidateFull()); - double value = checked_cast(*actual).Value(0); - ASSERT_TRUE(std::isnan(value)); -} - -TEST(TestDouble, Errors) { - std::shared_ptr type = float64(); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[true]")); -} - -TEST(TestTimestamp, Basics) { - // Timestamp type - auto type = timestamp(TimeUnit::SECOND); - AssertJSONArray( - type, R"(["1970-01-01","2000-02-29","3989-07-14","1900-02-28"])", - {0, 951782400, 63730281600LL, -2203977600LL}); - - type = timestamp(TimeUnit::NANO); - AssertJSONArray( - type, R"(["1970-01-01","2000-02-29","1900-02-28"])", - {0, 951782400000000000LL, -2203977600000000000LL}); -} - -TEST(TestDate, Basics) { - auto type = date32(); - AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); - type = date64(); - AssertJSONArray(type, R"([86400000, null, 172800000])", {true, false, true}, - {86400000, 0, 172800000}); -} - -TEST(TestTime, Basics) { - auto type = time32(TimeUnit::SECOND); - AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); - type = time32(TimeUnit::MILLI); - 
AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); - - type = time64(TimeUnit::MICRO); - AssertJSONArray(type, R"([1, null, 9999999999])", {true, false, true}, - {1, 0, 9999999999LL}); - type = time64(TimeUnit::NANO); - AssertJSONArray(type, R"([1, null, 9999999999999])", {true, false, true}, - {1, 0, 9999999999999LL}); -} - -TEST(TestDuration, Basics) { - auto type = duration(TimeUnit::SECOND); - AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", - {false, true, true}, - {0, -7777777777777LL, 9999999999999LL}); - type = duration(TimeUnit::MILLI); - AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", - {false, true, true}, - {0, -7777777777777LL, 9999999999999LL}); - type = duration(TimeUnit::MICRO); - AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", - {false, true, true}, - {0, -7777777777777LL, 9999999999999LL}); - type = duration(TimeUnit::NANO); - AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", - {false, true, true}, - {0, -7777777777777LL, 9999999999999LL}); -} - -TEST(TestMonthInterval, Basics) { - auto type = month_interval(); - AssertJSONArray(type, R"([123, -456, null])", {true, true, false}, - {123, -456, 0}); -} - -TEST(TestDayTimeInterval, Basics) { - auto type = day_time_interval(); - AssertJSONArray(type, R"([[1, -600], null])", {true, false}, - {{1, -600}, {}}); -} - -TEST(MonthDayNanoInterval, Basics) { - auto type = month_day_nano_interval(); - AssertJSONArray(type, R"([[1, -600, 5000], null])", - {true, false}, {{1, -600, 5000}, {}}); -} - -TEST(TestFixedSizeBinary, Basics) { - std::shared_ptr type = fixed_size_binary(3); - std::shared_ptr expected, actual; - - AssertJSONArray(type, "[]", {}); - AssertJSONArray(type, "[\"foo\", \"bar\"]", - {"foo", "bar"}); - AssertJSONArray(type, "[null, \"foo\"]", - {false, true}, {"", "foo"}); - // Arbitrary binary (non-UTF8) sequence in string - std::string s = "\xff\x9f\xcc"; - AssertJSONArray(type, "[\"" + s + "\"]", 
{s}); -} - -TEST(TestFixedSizeBinary, Errors) { - std::shared_ptr type = fixed_size_binary(3); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]")); - // Invalid length - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"abcd\"]")); -} - -TEST(TestFixedSizeBinary, Dictionary) { - std::shared_ptr type = fixed_size_binary(3); - - AssertJSONDictArray(int8(), type, R"(["foo", "bar", "foo", null])", - /*indices=*/"[0, 1, 0, null]", - /*values=*/R"(["foo", "bar"])"); - - // Invalid length - std::shared_ptr array; - ASSERT_RAISES(Invalid, ArrayFromJSON(dictionary(int8(), type), R"(["x"])")); -} - -template -void TestDecimalBasic(std::shared_ptr type) { - std::shared_ptr expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - { - DecimalBuilder builder(type); - ASSERT_OK(builder.Finish(&expected)); - } - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[\"123.4567\", \"-78.9000\"]")); - ASSERT_OK(actual->ValidateFull()); - { - DecimalBuilder builder(type); - ASSERT_OK(builder.Append(DecimalValue(1234567))); - ASSERT_OK(builder.Append(DecimalValue(-789000))); - ASSERT_OK(builder.Finish(&expected)); - } - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[\"123.4567\", null]")); - ASSERT_OK(actual->ValidateFull()); - { - DecimalBuilder builder(type); - ASSERT_OK(builder.Append(DecimalValue(1234567))); - ASSERT_OK(builder.AppendNull()); - ASSERT_OK(builder.Finish(&expected)); - } - AssertArraysEqual(*expected, *actual); -} - -TEST(TestDecimal32, Basics) { - TestDecimalBasic(decimal32(8, 4)); -} - -TEST(TestDecimal64, Basics) { - TestDecimalBasic(decimal64(10, 4)); -} - -TEST(TestDecimal128, Basics) { - TestDecimalBasic(decimal128(10, 4)); -} - -TEST(TestDecimal256, Basics) { - 
TestDecimalBasic(decimal256(10, 4)); -} - -TEST(TestDecimal, Errors) { - for (std::shared_ptr type : - {decimal32(8, 4), decimal64(10, 4), decimal128(10, 4), decimal256(10, 4)}) { - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[12.3456]")); - // Bad scale - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"12.345\"]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"12.34560\"]")); - } -} - -TEST(TestDecimal, Dictionary) { - for (std::shared_ptr type : - {decimal32(8, 2), decimal64(10, 2), decimal128(10, 2), decimal256(10, 2)}) { - AssertJSONDictArray(int32(), type, - R"(["123.45", "-78.90", "-78.90", null, "123.45"])", - /*indices=*/"[0, 1, 1, null, 0]", - /*values=*/R"(["123.45", "-78.90"])"); - } -} - -template -class TestVarLengthListArray : public ::testing::Test { - public: - using TypeClass = T; - using offset_type = typename TypeClass::offset_type; - using ArrayType = typename TypeTraits::ArrayType; - using BuilderType = typename TypeTraits::BuilderType; - using OffsetType = typename TypeTraits::OffsetType; - - static constexpr bool is_list_view_type = is_list_view(TypeClass::type_id); - - void TestIntegerList() { - auto pool = default_memory_pool(); - std::shared_ptr type = std::make_shared(int64()); - std::shared_ptr offsets, sizes, values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - ArrayFromVector({}, &values); - if constexpr (is_list_view_type) { - ArrayFromVector({}, &sizes); - ASSERT_OK_AND_ASSIGN(expected, - ArrayType::FromArrays(*offsets, *sizes, *values, pool)); - } else { - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); - } - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 2, 2, 3}, &offsets); - 
ArrayFromVector({4, 5, 6}, &values); - if constexpr (is_list_view_type) { - ArrayFromVector({2, 0, 1}, &sizes); - ASSERT_OK_AND_ASSIGN(expected, - ArrayType::FromArrays(*offsets, *sizes, *values, pool)); - } else { - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); - } - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - if constexpr (is_list_view_type) { - ArrayFromVector({0, 1, 2}, &sizes); - ASSERT_OK_AND_ASSIGN(expected, - ArrayType::FromArrays(*offsets, *sizes, *values, pool)); - } else { - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); - } - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append(true, 0)); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); - } - AssertArraysEqual(*expected, *actual); - } - - void TestIntegerListErrors() { - std::shared_ptr type = std::make_shared(int64()); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); - } - - void TestNullList() { - auto pool = default_memory_pool(); - std::shared_ptr type = std::make_shared(null()); - std::shared_ptr offsets, sizes, values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - values = 
std::make_shared(0); - if constexpr (is_list_view_type) { - ArrayFromVector({}, &sizes); - ASSERT_OK_AND_ASSIGN(expected, - ArrayType::FromArrays(*offsets, *sizes, *values, pool)); - } else { - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); - } - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - values = std::make_shared(3); - if constexpr (is_list_view_type) { - ArrayFromVector({0, 1, 2}, &sizes); - ASSERT_OK_AND_ASSIGN(expected, - ArrayType::FromArrays(*offsets, *sizes, *values, pool)); - } else { - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); - } - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append(true, 0)); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); - } - AssertArraysEqual(*expected, *actual); - } - - void TestIntegerListList() { - auto pool = default_memory_pool(); - std::shared_ptr type = - std::make_shared(std::make_shared(uint8())); - std::shared_ptr offsets, sizes, values, nested, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - if constexpr (is_list_view_type) { - ArrayFromVector({1, 2, 3}, &sizes); - ASSERT_OK_AND_ASSIGN(nested, - ArrayType::FromArrays(*offsets, *sizes, *values, pool)); - } else { - ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); - } - ArrayFromVector({0, 2, 3}, 
&offsets); - if constexpr (is_list_view_type) { - ArrayFromVector({2, 1}, &sizes); - ASSERT_OK_AND_ASSIGN(expected, - ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); - } else { - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); - } - ASSERT_EQ(actual->length(), 2); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN( - actual, ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - if constexpr (is_list_view_type) { - ArrayFromVector({0, 1, 0, 2, 3}, &sizes); - ASSERT_OK_AND_ASSIGN(nested, - ArrayType::FromArrays(*offsets, *sizes, *values, pool)); - } else { - ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); - } - ArrayFromVector({0, 0, 1, 4, 5}, &offsets); - if constexpr (is_list_view_type) { - ArrayFromVector({0, 1, 3, 1}, &sizes); - ASSERT_OK_AND_ASSIGN(expected, - ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); - } else { - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); - } - ASSERT_EQ(actual->length(), 4); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - auto& child_builder = checked_cast(*list_builder.value_builder()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append(true, 0)); - ASSERT_OK(child_builder.AppendNull()); - ASSERT_OK(list_builder.Append(true, 0)); - ASSERT_OK(child_builder.Append(true, 0)); - ASSERT_OK(list_builder.Finish(&expected)); - } - } -}; - -TYPED_TEST_SUITE(TestVarLengthListArray, ListAndListViewTypes); - -TYPED_TEST(TestVarLengthListArray, IntegerList) { this->TestIntegerList(); } - 
-TYPED_TEST(TestVarLengthListArray, IntegerListErrors) { this->TestIntegerListErrors(); } - -TYPED_TEST(TestVarLengthListArray, NullList) { this->TestNullList(); } - -TYPED_TEST(TestVarLengthListArray, IntegerListList) { this->TestIntegerListList(); } - -TEST(TestMap, IntegerToInteger) { - auto type = map(int16(), int16()); - std::shared_ptr expected, actual; - - const char* input = R"( -[ - [[0, 1], [1, 1], [2, 2], [3, 3], [4, 5], [5, 8]], - null, - [[0, null], [1, null], [2, 0], [3, 1], [4, null], [5, 2]], - [] - ] -)"; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, input)); - - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder)); - auto& map_builder = checked_cast(*builder); - auto& key_builder = checked_cast(*map_builder.key_builder()); - auto& item_builder = checked_cast(*map_builder.item_builder()); - - ASSERT_OK(map_builder.Append()); - ASSERT_OK(key_builder.AppendValues({0, 1, 2, 3, 4, 5})); - ASSERT_OK(item_builder.AppendValues({1, 1, 2, 3, 5, 8})); - ASSERT_OK(map_builder.AppendNull()); - ASSERT_OK(map_builder.Append()); - ASSERT_OK(key_builder.AppendValues({0, 1, 2, 3, 4, 5})); - ASSERT_OK(item_builder.AppendValues({-1, -1, 0, 1, -1, 2}, {0, 0, 1, 1, 0, 1})); - ASSERT_OK(map_builder.Append()); - ASSERT_OK(map_builder.Finish(&expected)); - - ASSERT_ARRAYS_EQUAL(*actual, *expected); -} - -TEST(TestMap, StringToInteger) { - auto type = map(utf8(), int32()); - const char* input = R"( -[ - [["joe", 0], ["mark", null]], - null, - [["cap", 8]], - [] - ] -)"; - ASSERT_OK_AND_ASSIGN(auto actual, ArrayFromJSON(type, input)); - std::vector offsets = {0, 2, 2, 3, 3}; - ASSERT_OK_AND_ASSIGN(auto expected_keys, - ArrayFromJSON(utf8(), R"(["joe", "mark", "cap"])")); - ASSERT_OK_AND_ASSIGN(auto expected_values, ArrayFromJSON(int32(), "[0, null, 8]")); - ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, - BytesToBits(std::vector({1, 0, 1, 1}))); - auto expected = - std::make_shared(type, 4, Buffer::Wrap(offsets), expected_keys, - 
expected_values, expected_null_bitmap, 1); - ASSERT_ARRAYS_EQUAL(*actual, *expected); -} - -TEST(TestMap, Errors) { - auto type = map(int16(), int16()); - std::shared_ptr array; - - // list of pairs isn't an array - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - // pair isn't an array - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[null]]")); - // pair with length != 2 - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[[0]]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[[0, 1, 2]]]")); - // null key - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[[null, 0]]]")); - // key or value fails to convert - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[[0.0, 0]]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[[0, 0.0]]]")); -} - -TEST(TestMap, IntegerMapToStringList) { - auto type = map(map(int16(), int16()), list(utf8())); - std::shared_ptr expected, actual; - - const char* input = R"( -[ - [ - [ - [], - [null, "empty"] - ], - [ - [[0, 1]], - null - ], - [ - [[0, 0], [1, 1]], - ["bootstrapping tautology?", "lispy", null, "i can see eternity"] - ] - ], - null - ] -)"; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, input)); - - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder)); - auto& map_builder = checked_cast(*builder); - auto& key_builder = checked_cast(*map_builder.key_builder()); - auto& key_key_builder = checked_cast(*key_builder.key_builder()); - auto& key_item_builder = checked_cast(*key_builder.item_builder()); - auto& item_builder = checked_cast(*map_builder.item_builder()); - auto& item_value_builder = checked_cast(*item_builder.value_builder()); - - ASSERT_OK(map_builder.Append()); - ASSERT_OK(key_builder.Append()); - ASSERT_OK(item_builder.Append()); - ASSERT_OK(item_value_builder.AppendNull()); - ASSERT_OK(item_value_builder.Append("empty")); - - ASSERT_OK(key_builder.Append()); - ASSERT_OK(item_builder.AppendNull()); - 
ASSERT_OK(key_key_builder.AppendValues({0})); - ASSERT_OK(key_item_builder.AppendValues({1})); - - ASSERT_OK(key_builder.Append()); - ASSERT_OK(item_builder.Append()); - ASSERT_OK(key_key_builder.AppendValues({0, 1})); - ASSERT_OK(key_item_builder.AppendValues({0, 1})); - ASSERT_OK(item_value_builder.Append("bootstrapping tautology?")); - ASSERT_OK(item_value_builder.Append("lispy")); - ASSERT_OK(item_value_builder.AppendNull()); - ASSERT_OK(item_value_builder.Append("i can see eternity")); - - ASSERT_OK(map_builder.AppendNull()); - - ASSERT_OK(map_builder.Finish(&expected)); - ASSERT_ARRAYS_EQUAL(*actual, *expected); -} - -TEST(TestFixedSizeList, IntegerList) { - auto pool = default_memory_pool(); - auto type = fixed_size_list(int64(), 2); - std::shared_ptr values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({}, &values); - expected = std::make_shared(type, 0, values); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [0, 0], [6, 7]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({4, 5, 0, 0, 6, 7}, &values); - expected = std::make_shared(type, 3, values); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, - ArrayFromJSON(type, "[[null, null], [0, null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - auto is_valid = std::vector{false, false, true, false, true, false}; - ArrayFromVector(is_valid, {0, 0, 0, 0, 6, 0}, &values); - expected = std::make_shared(type, 3, values); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null, null], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - auto value_builder = checked_cast(list_builder.value_builder()); - ASSERT_OK(list_builder.AppendNull()); - 
ASSERT_OK(list_builder.Append()); - ASSERT_OK(value_builder->AppendNull()); - ASSERT_OK(value_builder->AppendNull()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); - } - AssertArraysEqual(*expected, *actual); -} - -TEST(TestFixedSizeList, IntegerListErrors) { - std::shared_ptr type = fixed_size_list(int64(), 2); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0, 1.0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808, 0]]")); -} - -TEST(TestFixedSizeList, NullList) { - auto pool = default_memory_pool(); - std::shared_ptr type = fixed_size_list(null(), 2); - std::shared_ptr values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - values = std::make_shared(0); - expected = std::make_shared(type, 0, values); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, - ArrayFromJSON(type, "[[null, null], [null, null], [null, null]]")); - ASSERT_OK(actual->ValidateFull()); - values = std::make_shared(6); - expected = std::make_shared(type, 3, values); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null, null], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - auto value_builder = checked_cast(list_builder.value_builder()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(value_builder->AppendNull()); - ASSERT_OK(value_builder->AppendNull()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); - } - AssertArraysEqual(*expected, *actual); -} - -TEST(TestFixedSizeList, IntegerListList) { - auto pool = default_memory_pool(); - auto nested_type = 
fixed_size_list(uint8(), 2); - std::shared_ptr type = fixed_size_list(nested_type, 1); - std::shared_ptr values, nested, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[1, 4]], [[2, 5]], [[3, 6]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({1, 4, 2, 5, 3, 6}, &values); - nested = std::make_shared(nested_type, 3, values); - expected = std::make_shared(type, 3, nested); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[1, null]], [null], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - auto nested_builder = - checked_cast(list_builder.value_builder()); - auto value_builder = checked_cast(nested_builder->value_builder()); - - ASSERT_OK(list_builder.Append()); - ASSERT_OK(nested_builder->Append()); - ASSERT_OK(value_builder->Append(1)); - ASSERT_OK(value_builder->AppendNull()); - - ASSERT_OK(list_builder.Append()); - ASSERT_OK(nested_builder->AppendNull()); - - ASSERT_OK(list_builder.AppendNull()); - - ASSERT_OK(list_builder.Finish(&expected)); - } - AssertArraysEqual(*expected, *actual); -} - -TEST(TestStruct, SimpleStruct) { - auto field_a = field("a", int8()); - auto field_b = field("b", boolean()); - std::shared_ptr type = struct_({field_a, field_b}); - std::shared_ptr a, b, expected, actual; - std::shared_ptr null_bitmap; - std::vector is_valid; - std::vector> children; - - // Trivial - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({}, &a); - ArrayFromVector({}, &b); - children.assign({a, b}); - expected = std::make_shared(type, 0, children); - AssertArraysEqual(*expected, *actual); - - // Non-empty - ArrayFromVector({5, 6}, &a); - ArrayFromVector({true, false}, &b); - children.assign({a, b}); - expected = std::make_shared(type, 2, children); - - ASSERT_OK_AND_ASSIGN(actual, 
ArrayFromJSON(type, "[[5, true], [6, false]]")); - ASSERT_OK(actual->ValidateFull()); - AssertArraysEqual(*expected, *actual); - ASSERT_OK_AND_ASSIGN( - actual, ArrayFromJSON(type, "[{\"a\": 5, \"b\": true}, {\"b\": false, \"a\": 6}]")); - ASSERT_OK(actual->ValidateFull()); - AssertArraysEqual(*expected, *actual); - - // With nulls - is_valid = {false, true, false, false}; - ArrayFromVector(is_valid, {0, 5, 6, 0}, &a); - is_valid = {false, false, true, false}; - ArrayFromVector(is_valid, {false, true, false, false}, &b); - children.assign({a, b}); - BitmapFromVector({false, true, true, true}, &null_bitmap); - expected = std::make_shared(type, 4, children, null_bitmap, 1); - - ASSERT_OK_AND_ASSIGN( - actual, ArrayFromJSON(type, "[null, [5, null], [null, false], [null, null]]")); - ASSERT_OK(actual->ValidateFull()); - AssertArraysEqual(*expected, *actual); - // When using object notation, null members can be omitted - ASSERT_OK_AND_ASSIGN( - actual, ArrayFromJSON(type, "[null, {\"a\": 5, \"b\": null}, {\"b\": false}, {}]")); - ASSERT_OK(actual->ValidateFull()); - AssertArraysEqual(*expected, *actual); -} - -TEST(TestStruct, NestedStruct) { - auto field_a = field("a", int8()); - auto field_b = field("b", boolean()); - auto field_c = field("c", float64()); - std::shared_ptr nested_type = struct_({field_a, field_b}); - auto field_nested = field("nested", nested_type); - std::shared_ptr type = struct_({field_nested, field_c}); - std::shared_ptr expected, actual; - std::shared_ptr null_bitmap; - std::vector is_valid; - std::vector> children(2); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({}, &children[0]); - ArrayFromVector({}, &children[1]); - children[0] = std::make_shared(nested_type, 0, children); - ArrayFromVector({}, &children[1]); - expected = std::make_shared(type, 0, children); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, - ArrayFromJSON(type, "[[[5, true], 1.5], 
[[6, false], -3e2]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({5, 6}, &children[0]); - ArrayFromVector({true, false}, &children[1]); - children[0] = std::make_shared(nested_type, 2, children); - ArrayFromVector({1.5, -300.0}, &children[1]); - expected = std::make_shared(type, 2, children); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, - ArrayFromJSON(type, "[null, [[5, null], null], [null, -3e2]]")); - ASSERT_OK(actual->ValidateFull()); - is_valid = {false, true, false}; - ArrayFromVector(is_valid, {0, 5, 0}, &children[0]); - is_valid = {false, false, false}; - ArrayFromVector(is_valid, {false, false, false}, &children[1]); - BitmapFromVector({false, true, false}, &null_bitmap); - children[0] = std::make_shared(nested_type, 3, children, null_bitmap, 2); - is_valid = {false, false, true}; - ArrayFromVector(is_valid, {0.0, 0.0, -300.0}, &children[1]); - BitmapFromVector({false, true, true}, &null_bitmap); - expected = std::make_shared(type, 3, children, null_bitmap, 1); - AssertArraysEqual(*expected, *actual); -} - -TEST(TestStruct, Errors) { - auto field_a = field("a", int8()); - auto field_b = field("b", boolean()); - std::shared_ptr type = struct_({field_a, field_b}); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0, true]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, true, 1]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[true, 0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"b\": 0, \"a\": true}]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]")); -} - -TEST(TestDenseUnion, Basics) { - auto field_a = field("a", int8()); - auto field_b = field("b", boolean()); - - auto type = dense_union({field_a, field_b}, {4, 8}); - ASSERT_OK_AND_ASSIGN( - auto array_parsed, - ArrayFromJSON(type, "[null, [4, 122], [8, true], [4, null], null, [8, false]]")); - auto array = checked_pointer_cast(array_parsed); - - 
ASSERT_OK_AND_ASSIGN(auto expected_types, ArrayFromJSON(int8(), "[4, 4, 8, 4, 4, 8]")); - ASSERT_OK_AND_ASSIGN(auto expected_offsets, - ArrayFromJSON(int32(), "[0, 1, 0, 2, 3, 1]")); - ASSERT_OK_AND_ASSIGN(auto expected_a, ArrayFromJSON(int8(), "[null, 122, null, null]")); - ASSERT_OK_AND_ASSIGN(auto expected_b, ArrayFromJSON(boolean(), "[true, false]")); - - ASSERT_OK_AND_ASSIGN( - auto expected, DenseUnionArray::Make(*expected_types, *expected_offsets, - {expected_a, expected_b}, {"a", "b"}, {4, 8})); - - ASSERT_ARRAYS_EQUAL(*expected, *array); - - // ensure that the array is as dense as we expect - ASSERT_TRUE(array->value_offsets()->Equals(*expected_offsets->data()->buffers[1])); - ASSERT_ARRAYS_EQUAL(*expected_a, *array->field(0)); - ASSERT_ARRAYS_EQUAL(*expected_b, *array->field(1)); -} - -TEST(TestSparseUnion, Basics) { - auto field_a = field("a", int8()); - auto field_b = field("b", boolean()); - - auto type = sparse_union({field_a, field_b}, {4, 8}); - ASSERT_OK_AND_ASSIGN( - auto array, - ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]")); - - ASSERT_OK_AND_ASSIGN(auto expected_types, ArrayFromJSON(int8(), "[4, 8, 4, 4, 8]")); - ASSERT_OK_AND_ASSIGN(auto expected_a, - ArrayFromJSON(int8(), "[122, null, null, null, null]")); - ASSERT_OK_AND_ASSIGN(auto expected_b, - ArrayFromJSON(boolean(), "[null, true, null, null, false]")); - - ASSERT_OK_AND_ASSIGN(auto expected, - SparseUnionArray::Make(*expected_types, {expected_a, expected_b}, - {"a", "b"}, {4, 8})); - - ASSERT_ARRAYS_EQUAL(*expected, *array); -} - -TEST(TestDenseUnion, ListOfUnion) { - auto field_a = field("a", int8()); - auto field_b = field("b", boolean()); - auto union_type = dense_union({field_a, field_b}, {4, 8}); - auto list_type = list(union_type); - ASSERT_OK_AND_ASSIGN(auto parsed_array, ArrayFromJSON(list_type, - "[" - "[[4, 122], [8, true]]," - "[[4, null], null, [8, false]]" - "]")); - auto array = checked_pointer_cast(parsed_array); - - ASSERT_OK_AND_ASSIGN(auto 
expected_types, ArrayFromJSON(int8(), "[4, 8, 4, 4, 8]")); - ASSERT_OK_AND_ASSIGN(auto expected_offsets, ArrayFromJSON(int32(), "[0, 0, 1, 2, 1]")); - ASSERT_OK_AND_ASSIGN(auto expected_a, ArrayFromJSON(int8(), "[122, null, null]")); - ASSERT_OK_AND_ASSIGN(auto expected_b, ArrayFromJSON(boolean(), "[true, false]")); - - ASSERT_OK_AND_ASSIGN( - auto expected_values, - DenseUnionArray::Make(*expected_types, *expected_offsets, {expected_a, expected_b}, - {"a", "b"}, {4, 8})); - ASSERT_OK_AND_ASSIGN(auto expected_list_offsets, ArrayFromJSON(int32(), "[0, 2, 5]")); - ASSERT_OK_AND_ASSIGN(auto expected, - ListArray::FromArrays(*expected_list_offsets, *expected_values)); - - ASSERT_ARRAYS_EQUAL(*expected, *array); - - // ensure that the array is as dense as we expect - auto array_values = checked_pointer_cast(array->values()); - ASSERT_TRUE(array_values->value_offsets()->Equals( - *checked_pointer_cast(expected_values)->value_offsets())); - ASSERT_ARRAYS_EQUAL(*expected_a, *array_values->field(0)); - ASSERT_ARRAYS_EQUAL(*expected_b, *array_values->field(1)); -} - -TEST(TestSparseUnion, ListOfUnion) { - auto field_a = field("a", int8()); - auto field_b = field("b", boolean()); - auto union_type = sparse_union({field_a, field_b}, {4, 8}); - auto list_type = list(union_type); - ASSERT_OK_AND_ASSIGN(auto array, ArrayFromJSON(list_type, - "[" - "[[4, 122], [8, true]]," - "[[4, null], null, [8, false]]" - "]")); - - ASSERT_OK_AND_ASSIGN(auto expected_types, ArrayFromJSON(int8(), "[4, 8, 4, 4, 8]")); - ASSERT_OK_AND_ASSIGN(auto expected_a, - ArrayFromJSON(int8(), "[122, null, null, null, null]")); - ASSERT_OK_AND_ASSIGN(auto expected_b, - ArrayFromJSON(boolean(), "[null, true, null, null, false]")); - - ASSERT_OK_AND_ASSIGN(auto expected_values, - SparseUnionArray::Make(*expected_types, {expected_a, expected_b}, - {"a", "b"}, {4, 8})); - ASSERT_OK_AND_ASSIGN(auto expected_list_offsets, ArrayFromJSON(int32(), "[0, 2, 5]")); - ASSERT_OK_AND_ASSIGN(auto expected, - 
ListArray::FromArrays(*expected_list_offsets, *expected_values)); - - ASSERT_ARRAYS_EQUAL(*expected, *array); -} - -TEST(TestDenseUnion, UnionOfStructs) { - std::vector> fields = { - field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})), - field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), - field("foxtrot", list(int8()))})), - field("q", struct_({field("quebec", utf8())}))}; - auto type = dense_union(fields, {0, 23, 47}); - ASSERT_OK_AND_ASSIGN(auto array_parsed, - ArrayFromJSON(type, R"([[0, {"alpha": 0.0, "bravo": "charlie"}], - [23, {"whiskey": 99}], - [0, {"bravo": "mike"}], - null, - [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}] - ])")); - auto array = checked_pointer_cast(array_parsed); - - ASSERT_OK_AND_ASSIGN(auto expected_types, ArrayFromJSON(int8(), "[0, 23, 0, 0, 23]")); - ASSERT_OK_AND_ASSIGN(auto expected_offsets, ArrayFromJSON(int32(), "[0, 0, 1, 2, 1]")); - ASSERT_OK_AND_ASSIGN(auto expected_fields_0, ArrayFromJSON(fields[0]->type(), R"([ - {"alpha": 0.0, "bravo": "charlie"}, - {"bravo": "mike"}, - null - ])")); - ASSERT_OK_AND_ASSIGN(auto expected_fields_1, ArrayFromJSON(fields[1]->type(), R"([ - {"whiskey": 99}, - {"tango": 8.25, "foxtrot": [0, 2, 3]} - ])")); - ASSERT_OK_AND_ASSIGN(auto expected_fields_2, ArrayFromJSON(fields[2]->type(), "[]")); - ArrayVector expected_fields = {expected_fields_0, expected_fields_1, expected_fields_2}; - - ASSERT_OK_AND_ASSIGN( - auto expected, - DenseUnionArray::Make(*expected_types, *expected_offsets, expected_fields, - {"ab", "wtf", "q"}, {0, 23, 47})); - - ASSERT_ARRAYS_EQUAL(*expected, *array); - - // ensure that the array is as dense as we expect - ASSERT_TRUE(array->value_offsets()->Equals(*expected_offsets->data()->buffers[1])); - for (int i = 0; i < type->num_fields(); ++i) { - ASSERT_ARRAYS_EQUAL(*checked_cast(*expected).field(i), - *array->field(i)); - } -} - -TEST(TestSparseUnion, UnionOfStructs) { - std::vector> fields = { - field("ab", struct_({field("alpha", 
float64()), field("bravo", utf8())})), - field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), - field("foxtrot", list(int8()))})), - field("q", struct_({field("quebec", utf8())}))}; - auto type = sparse_union(fields, {0, 23, 47}); - ASSERT_OK_AND_ASSIGN(auto array, ArrayFromJSON(type, R"([ - [0, {"alpha": 0.0, "bravo": "charlie"}], - [23, {"whiskey": 99}], - [0, {"bravo": "mike"}], - null, - [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}] - ])")); - - ASSERT_OK_AND_ASSIGN(auto expected_types, ArrayFromJSON(int8(), "[0, 23, 0, 0, 23]")); - ASSERT_OK_AND_ASSIGN(auto expected_fields_0, ArrayFromJSON(fields[0]->type(), R"([ - {"alpha": 0.0, "bravo": "charlie"}, - null, - {"bravo": "mike"}, - null, - null - ])")); - ASSERT_OK_AND_ASSIGN(auto expected_fields_1, ArrayFromJSON(fields[1]->type(), R"([ - null, - {"whiskey": 99}, - null, - null, - {"tango": 8.25, "foxtrot": [0, 2, 3]} - ])")); - ASSERT_OK_AND_ASSIGN(auto expected_fields_2, - ArrayFromJSON(fields[2]->type(), "[null, null, null, null, null]")) - ArrayVector expected_fields = {expected_fields_0, expected_fields_1, expected_fields_2}; - - ASSERT_OK_AND_ASSIGN(auto expected, - SparseUnionArray::Make(*expected_types, expected_fields, - {"ab", "wtf", "q"}, {0, 23, 47})); - - ASSERT_ARRAYS_EQUAL(*expected, *array); -} - -TEST(TestDenseUnion, Errors) { - auto field_a = field("a", int8()); - auto field_b = field("b", boolean()); - std::shared_ptr type = dense_union({field_a, field_b}, {4, 8}); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a valid type_id\"]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 99]]")); // 0 is not one of {4, 8} - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]")); // "" is not a valid int8() - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a pair\"]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]")); -} - -TEST(TestSparseUnion, Errors) { - auto 
field_a = field("a", int8()); - auto field_b = field("b", boolean()); - std::shared_ptr type = sparse_union({field_a, field_b}, {4, 8}); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a valid type_id\"]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 99]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]")); - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a pair\"]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]")); -} - -TEST(TestNestedDictionary, ListOfDict) { - auto index_type = int8(); - auto value_type = utf8(); - auto dict_type = dictionary(index_type, value_type); - auto type = list(dict_type); - - std::shared_ptr array, expected, indices, values, dicts, offsets; - - ASSERT_OK_AND_ASSIGN( - array, ArrayFromJSON(type, R"([["ab", "cd", null], null, ["cd", "cd"]])")); - ASSERT_OK(array->ValidateFull()); - - // Build expected array - ASSERT_OK_AND_ASSIGN(indices, ArrayFromJSON(index_type, "[0, 1, null, 1, 1]")); - ASSERT_OK_AND_ASSIGN(values, ArrayFromJSON(value_type, R"(["ab", "cd"])")); - ASSERT_OK_AND_ASSIGN(dicts, DictionaryArray::FromArrays(dict_type, indices, values)); - ASSERT_OK_AND_ASSIGN(offsets, ArrayFromJSON(int32(), "[0, null, 3, 5]")); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *dicts)); - - AssertArraysEqual(*expected, *array, /*verbose=*/true); -} - -TEST(TestDictArrayFromJSON, Basics) { - auto type = dictionary(int32(), utf8()); - auto array = - DictArrayFromJSON(type, "[null, 2, 1, 0]", R"(["whiskey", "tango", "foxtrot"])"); - - ASSERT_OK_AND_ASSIGN(auto expected_indices, ArrayFromJSON(int32(), "[null, 2, 1, 0]")); - ASSERT_OK_AND_ASSIGN(auto expected_dictionary, - ArrayFromJSON(utf8(), R"(["whiskey", "tango", "foxtrot"])")); - - ASSERT_ARRAYS_EQUAL(DictionaryArray(type, expected_indices, expected_dictionary), - *array); -} - -TEST(TestDictArrayFromJSON, Errors) { - auto type = 
dictionary(int32(), utf8()); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, - DictArrayFromJSON(type, "[\"not a valid index\"]", "[\"\"]", &array)); - ASSERT_RAISES(Invalid, DictArrayFromJSON(type, "[0, 1]", "[1]", - &array)); // dict value isn't string -} - -TEST(TestChunkedArrayFromJSON, Basics) { - auto type = int32(); - std::shared_ptr chunked_array; - ASSERT_OK(ChunkedArrayFromJSON(type, {}, &chunked_array)); - ASSERT_OK(chunked_array->ValidateFull()); - ASSERT_EQ(chunked_array->num_chunks(), 0); - AssertTypeEqual(type, chunked_array->type()); - - ASSERT_OK(ChunkedArrayFromJSON(type, {"[1, 2]", "[3, null, 4]"}, &chunked_array)); - ASSERT_OK(chunked_array->ValidateFull()); - ASSERT_EQ(chunked_array->num_chunks(), 2); - std::shared_ptr expected_chunk; - ASSERT_OK_AND_ASSIGN(expected_chunk, ArrayFromJSON(type, "[1, 2]")); - AssertArraysEqual(*expected_chunk, *chunked_array->chunk(0), /*verbose=*/true); - ASSERT_OK_AND_ASSIGN(expected_chunk, ArrayFromJSON(type, "[3, null, 4]")); - AssertArraysEqual(*expected_chunk, *chunked_array->chunk(1), /*verbose=*/true); -} - -TEST(TestScalarFromJSON, Basics) { - // Sanity check for common types (not exhaustive) - std::shared_ptr scalar; - AssertJSONScalar(int64(), "4", true, 4); - AssertJSONScalar(int64(), "null", false, 0); - AssertJSONScalar>(utf8(), R"("")", true, - Buffer::FromString("")); - AssertJSONScalar>(utf8(), R"("foo")", true, - Buffer::FromString("foo")); - AssertJSONScalar>(utf8(), R"(null)", false, - Buffer::FromString("")); - AssertJSONScalar(null(), "null", false, nullptr); - AssertJSONScalar(boolean(), "true", true, true); - AssertJSONScalar(boolean(), "false", true, false); - AssertJSONScalar(boolean(), "null", false, false); - AssertJSONScalar(boolean(), "0", true, false); - AssertJSONScalar(boolean(), "1", true, true); - AssertJSONScalar(float64(), "1.0", true, 1.0); - AssertJSONScalar(float64(), "-0.0", true, -0.0); - ASSERT_OK(ScalarFromJSON(float64(), "NaN", &scalar)); - 
ASSERT_TRUE(std::isnan(checked_cast(*scalar).value)); - ASSERT_OK(ScalarFromJSON(float64(), "Inf", &scalar)); - ASSERT_TRUE(std::isinf(checked_cast(*scalar).value)); -} - -TEST(TestScalarFromJSON, Errors) { - std::shared_ptr scalar; - ASSERT_RAISES(Invalid, ScalarFromJSON(int64(), "[0]", &scalar)); - ASSERT_RAISES(Invalid, ScalarFromJSON(int64(), "[9223372036854775808]", &scalar)); - ASSERT_RAISES(Invalid, ScalarFromJSON(int64(), "[-9223372036854775809]", &scalar)); - ASSERT_RAISES(Invalid, ScalarFromJSON(uint64(), "[18446744073709551616]", &scalar)); - ASSERT_RAISES(Invalid, ScalarFromJSON(uint64(), "[-1]", &scalar)); - ASSERT_RAISES(Invalid, ScalarFromJSON(binary(), "0", &scalar)); - ASSERT_RAISES(Invalid, ScalarFromJSON(binary(), "[]", &scalar)); - ASSERT_RAISES(Invalid, ScalarFromJSON(boolean(), "0.0", &scalar)); - ASSERT_RAISES(Invalid, ScalarFromJSON(boolean(), "\"true\"", &scalar)); -} - -TEST(TestDictScalarFromJSON, Basics) { - auto type = dictionary(int32(), utf8()); - auto dict = R"(["whiskey", "tango", "foxtrot"])"; - ASSERT_OK_AND_ASSIGN(auto expected_dictionary, ArrayFromJSON(utf8(), dict)); - - for (auto index : {"null", "2", "1", "0"}) { - auto scalar = DictScalarFromJSON(type, index, dict); - auto expected_index = ScalarFromJSON(int32(), index); - AssertScalarsEqual(*DictionaryScalar::Make(expected_index, expected_dictionary), - *scalar, /*verbose=*/true); - ASSERT_OK(scalar->ValidateFull()); - } -} - -TEST(TestDictScalarFromJSON, Errors) { - auto type = dictionary(int32(), utf8()); - std::shared_ptr scalar; - - ASSERT_RAISES(Invalid, - DictScalarFromJSON(type, "\"not a valid index\"", "[\"\"]", &scalar)); - ASSERT_RAISES(Invalid, DictScalarFromJSON(type, "0", "[1]", - &scalar)); // dict value isn't string -} - -} // namespace json -} // namespace internal -} // namespace ipc -} // namespace arrow diff --git a/cpp/src/arrow/ipc/meson.build b/cpp/src/arrow/ipc/meson.build new file mode 100644 index 00000000000..78a346eefef --- /dev/null +++ 
b/cpp/src/arrow/ipc/meson.build @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +exc = executable( + 'arrow-feather-test', + sources: ['feather_test.cc'], + dependencies: [arrow_ipc_deps, arrow_test_dep], +) +test('arrow-feather-test', exc) + +ipc_tests = ['message_internal_test', 'read_write_test', 'tensor_test'] + +foreach ipc_test : ipc_tests + test_name = 'arrow-ipc-@0@'.format(ipc_test.replace('_', '-')) + exc = executable( + test_name, + sources: ['@0@.cc'.format(ipc_test)], + dependencies: [arrow_ipc_deps, arrow_test_dep], + ) + test(test_name, exc) +endforeach + +install_headers( + [ + 'api.h', + 'dictionary.h', + 'feather.h', + 'message.h', + 'options.h', + 'reader.h', + 'test_common.h', + 'type_fwd.h', + 'util.h', + 'writer.h', + ], + subdir: 'arrow/ipc', +) + +if needs_utilities or needs_integration + file_to_stream_exc = executable( + 'arrow-file-to-stream', + sources: ['file_to_stream.cc'], + dependencies: [arrow_dep], + install: needs_utilities, + ) + + stream_to_file_exc = executable( + 'arrow-stream-to-file', + sources: ['stream_to_file.cc'], + dependencies: [arrow_dep], + install: needs_utilities, + ) +endif + +exc = executable( + 'arrow-ipc-read-write-benchmark', + sources: 
['read_write_benchmark.cc'], + dependencies: [arrow_benchmark_dep], +) +benchmark('arrow-ipc-read-write-benchmark', exc) + +if needs_fuzzing or (needs_utilities + and needs_testing + and needs_lz4 + and needs_zstd +) + fuzz_corpus_exc = executable( + 'arrow-ipc-generate-fuzz-corpus', + sources: ['generate_fuzz_corpus.cc'], + dependencies: [arrow_test_dep_no_main], + ) + + tensor_fuzz_corpus_exc = executable( + 'arrow-ipc-generate-tensor-fuzz-corpus', + sources: ['generate_tensor_fuzz_corpus.cc'], + dependencies: [arrow_test_dep_no_main], + ) +endif + +ipc_fuzz_targets = ['file_fuzz', 'stream_fuzz', 'tensor_stream_fuzz'] + +if needs_fuzzing + foreach ipc_fuzz_target : ipc_fuzz_targets + target_name = 'arrow-ipc-@0@'.format(ipc_fuzz_target.replace('_', '-')) + executable( + target_name, + sources: ['@0@.cc'.format(ipc_fuzz_target)], + dependencies: [arrow_dep], + override_options: ['-Db_sanitize=fuzzer'], + ) + endforeach +endif diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index bed60854209..7919878f148 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -48,6 +48,38 @@ class MemoryPool; namespace ipc { +namespace { + +Status MaybeAlignMetadata(std::shared_ptr* metadata) { + if (reinterpret_cast((*metadata)->data()) % 8 != 0) { + // If the metadata memory is not aligned, we copy it here to avoid + // potential UBSAN issues from Flatbuffers + ARROW_ASSIGN_OR_RAISE(*metadata, (*metadata)->CopySlice(0, (*metadata)->size())); + } + return Status::OK(); +} + +Status CheckMetadataAndGetBodyLength(const Buffer& metadata, int64_t* body_length) { + const flatbuf::Message* fb_message = nullptr; + RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &fb_message)); + *body_length = fb_message->bodyLength(); + if (*body_length < 0) { + return Status::IOError("Invalid IPC message: negative bodyLength"); + } + return Status::OK(); +} + +Status WritePadding(io::OutputStream* stream, int64_t nbytes) { + while 
(nbytes > 0) { + const int64_t bytes_to_write = std::min(nbytes, kArrowAlignment); + RETURN_NOT_OK(stream->Write(kPaddingBytes, bytes_to_write)); + nbytes -= bytes_to_write; + } + return Status::OK(); +} + +} // namespace + class Message::MessageImpl { public: explicit MessageImpl(std::shared_ptr metadata, std::shared_ptr body) @@ -176,25 +208,6 @@ bool Message::Equals(const Message& other) const { } } -Status MaybeAlignMetadata(std::shared_ptr* metadata) { - if (reinterpret_cast((*metadata)->data()) % 8 != 0) { - // If the metadata memory is not aligned, we copy it here to avoid - // potential UBSAN issues from Flatbuffers - ARROW_ASSIGN_OR_RAISE(*metadata, (*metadata)->CopySlice(0, (*metadata)->size())); - } - return Status::OK(); -} - -Status CheckMetadataAndGetBodyLength(const Buffer& metadata, int64_t* body_length) { - const flatbuf::Message* fb_message = nullptr; - RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &fb_message)); - *body_length = fb_message->bodyLength(); - if (*body_length < 0) { - return Status::IOError("Invalid IPC message: negative bodyLength"); - } - return Status::OK(); -} - Result> Message::ReadFrom(std::shared_ptr metadata, io::InputStream* stream) { std::unique_ptr result; @@ -228,15 +241,6 @@ Result> Message::ReadFrom(const int64_t offset, return result; } -Status WritePadding(io::OutputStream* stream, int64_t nbytes) { - while (nbytes > 0) { - const int64_t bytes_to_write = std::min(nbytes, kArrowAlignment); - RETURN_NOT_OK(stream->Write(kPaddingBytes, bytes_to_write)); - nbytes -= bytes_to_write; - } - return Status::OK(); -} - Status Message::SerializeTo(io::OutputStream* stream, const IpcWriteOptions& options, int64_t* output_length) const { int32_t metadata_length = 0; @@ -281,6 +285,8 @@ std::string FormatMessageType(MessageType type) { return "unknown"; } +namespace { + Status ReadFieldsSubset(int64_t offset, int32_t metadata_length, io::RandomAccessFile* file, const FieldsLoaderFunction& fields_loader, 
@@ -299,8 +305,8 @@ Status ReadFieldsSubset(int64_t offset, int32_t metadata_length, } internal::IoRecordedRandomAccessFile io_recorded_random_access_file(required_size); RETURN_NOT_OK(fields_loader(batch, &io_recorded_random_access_file)); - auto const& read_ranges = io_recorded_random_access_file.GetReadRanges(); - for (auto const& range : read_ranges) { + const auto& read_ranges = io_recorded_random_access_file.GetReadRanges(); + for (const auto& range : read_ranges) { auto read_result = file->ReadAt(offset + metadata_length + range.offset, range.length, body->mutable_data() + range.offset); if (!read_result.ok()) { @@ -311,6 +317,8 @@ Status ReadFieldsSubset(int64_t offset, int32_t metadata_length, return Status::OK(); } +} // namespace + Result> ReadMessage(std::shared_ptr metadata, std::shared_ptr body) { std::unique_ptr result; diff --git a/cpp/src/arrow/ipc/options.h b/cpp/src/arrow/ipc/options.h index 48b6758212b..ec0e2a5b6f9 100644 --- a/cpp/src/arrow/ipc/options.h +++ b/cpp/src/arrow/ipc/options.h @@ -25,6 +25,7 @@ #include "arrow/ipc/type_fwd.h" #include "arrow/status.h" #include "arrow/type_fwd.h" +#include "arrow/util/align_util.h" #include "arrow/util/compression.h" #include "arrow/util/visibility.h" @@ -128,6 +129,18 @@ struct ARROW_EXPORT IpcWriteOptions { static IpcWriteOptions Defaults(); }; +/// \brief Alignment of data in memory +/// Alignment values larger than 0 are taken directly as byte alignment value +/// See util::EnsureAlignment(..., int64_t alignment, ...) +enum class Alignment : int64_t { + /// \brief data is aligned depending on the actual data type + kDataTypeSpecificAlignment = util::kValueAlignment, + /// \brief no particular alignment enforced + kAnyAlignment = 0, + /// \brief data is aligned to 64-byte boundary + k64ByteAlignment = 64 +}; + /// \brief Options for reading Arrow IPC messages struct ARROW_EXPORT IpcReadOptions { /// \brief The maximum permitted schema nesting depth. 
@@ -161,6 +174,16 @@ struct ARROW_EXPORT IpcReadOptions { /// RecordBatchStreamReader and StreamDecoder classes. bool ensure_native_endian = true; + /// \brief How to align data if mis-aligned + /// + /// Data is copied to aligned memory locations allocated via the + /// MemoryPool configured as \ref arrow::ipc::IpcReadOptions::memory_pool. + /// Some use cases might require data to have a specific alignment, for example, + /// for the data buffer of an Int32 array to be aligned on a 4-byte boundary. + /// + /// Default (kAnyAlignment) keeps the alignment as is, so no copy of data occurs. + Alignment ensure_alignment = Alignment::kAnyAlignment; + /// \brief Options to control caching behavior when pre-buffering is requested /// /// The lazy property will always be reset to true to deliver the expected behavior diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 8bd28e4d584..84ec923ce80 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -579,6 +579,29 @@ TEST_F(TestIpcRoundTrip, SpecificMetadataVersion) { TestMetadataVersion(MetadataVersion::V5); } +TEST_F(TestIpcRoundTrip, ListWithSlicedValues) { + // This tests serialization of a sliced ListArray that got sliced "the Rust + // way": by slicing the value_offsets buffer, but keeping top-level offset at + // 0. 
+ auto child_data = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5]")->data(); + + // Offsets buffer [2, 5] + TypedBufferBuilder offsets_builder; + ASSERT_OK(offsets_builder.Reserve(2)); + ASSERT_OK(offsets_builder.Append(2)); + ASSERT_OK(offsets_builder.Append(5)); + ASSERT_OK_AND_ASSIGN(auto offsets_buffer, offsets_builder.Finish()); + + auto list_data = ArrayData::Make(list(int32()), + /*num_rows=*/1, + /*buffers=*/{nullptr, offsets_buffer}); + list_data->child_data = {child_data}; + std::shared_ptr list_array = MakeArray(list_data); + ASSERT_OK(list_array->ValidateFull()); + + CheckRoundtrip(list_array); +} + TEST(TestReadMessage, CorruptedSmallInput) { std::string data = "abc"; auto reader = io::BufferReader::FromString(data); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index ed7f97b8019..1ec28366267 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -47,6 +47,7 @@ #include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/align_util.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" @@ -636,8 +637,17 @@ Result> LoadRecordBatchSubset( arrow::internal::SwapEndianArrayData(filtered_column)); } } - return RecordBatch::Make(std::move(filtered_schema), metadata->length(), - std::move(filtered_columns)); + auto batch = RecordBatch::Make(std::move(filtered_schema), metadata->length(), + std::move(filtered_columns)); + + if (ARROW_PREDICT_FALSE(context.options.ensure_alignment != Alignment::kAnyAlignment)) { + return util::EnsureAlignment(batch, + // the numerical value of ensure_alignment enum is taken + // literally as byte alignment + static_cast(context.options.ensure_alignment), + context.options.memory_pool); + } + return batch; } Result> LoadRecordBatch( @@ -1152,9 +1162,11 @@ Result> RecordBatchStreamReader::Open( // ---------------------------------------------------------------------- // Reader implementation 
+namespace { + // Common functions used in both the random-access file reader and the // asynchronous generator -static inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) { +inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) { return FileBlock{block->offset(), block->metaDataLength(), block->bodyLength()}; } @@ -1167,7 +1179,7 @@ Status CheckAligned(const FileBlock& block) { return Status::OK(); } -static Result> ReadMessageFromBlock( +Result> ReadMessageFromBlock( const FileBlock& block, io::RandomAccessFile* file, const FieldsLoaderFunction& fields_loader) { RETURN_NOT_OK(CheckAligned(block)); @@ -1179,7 +1191,7 @@ static Result> ReadMessageFromBlock( return message; } -static Future> ReadMessageFromBlockAsync( +Future> ReadMessageFromBlockAsync( const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& io_context) { if (!bit_util::IsMultipleOf8(block.offset) || !bit_util::IsMultipleOf8(block.metadata_length) || @@ -1199,7 +1211,7 @@ class RecordBatchFileReaderImpl; /// A generator of record batches. /// /// All batches are yielded in order. -class ARROW_EXPORT WholeIpcFileRecordBatchGenerator { +class WholeIpcFileRecordBatchGenerator { public: using Item = std::shared_ptr; @@ -1236,7 +1248,7 @@ class ARROW_EXPORT WholeIpcFileRecordBatchGenerator { /// a subset of columns from the file. /// /// All batches are yielded in order. 
-class ARROW_EXPORT SelectiveIpcFileRecordBatchGenerator { +class SelectiveIpcFileRecordBatchGenerator { public: using Item = std::shared_ptr; @@ -1878,75 +1890,6 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { bool swap_endian_; }; -Result> RecordBatchFileReader::Open( - io::RandomAccessFile* file, const IpcReadOptions& options) { - ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return Open(file, footer_offset, options); -} - -Result> RecordBatchFileReader::Open( - io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) { - auto result = std::make_shared(); - RETURN_NOT_OK(result->Open(file, footer_offset, options)); - return result; -} - -Result> RecordBatchFileReader::Open( - const std::shared_ptr& file, const IpcReadOptions& options) { - ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return Open(file, footer_offset, options); -} - -Result> RecordBatchFileReader::Open( - const std::shared_ptr& file, int64_t footer_offset, - const IpcReadOptions& options) { - auto result = std::make_shared(); - RETURN_NOT_OK(result->Open(file, footer_offset, options)); - return result; -} - -Future> RecordBatchFileReader::OpenAsync( - const std::shared_ptr& file, const IpcReadOptions& options) { - ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return OpenAsync(file, footer_offset, options); -} - -Future> RecordBatchFileReader::OpenAsync( - io::RandomAccessFile* file, const IpcReadOptions& options) { - ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return OpenAsync(file, footer_offset, options); -} - -Future> RecordBatchFileReader::OpenAsync( - const std::shared_ptr& file, int64_t footer_offset, - const IpcReadOptions& options) { - auto result = std::make_shared(); - return result->OpenAsync(file, footer_offset, options) - .Then([=]() -> Result> { return result; }); -} - -Future> RecordBatchFileReader::OpenAsync( - io::RandomAccessFile* file, int64_t footer_offset, 
const IpcReadOptions& options) { - auto result = std::make_shared(); - return result->OpenAsync(file, footer_offset, options) - .Then([=]() -> Result> { return result; }); -} - -Result RecordBatchFileReader::ToRecordBatches() { - RecordBatchVector batches; - const auto n = num_record_batches(); - for (int i = 0; i < n; ++i) { - ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatch(i)); - batches.emplace_back(std::move(batch)); - } - return batches; -} - -Result> RecordBatchFileReader::ToTable() { - ARROW_ASSIGN_OR_RAISE(auto batches, ToRecordBatches()); - return Table::FromRecordBatches(schema(), std::move(batches)); -} - Future SelectiveIpcFileRecordBatchGenerator::operator()() { int index = index_++; @@ -2036,6 +1979,77 @@ Result> WholeIpcFileRecordBatchGenerator::ReadRecor return batch_with_metadata.batch; } +} // namespace + +Result> RecordBatchFileReader::Open( + io::RandomAccessFile* file, const IpcReadOptions& options) { + ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); + return Open(file, footer_offset, options); +} + +Result> RecordBatchFileReader::Open( + io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) { + auto result = std::make_shared(); + RETURN_NOT_OK(result->Open(file, footer_offset, options)); + return result; +} + +Result> RecordBatchFileReader::Open( + const std::shared_ptr& file, const IpcReadOptions& options) { + ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); + return Open(file, footer_offset, options); +} + +Result> RecordBatchFileReader::Open( + const std::shared_ptr& file, int64_t footer_offset, + const IpcReadOptions& options) { + auto result = std::make_shared(); + RETURN_NOT_OK(result->Open(file, footer_offset, options)); + return result; +} + +Future> RecordBatchFileReader::OpenAsync( + const std::shared_ptr& file, const IpcReadOptions& options) { + ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); + return OpenAsync(file, footer_offset, options); +} + +Future> 
RecordBatchFileReader::OpenAsync( + io::RandomAccessFile* file, const IpcReadOptions& options) { + ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); + return OpenAsync(file, footer_offset, options); +} + +Future> RecordBatchFileReader::OpenAsync( + const std::shared_ptr& file, int64_t footer_offset, + const IpcReadOptions& options) { + auto result = std::make_shared(); + return result->OpenAsync(file, footer_offset, options) + .Then([=]() -> Result> { return result; }); +} + +Future> RecordBatchFileReader::OpenAsync( + io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) { + auto result = std::make_shared(); + return result->OpenAsync(file, footer_offset, options) + .Then([=]() -> Result> { return result; }); +} + +Result RecordBatchFileReader::ToRecordBatches() { + RecordBatchVector batches; + const auto n = num_record_batches(); + for (int i = 0; i < n; ++i) { + ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatch(i)); + batches.emplace_back(std::move(batch)); + } + return batches; +} + +Result> RecordBatchFileReader::ToTable() { + ARROW_ASSIGN_OR_RAISE(auto batches, ToRecordBatches()); + return Table::FromRecordBatches(schema(), std::move(batches)); +} + Status Listener::OnEOS() { return Status::OK(); } Status Listener::OnSchemaDecoded(std::shared_ptr schema) { return Status::OK(); } @@ -2520,6 +2534,8 @@ Result> ReadSparseTensorPayload(const IpcPayload& } // namespace internal +namespace { + Result> ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file) { std::shared_ptr type; @@ -2570,6 +2586,8 @@ Result> ReadSparseTensor(const Buffer& metadata, } } +} // namespace + Result> ReadSparseTensor(const Message& message) { CHECK_HAS_BODY(message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 3d7137e4965..dea40f18e81 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ 
-183,14 +183,6 @@ Status MakeListArray(const std::shared_ptr& child_array, int num_lists, return (**out).Validate(); } -} // namespace - -Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, - std::shared_ptr* out) { - return MakeListArray(child_array, num_lists, include_nulls, pool, out); -} - Status MakeRandomListViewArray(const std::shared_ptr& child_array, int num_lists, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { @@ -217,12 +209,6 @@ Status MakeRandomLargeListViewArray(const std::shared_ptr& child_array, return Status::OK(); } -Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, - std::shared_ptr* out) { - return MakeListArray(child_array, num_lists, include_nulls, pool, out); -} - Status MakeRandomMapArray(const std::shared_ptr& key_array, const std::shared_ptr& item_array, int num_maps, bool include_nulls, MemoryPool* pool, @@ -240,6 +226,20 @@ Status MakeRandomMapArray(const std::shared_ptr& key_array, return (**out).Validate(); } +} // namespace + +Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + return MakeListArray(child_array, num_lists, include_nulls, pool, out); +} + +Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + return MakeListArray(child_array, num_lists, include_nulls, pool, out); +} + Status MakeRandomBooleanArray(const int length, bool include_nulls, std::shared_ptr* out) { std::vector values(length); @@ -421,7 +421,7 @@ Status MakeNullRecordBatch(std::shared_ptr* out) { return Status::OK(); } -Status MakeListRecordBatch(std::shared_ptr* out) { +Status MakeListRecordBatchSized(const int length, std::shared_ptr* out) { // Make the schema auto f0 = field("f0", list(int32())); auto f1 = field("f1", 
list(list(int32()))); @@ -431,7 +431,6 @@ Status MakeListRecordBatch(std::shared_ptr* out) { // Example data MemoryPool* pool = default_memory_pool(); - const int length = 200; std::shared_ptr leaf_values, list_array, list_list_array, large_list_array; const bool include_nulls = true; RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); @@ -446,7 +445,11 @@ Status MakeListRecordBatch(std::shared_ptr* out) { return Status::OK(); } -Status MakeListViewRecordBatch(std::shared_ptr* out) { +Status MakeListRecordBatch(std::shared_ptr* out) { + return MakeListRecordBatchSized(200, out); +} + +Status MakeListViewRecordBatchSized(const int length, std::shared_ptr* out) { // Make the schema auto f0 = field("f0", list_view(int32())); auto f1 = field("f1", list_view(list_view(int32()))); @@ -456,7 +459,6 @@ Status MakeListViewRecordBatch(std::shared_ptr* out) { // Example data MemoryPool* pool = default_memory_pool(); - const int length = 200; std::shared_ptr leaf_values, list_array, list_list_array, large_list_array; const bool include_nulls = true; RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); @@ -471,6 +473,10 @@ Status MakeListViewRecordBatch(std::shared_ptr* out) { return Status::OK(); } +Status MakeListViewRecordBatch(std::shared_ptr* out) { + return MakeListViewRecordBatchSized(200, out); +} + Status MakeFixedSizeListRecordBatch(std::shared_ptr* out) { // Make the schema auto f0 = field("f0", fixed_size_list(int32(), 1)); @@ -608,6 +614,8 @@ Status MakeStruct(std::shared_ptr* out) { return Status::OK(); } +namespace { + Status AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { auto& child = array->child_data[1]; auto builder = MakeBuilder(child->type).ValueOrDie(); @@ -617,6 +625,8 @@ Status AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) { return Status::OK(); } +} // namespace + Status MakeRunEndEncoded(std::shared_ptr* out) { const int64_t logical_length = 10000; const int64_t 
slice_offset = 2000; diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 189de288795..6044ef207bc 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -104,9 +104,15 @@ Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeNullRecordBatch(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeListRecordBatchSized(int length, std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeListRecordBatch(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeListViewRecordBatchSized(int length, std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeListViewRecordBatch(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 90574c7cb61..cba484af158 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -324,34 +324,45 @@ class RecordBatchSerializer { // Share slicing logic between ListArray, BinaryArray and LargeBinaryArray using offset_type = typename ArrayType::offset_type; - auto offsets = array.value_offsets(); + if (array.length() == 0) { + *value_offsets = array.value_offsets(); + return Status::OK(); + } - int64_t required_bytes = sizeof(offset_type) * (array.length() + 1); - if (array.offset() != 0) { - // If we have a non-zero offset, then the value offsets do not start at - // zero. We must a) create a new offsets array with shifted offsets and - // b) slice the values array accordingly + const int64_t required_bytes = sizeof(offset_type) * (array.length() + 1); + + offset_type first_offset = 0; + RETURN_NOT_OK(MemoryManager::CopyBufferSliceToCPU( + array.data()->buffers[1], array.offset() * sizeof(offset_type), + sizeof(offset_type), reinterpret_cast(&first_offset))); + if (first_offset > 0) { + // If the offset of the first value is non-zero, then we must create a new + // offsets buffer with shifted offsets. 
+ if (!array.data()->buffers[1]->is_cpu()) { + return Status::NotImplemented("Rebasing non-CPU offsets"); + } ARROW_ASSIGN_OR_RAISE(auto shifted_offsets, AllocateBuffer(required_bytes, options_.memory_pool)); + const offset_type* source_offsets = array.raw_value_offsets(); auto dest_offsets = shifted_offsets->mutable_span_as(); - const offset_type start_offset = array.value_offset(0); + const offset_type start_offset = source_offsets[0]; - for (int i = 0; i < array.length(); ++i) { - dest_offsets[i] = array.value_offset(i) - start_offset; + for (int i = 0; i <= array.length(); ++i) { + dest_offsets[i] = source_offsets[i] - start_offset; } - // Final offset - dest_offsets[array.length()] = array.value_offset(array.length()) - start_offset; - offsets = std::move(shifted_offsets); + *value_offsets = std::move(shifted_offsets); } else { - // ARROW-6046: Slice offsets to used extent, in case we have a truncated - // slice - if (offsets != nullptr && offsets->size() > required_bytes) { - offsets = SliceBuffer(offsets, 0, required_bytes); + // ARROW-6046: if we have a truncated slice with unused leading or + // trailing data, then we slice it. + if (array.offset() > 0 || array.value_offsets()->size() > required_bytes) { + *value_offsets = SliceBuffer( + array.value_offsets(), array.offset() * sizeof(offset_type), required_bytes); + } else { + *value_offsets = array.value_offsets(); } } - *value_offsets = std::move(offsets); return Status::OK(); } @@ -367,6 +378,9 @@ class RecordBatchSerializer { // If we have a non-zero offset, it's likely that the smallest offset is // not zero. We must a) create a new offsets array with shifted offsets and // b) slice the values array accordingly. 
+ if (!array.data()->buffers[1]->is_cpu()) { + return Status::NotImplemented("Rebasing non-CPU list view offsets"); + } ARROW_ASSIGN_OR_RAISE(auto shifted_offsets, AllocateBuffer(required_bytes, options_.memory_pool)); @@ -1535,12 +1549,6 @@ Result> MakeStreamWriter( options, /*is_file_format=*/false); } -Result> NewStreamWriter( - io::OutputStream* sink, const std::shared_ptr& schema, - const IpcWriteOptions& options) { - return MakeStreamWriter(sink, schema, options); -} - Result> MakeFileWriter( io::OutputStream* sink, const std::shared_ptr& schema, const IpcWriteOptions& options, @@ -1560,13 +1568,6 @@ Result> MakeFileWriter( schema, options, /*is_file_format=*/true); } -Result> NewFileWriter( - io::OutputStream* sink, const std::shared_ptr& schema, - const IpcWriteOptions& options, - const std::shared_ptr& metadata) { - return MakeFileWriter(sink, schema, options, metadata); -} - namespace internal { Result> OpenRecordBatchWriter( diff --git a/cpp/src/arrow/json/CMakeLists.txt b/cpp/src/arrow/json/CMakeLists.txt index 95b299d8f0c..fa7d0607848 100644 --- a/cpp/src/arrow/json/CMakeLists.txt +++ b/cpp/src/arrow/json/CMakeLists.txt @@ -20,6 +20,7 @@ add_arrow_test(test chunked_builder_test.cc chunker_test.cc converter_test.cc + from_string_test.cc parser_test.cc reader_test.cc PREFIX diff --git a/cpp/src/arrow/json/from_string.cc b/cpp/src/arrow/json/from_string.cc new file mode 100644 index 00000000000..e35a362f5a2 --- /dev/null +++ b/cpp/src/arrow/json/from_string.cc @@ -0,0 +1,1073 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "arrow/array/array_dict.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_decimal.h" +#include "arrow/array/builder_dict.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/array/builder_time.h" +#include "arrow/array/builder_union.h" +#include "arrow/chunked_array.h" +#include "arrow/json/from_string.h" +#include "arrow/scalar.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/float16.h" +#include "arrow/util/logging_internal.h" +#include "arrow/util/value_parsing.h" + +#include "arrow/json/rapidjson_defs.h" + +#include +#include +#include +#include +#include + +namespace rj = arrow::rapidjson; + +namespace arrow { + +using internal::ParseValue; +using util::Float16; + +namespace json { + +using ::arrow::internal::checked_cast; +using ::arrow::internal::checked_pointer_cast; + +namespace { + +constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; + +const char* JsonTypeName(rj::Type json_type) { + switch (json_type) { + case rapidjson::kNullType: + return "null"; + case rapidjson::kFalseType: + return "false"; + case rapidjson::kTrueType: + return "true"; + case rapidjson::kObjectType: + return "object"; + case rapidjson::kArrayType: + return "array"; + case rapidjson::kStringType: + return "string"; + case rapidjson::kNumberType: + return "number"; + default: + return "unknown"; + } +} + 
+Status JSONTypeError(const char* expected_type, rj::Type json_type) { + return Status::Invalid("Expected ", expected_type, " or null, got JSON type ", + JsonTypeName(json_type)); +} + +class JSONConverter { + public: + virtual ~JSONConverter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status AppendValue(const rj::Value& json_obj) = 0; + + Status AppendNull() { return this->builder()->AppendNull(); } + + virtual Status AppendValues(const rj::Value& json_array) = 0; + + virtual std::shared_ptr builder() = 0; + + virtual Status Finish(std::shared_ptr* out) { + auto builder = this->builder(); + if (builder->length() == 0) { + // Make sure the builder was initialized + RETURN_NOT_OK(builder->Resize(1)); + } + return builder->Finish(out); + } + + protected: + std::shared_ptr type_; +}; + +Status GetConverter(const std::shared_ptr&, + std::shared_ptr* out); + +// CRTP +template +class ConcreteConverter : public JSONConverter { + public: + Result SizeOfJSONArray(const rj::Value& json_obj) { + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + return json_obj.Size(); + } + + Status AppendValues(const rj::Value& json_array) final { + auto self = static_cast(this); + ARROW_ASSIGN_OR_RAISE(auto size, SizeOfJSONArray(json_array)); + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(self->AppendValue(json_array[i])); + } + return Status::OK(); + } + + const std::shared_ptr& value_type() { + if (type_->id() != Type::DICTIONARY) { + return type_; + } + return checked_cast(*type_).value_type(); + } + + template + Status MakeConcreteBuilder(std::shared_ptr* out) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), this->type_, &builder)); + *out = checked_pointer_cast(std::move(builder)); + DCHECK(*out); + return Status::OK(); + } +}; + +// ------------------------------------------------------------------------ +// Converter for null arrays + +class NullConverter final : public 
ConcreteConverter { + public: + explicit NullConverter(const std::shared_ptr& type) { + type_ = type; + builder_ = std::make_shared(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + return JSONTypeError("null", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for boolean arrays + +class BooleanConverter final : public ConcreteConverter { + public: + explicit BooleanConverter(const std::shared_ptr& type) { + type_ = type; + builder_ = std::make_shared(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + if (json_obj.IsBool()) { + return builder_->Append(json_obj.GetBool()); + } + if (json_obj.IsInt()) { + return builder_->Append(json_obj.GetInt() != 0); + } + return JSONTypeError("boolean", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Helpers for numeric converters + +// Convert single signed integer value (also {Date,Time}{32,64} and Timestamp) +template +enable_if_physical_signed_integer ConvertNumber(const rj::Value& json_obj, + const DataType& type, + typename T::c_type* out) { + if (json_obj.IsInt64()) { + int64_t v64 = json_obj.GetInt64(); + *out = static_cast(v64); + if (*out == v64) { + return Status::OK(); + } else { + return Status::Invalid("Value ", v64, " out of bounds for ", type); + } + } else { + *out = static_cast(0); + return JSONTypeError("signed int", json_obj.GetType()); + } +} + +// Convert single unsigned integer value +template +enable_if_unsigned_integer ConvertNumber(const rj::Value& json_obj, + const DataType& type, + typename T::c_type* out) { + if 
(json_obj.IsUint64()) { + uint64_t v64 = json_obj.GetUint64(); + *out = static_cast(v64); + if (*out == v64) { + return Status::OK(); + } else { + return Status::Invalid("Value ", v64, " out of bounds for ", type); + } + } else { + *out = static_cast(0); + return JSONTypeError("unsigned int", json_obj.GetType()); + } +} + +// Convert float16/HalfFloatType +template +enable_if_half_float ConvertNumber(const rj::Value& json_obj, + const DataType& type, uint16_t* out) { + if (json_obj.IsDouble()) { + double f64 = json_obj.GetDouble(); + *out = Float16(f64).bits(); + return Status::OK(); + } else if (json_obj.IsUint()) { + uint32_t u32t = json_obj.GetUint(); + double f64 = static_cast(u32t); + *out = Float16(f64).bits(); + return Status::OK(); + } else if (json_obj.IsInt()) { + int32_t i32t = json_obj.GetInt(); + double f64 = static_cast(i32t); + *out = Float16(f64).bits(); + return Status::OK(); + } else { + *out = static_cast(0); + return JSONTypeError("unsigned int", json_obj.GetType()); + } +} + +// Convert single floating point value +template +enable_if_physical_floating_point ConvertNumber(const rj::Value& json_obj, + const DataType& type, + typename T::c_type* out) { + if (json_obj.IsNumber()) { + *out = static_cast(json_obj.GetDouble()); + return Status::OK(); + } else { + *out = static_cast(0); + return JSONTypeError("number", json_obj.GetType()); + } +} + +// ------------------------------------------------------------------------ +// Converter for int arrays + +template ::BuilderType> +class IntegerConverter final + : public ConcreteConverter> { + using c_type = typename Type::c_type; + + static constexpr auto is_signed = std::is_signed::value; + + public: + explicit IntegerConverter(const std::shared_ptr& type) { this->type_ = type; } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + c_type value; + 
RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); + return builder_->Append(value); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for float arrays + +template ::BuilderType> +class FloatConverter final : public ConcreteConverter> { + using c_type = typename Type::c_type; + + public: + explicit FloatConverter(const std::shared_ptr& type) { this->type_ = type; } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + c_type value; + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); + return builder_->Append(value); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for decimal arrays + +template +class DecimalConverter final + : public ConcreteConverter< + DecimalConverter> { + public: + explicit DecimalConverter(const std::shared_ptr& type) { + this->type_ = type; + decimal_type_ = &checked_cast(*this->value_type()); + } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (json_obj.IsString()) { + int32_t precision, scale; + DecimalValue d; + auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); + RETURN_NOT_OK(DecimalValue::FromString(view, &d, &precision, &scale)); + if (scale != decimal_type_->scale()) { + return Status::Invalid("Invalid scale for decimal: expected ", + decimal_type_->scale(), ", got ", scale); + } + return builder_->Append(d); + } + return JSONTypeError("decimal string", json_obj.GetType()); + } + + 
std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; + const DecimalSubtype* decimal_type_; +}; + +template ::BuilderType> +using Decimal32Converter = DecimalConverter; +template ::BuilderType> +using Decimal64Converter = DecimalConverter; +template ::BuilderType> +using Decimal128Converter = DecimalConverter; +template ::BuilderType> +using Decimal256Converter = DecimalConverter; + +// ------------------------------------------------------------------------ +// Converter for timestamp arrays + +class TimestampConverter final : public ConcreteConverter { + public: + explicit TimestampConverter(const std::shared_ptr& type) + : timestamp_type_{checked_cast(type.get())} { + this->type_ = type; + builder_ = std::make_shared(type, default_memory_pool()); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + int64_t value; + if (json_obj.IsNumber()) { + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); + } else if (json_obj.IsString()) { + std::string_view view(json_obj.GetString(), json_obj.GetStringLength()); + if (!ParseValue(*timestamp_type_, view.data(), view.size(), &value)) { + return Status::Invalid("couldn't parse timestamp from ", view); + } + } else { + return JSONTypeError("timestamp", json_obj.GetType()); + } + return builder_->Append(value); + } + + std::shared_ptr builder() override { return builder_; } + + private: + const TimestampType* timestamp_type_; + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for day-time interval arrays + +class DayTimeIntervalConverter final + : public ConcreteConverter { + public: + explicit DayTimeIntervalConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared(default_memory_pool()); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return 
this->AppendNull(); + } + DayTimeIntervalType::DayMilliseconds value; + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + if (json_obj.Size() != 2) { + return Status::Invalid( + "day time interval pair must have exactly two elements, had ", json_obj.Size()); + } + RETURN_NOT_OK(ConvertNumber(json_obj[0], *this->type_, &value.days)); + RETURN_NOT_OK( + ConvertNumber(json_obj[1], *this->type_, &value.milliseconds)); + return builder_->Append(value); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + +class MonthDayNanoIntervalConverter final + : public ConcreteConverter { + public: + explicit MonthDayNanoIntervalConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared(default_memory_pool()); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + MonthDayNanoIntervalType::MonthDayNanos value; + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + if (json_obj.Size() != 3) { + return Status::Invalid( + "month_day_nano_interval must have exactly 3 elements, had ", json_obj.Size()); + } + RETURN_NOT_OK(ConvertNumber(json_obj[0], *this->type_, &value.months)); + RETURN_NOT_OK(ConvertNumber(json_obj[1], *this->type_, &value.days)); + RETURN_NOT_OK( + ConvertNumber(json_obj[2], *this->type_, &value.nanoseconds)); + + return builder_->Append(value); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for binary and string arrays + +template ::BuilderType> +class StringConverter final + : public ConcreteConverter> { + public: + explicit StringConverter(const std::shared_ptr& type) { this->type_ = type; } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status 
AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (json_obj.IsString()) { + auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); + return builder_->Append(view); + } else { + return JSONTypeError("string", json_obj.GetType()); + } + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for fixed-size binary arrays + +template ::BuilderType> +class FixedSizeBinaryConverter final + : public ConcreteConverter> { + public: + explicit FixedSizeBinaryConverter(const std::shared_ptr& type) { + this->type_ = type; + } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (json_obj.IsString()) { + auto view = std::string_view(json_obj.GetString(), json_obj.GetStringLength()); + if (view.length() != static_cast(builder_->byte_width())) { + std::stringstream ss; + ss << "Invalid string length " << view.length() << " in JSON input for " + << this->type_->ToString(); + return Status::Invalid(ss.str()); + } + return builder_->Append(view); + } else { + return JSONTypeError("string", json_obj.GetType()); + } + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for list arrays + +template +class VarLengthListLikeConverter final + : public ConcreteConverter> { + public: + using BuilderType = typename TypeTraits::BuilderType; + + explicit VarLengthListLikeConverter(const std::shared_ptr& type) { + this->type_ = type; + } + + Status Init() override { + const auto& var_length_list_like_type = checked_cast(*this->type_); + RETURN_NOT_OK( + 
GetConverter(var_length_list_like_type.value_type(), &child_converter_)); + auto child_builder = child_converter_->builder(); + builder_ = + std::make_shared(default_memory_pool(), child_builder, this->type_); + return Status::OK(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + // Extend the child converter with this JSON array + ARROW_ASSIGN_OR_RAISE(auto size, this->SizeOfJSONArray(json_obj)); + RETURN_NOT_OK(builder_->Append(true, size)); + return child_converter_->AppendValues(json_obj); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; + std::shared_ptr child_converter_; +}; + +// ------------------------------------------------------------------------ +// Converter for map arrays + +class MapConverter final : public ConcreteConverter { + public: + explicit MapConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + const auto& map_type = checked_cast(*type_); + RETURN_NOT_OK(GetConverter(map_type.key_type(), &key_converter_)); + RETURN_NOT_OK(GetConverter(map_type.item_type(), &item_converter_)); + auto key_builder = key_converter_->builder(); + auto item_builder = item_converter_->builder(); + builder_ = std::make_shared(default_memory_pool(), key_builder, + item_builder, type_); + return Status::OK(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + RETURN_NOT_OK(builder_->Append()); + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + auto size = json_obj.Size(); + for (uint32_t i = 0; i < size; ++i) { + const auto& json_pair = json_obj[i]; + if (!json_pair.IsArray()) { + return JSONTypeError("array", json_pair.GetType()); + } + if (json_pair.Size() != 2) { + return Status::Invalid("key item pair must have exactly two elements, had ", + json_pair.Size()); + } + if 
(json_pair[0].IsNull()) { + return Status::Invalid("null key is invalid"); + } + RETURN_NOT_OK(key_converter_->AppendValue(json_pair[0])); + RETURN_NOT_OK(item_converter_->AppendValue(json_pair[1])); + } + return Status::OK(); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; + std::shared_ptr key_converter_, item_converter_; +}; + +// ------------------------------------------------------------------------ +// Converter for fixed size list arrays + +class FixedSizeListConverter final : public ConcreteConverter { + public: + explicit FixedSizeListConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + const auto& list_type = checked_cast(*type_); + list_size_ = list_type.list_size(); + RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + auto child_builder = child_converter_->builder(); + builder_ = std::make_shared(default_memory_pool(), + child_builder, type_); + return Status::OK(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + RETURN_NOT_OK(builder_->Append()); + // Extend the child converter with this JSON array + RETURN_NOT_OK(child_converter_->AppendValues(json_obj)); + if (json_obj.GetArray().Size() != static_cast(list_size_)) { + return Status::Invalid("incorrect list size ", json_obj.GetArray().Size()); + } + return Status::OK(); + } + + std::shared_ptr builder() override { return builder_; } + + private: + int32_t list_size_; + std::shared_ptr builder_; + std::shared_ptr child_converter_; +}; + +// ------------------------------------------------------------------------ +// Converter for struct arrays + +class StructConverter final : public ConcreteConverter { + public: + explicit StructConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + std::vector> child_builders; + for (const auto& field : type_->fields()) { + std::shared_ptr 
child_converter; + RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); + child_converters_.push_back(child_converter); + child_builders.push_back(child_converter->builder()); + } + builder_ = std::make_shared(type_, default_memory_pool(), + std::move(child_builders)); + return Status::OK(); + } + + // Append a JSON value that is either an array of N elements in order + // or an object mapping struct names to values (omitted struct members + // are mapped to null). + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (json_obj.IsArray()) { + auto size = json_obj.Size(); + auto expected_size = static_cast(type_->num_fields()); + if (size != expected_size) { + return Status::Invalid("Expected array of size ", expected_size, + ", got array of size ", size); + } + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); + } + return builder_->Append(); + } + if (json_obj.IsObject()) { + auto remaining = json_obj.MemberCount(); + auto num_children = type_->num_fields(); + for (int32_t i = 0; i < num_children; ++i) { + const auto& field = type_->field(i); + auto it = json_obj.FindMember(field->name()); + if (it != json_obj.MemberEnd()) { + --remaining; + RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value)); + } else { + RETURN_NOT_OK(child_converters_[i]->AppendNull()); + } + } + if (remaining > 0) { + rj::StringBuffer sb; + rj::Writer writer(sb); + json_obj.Accept(writer); + return Status::Invalid("Unexpected members in JSON object for type ", + type_->ToString(), " Object: ", sb.GetString()); + } + return builder_->Append(); + } + return JSONTypeError("array or object", json_obj.GetType()); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; + std::vector> child_converters_; +}; + +// ------------------------------------------------------------------------ +// Converter for union arrays 
+ +class UnionConverter final : public ConcreteConverter { + public: + explicit UnionConverter(const std::shared_ptr& type) { type_ = type; } + + Status Init() override { + auto union_type = checked_cast(type_.get()); + mode_ = union_type->mode(); + type_id_to_child_num_.clear(); + type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); + int child_i = 0; + for (auto type_id : union_type->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; + } + std::vector> child_builders; + for (const auto& field : type_->fields()) { + std::shared_ptr child_converter; + RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); + child_converters_.push_back(child_converter); + child_builders.push_back(child_converter->builder()); + } + if (mode_ == UnionMode::DENSE) { + builder_ = std::make_shared(default_memory_pool(), + std::move(child_builders), type_); + } else { + builder_ = std::make_shared(default_memory_pool(), + std::move(child_builders), type_); + } + return Status::OK(); + } + + // Append a JSON value that must be a 2-long array, containing the type_id + // and value of the UnionArray's slot. 
+ Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + if (json_obj.Size() != 2) { + return Status::Invalid("Expected [type_id, value] pair, got array of size ", + json_obj.Size()); + } + const auto& id_obj = json_obj[0]; + if (!id_obj.IsInt()) { + return JSONTypeError("int", id_obj.GetType()); + } + + auto id = static_cast(id_obj.GetInt()); + auto child_num = type_id_to_child_num_[id]; + if (child_num == -1) { + return Status::Invalid("type_id ", id, " not found in ", *type_); + } + + auto child_converter = child_converters_[child_num]; + if (mode_ == UnionMode::SPARSE) { + RETURN_NOT_OK(checked_cast(*builder_).Append(id)); + for (auto&& other_converter : child_converters_) { + if (other_converter != child_converter) { + RETURN_NOT_OK(other_converter->AppendNull()); + } + } + } else { + RETURN_NOT_OK(checked_cast(*builder_).Append(id)); + } + return child_converter->AppendValue(json_obj[1]); + } + + std::shared_ptr builder() override { return builder_; } + + private: + UnionMode::type mode_; + std::shared_ptr builder_; + std::vector> child_converters_; + std::vector type_id_to_child_num_; +}; + +// ------------------------------------------------------------------------ +// General conversion functions + +Status ConversionNotImplemented(const std::shared_ptr& type) { + return Status::NotImplemented("JSON conversion to ", type->ToString(), + " not implemented"); +} + +Status GetDictConverter(const std::shared_ptr& type, + std::shared_ptr* out) { + std::shared_ptr res; + + const auto value_type = checked_cast(*type).value_type(); + +#define SIMPLE_CONVERTER_CASE(ID, CLASS, TYPE) \ + case ID: \ + res = std::make_shared>>(type); \ + break; + +#define PARAM_CONVERTER_CASE(ID, CLASS, TYPE) \ + case ID: \ + res = std::make_shared>>(type); \ + break; + + switch (value_type->id()) { + PARAM_CONVERTER_CASE(Type::INT8, 
IntegerConverter, Int8Type) + PARAM_CONVERTER_CASE(Type::INT16, IntegerConverter, Int16Type) + PARAM_CONVERTER_CASE(Type::INT32, IntegerConverter, Int32Type) + PARAM_CONVERTER_CASE(Type::INT64, IntegerConverter, Int64Type) + PARAM_CONVERTER_CASE(Type::UINT8, IntegerConverter, UInt8Type) + PARAM_CONVERTER_CASE(Type::UINT16, IntegerConverter, UInt16Type) + PARAM_CONVERTER_CASE(Type::UINT32, IntegerConverter, UInt32Type) + PARAM_CONVERTER_CASE(Type::UINT64, IntegerConverter, UInt64Type) + PARAM_CONVERTER_CASE(Type::FLOAT, FloatConverter, FloatType) + PARAM_CONVERTER_CASE(Type::DOUBLE, FloatConverter, DoubleType) + PARAM_CONVERTER_CASE(Type::STRING, StringConverter, StringType) + PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) + PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) + PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) + PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType) + PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType) + SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, + FixedSizeBinaryType) + SIMPLE_CONVERTER_CASE(Type::DECIMAL32, Decimal32Converter, Decimal32Type) + SIMPLE_CONVERTER_CASE(Type::DECIMAL64, Decimal64Converter, Decimal64Type) + SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) + SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter, Decimal256Type) + default: + return ConversionNotImplemented(type); + } + +#undef SIMPLE_CONVERTER_CASE +#undef PARAM_CONVERTER_CASE + + RETURN_NOT_OK(res->Init()); + *out = res; + return Status::OK(); +} + +Status GetConverter(const std::shared_ptr& type, + std::shared_ptr* out) { + if (type->id() == Type::DICTIONARY) { + return GetDictConverter(type, out); + } + + std::shared_ptr res; + +#define SIMPLE_CONVERTER_CASE(ID, CLASS) \ + case ID: \ + res = std::make_shared(type); \ + break; + + switch (type->id()) { + SIMPLE_CONVERTER_CASE(Type::INT8, 
IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::UINT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) + SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DURATION, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::NA, NullConverter) + SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter) + SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) + SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) + SIMPLE_CONVERTER_CASE(Type::LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LIST_VIEW, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST_VIEW, + VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) + SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) + SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) + SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter) + SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) + SIMPLE_CONVERTER_CASE(Type::DECIMAL32, Decimal32Converter<>) + 
SIMPLE_CONVERTER_CASE(Type::DECIMAL64, Decimal64Converter<>) + SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) + SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) + SIMPLE_CONVERTER_CASE(Type::SPARSE_UNION, UnionConverter) + SIMPLE_CONVERTER_CASE(Type::DENSE_UNION, UnionConverter) + SIMPLE_CONVERTER_CASE(Type::INTERVAL_MONTHS, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::INTERVAL_DAY_TIME, DayTimeIntervalConverter) + SIMPLE_CONVERTER_CASE(Type::INTERVAL_MONTH_DAY_NANO, MonthDayNanoIntervalConverter) + default: + return ConversionNotImplemented(type); + } + +#undef SIMPLE_CONVERTER_CASE + + RETURN_NOT_OK(res->Init()); + *out = res; + return Status::OK(); +} + +} // namespace + +Result> ArrayFromJSONString(const std::shared_ptr& type, + std::string_view json_string) { + std::shared_ptr converter; + RETURN_NOT_OK(GetConverter(type, &converter)); + + rj::Document json_doc; + json_doc.Parse(json_string.data(), json_string.length()); + if (json_doc.HasParseError()) { + return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", + GetParseError_En(json_doc.GetParseError())); + } + + // The JSON document should be an array, append it + RETURN_NOT_OK(converter->AppendValues(json_doc)); + std::shared_ptr out; + RETURN_NOT_OK(converter->Finish(&out)); + return out; +} + +Result> ArrayFromJSONString(const std::shared_ptr& type, + const std::string& json_string) { + return ArrayFromJSONString(type, std::string_view(json_string)); +} + +Result> ArrayFromJSONString(const std::shared_ptr& type, + const char* json_string) { + return ArrayFromJSONString(type, std::string_view(json_string)); +} + +Result> ChunkedArrayFromJSONString( + const std::shared_ptr& type, const std::vector& json_strings) { + ArrayVector out_chunks; + out_chunks.reserve(json_strings.size()); + for (const std::string& chunk_json : json_strings) { + out_chunks.emplace_back(); + ARROW_ASSIGN_OR_RAISE(out_chunks.back(), ArrayFromJSONString(type, 
chunk_json)); + } + return std::make_shared(std::move(out_chunks), type); +} + +Result> DictArrayFromJSONString( + const std::shared_ptr& type, std::string_view indices_json, + std::string_view dictionary_json) { + if (type->id() != Type::DICTIONARY) { + return Status::TypeError("DictArrayFromJSON requires dictionary type, got ", *type); + } + + const auto& dictionary_type = checked_cast(*type); + + ARROW_ASSIGN_OR_RAISE(auto indices, + ArrayFromJSONString(dictionary_type.index_type(), indices_json)); + ARROW_ASSIGN_OR_RAISE(auto dictionary, ArrayFromJSONString(dictionary_type.value_type(), + dictionary_json)); + return DictionaryArray::FromArrays(type, std::move(indices), std::move(dictionary)); +} + +Result> ScalarFromJSONString( + const std::shared_ptr& type, std::string_view json_string) { + std::shared_ptr converter; + RETURN_NOT_OK(GetConverter(type, &converter)); + + rj::Document json_doc; + json_doc.Parse(json_string.data(), json_string.length()); + if (json_doc.HasParseError()) { + return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", + GetParseError_En(json_doc.GetParseError())); + } + + std::shared_ptr array; + RETURN_NOT_OK(converter->AppendValue(json_doc)); + RETURN_NOT_OK(converter->Finish(&array)); + DCHECK_EQ(array->length(), 1); + return array->GetScalar(0); +} + +Result> DictScalarFromJSONString( + const std::shared_ptr& type, std::string_view index_json, + std::string_view dictionary_json) { + if (type->id() != Type::DICTIONARY) { + return Status::TypeError("DictScalarFromJSONString requires dictionary type, got ", + *type); + } + + const auto& dictionary_type = checked_cast(*type); + + std::shared_ptr dictionary; + ARROW_ASSIGN_OR_RAISE(auto index, + ScalarFromJSONString(dictionary_type.index_type(), index_json)); + ARROW_ASSIGN_OR_RAISE( + dictionary, ArrayFromJSONString(dictionary_type.value_type(), dictionary_json)); + + return DictionaryScalar::Make(std::move(index), std::move(dictionary)); +} + +} // 
namespace json +} // namespace arrow diff --git a/cpp/src/arrow/json/from_string.h b/cpp/src/arrow/json/from_string.h new file mode 100644 index 00000000000..bd5ed3d46a3 --- /dev/null +++ b/cpp/src/arrow/json/from_string.h @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Implement a simple JSON representation format for arrays + +#pragma once + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace json { + +/// \defgroup array-from-json-string FromJSONString Helpers +/// +/// These helpers are intended to be used in examples, tests, or for quick +/// prototyping and are not intended to be used where performance matters. +/// +/// See the User Guide for +/// more information. 
+/// +/// @{ + +/// \brief Create an Array from a JSON string +/// +/// \code {.cpp} +/// Result> maybe_array = +/// ArrayFromJSONString(int64(), "[2, 3, null, 7, 11]"); +/// \endcode +ARROW_EXPORT +Result> ArrayFromJSONString(const std::shared_ptr&, + const std::string& json); + +/// \copydoc ArrayFromJSONString(const std::shared_ptr&, const std::string&) +ARROW_EXPORT +Result> ArrayFromJSONString(const std::shared_ptr&, + std::string_view json); + +/// \copydoc ArrayFromJSONString(const std::shared_ptr&, const std::string&) +ARROW_EXPORT +Result> ArrayFromJSONString(const std::shared_ptr&, + const char* json); + +/// \brief Create a ChunkedArray from a JSON string +/// +/// \code {.cpp} +/// Result> maybe_chunked_array = +/// ChunkedArrayFromJSONString(int64(), {R"([5, 10])", R"([null])", R"([16])"}); +/// \endcode +ARROW_EXPORT +Result> ChunkedArrayFromJSONString( + const std::shared_ptr& type, const std::vector& json_strings); + +/// \brief Create a DictionaryArray from a JSON string +/// +/// \code {.cpp} +/// Result> maybe_dict_array = +/// DictArrayFromJSONString(dictionary(int32(), utf8()), "[0, 1, 0, 2, 0, 3]", +/// R"(["k1", "k2", "k3", "k4"])"); +/// \endcode +ARROW_EXPORT +Result> DictArrayFromJSONString(const std::shared_ptr&, + std::string_view indices_json, + std::string_view dictionary_json); + +/// \brief Create a Scalar from a JSON string +/// \code {.cpp} +/// Result> maybe_scalar = +/// ScalarFromJSONString(float64(), "42", &scalar); +/// \endcode +ARROW_EXPORT +Result> ScalarFromJSONString(const std::shared_ptr&, + std::string_view json); + +/// \brief Create a DictionaryScalar from a JSON string +/// \code {.cpp} +/// Result> maybe_dict_scalar = +/// DictScalarFromJSONString(dictionary(int32(), utf8()), "3", R"(["k1", "k2", "k3", +/// "k4"])", &scalar); +/// \endcode +ARROW_EXPORT +Result> DictScalarFromJSONString( + const std::shared_ptr&, std::string_view index_json, + std::string_view dictionary_json); + +/// @} + +} // namespace json +} // 
namespace arrow diff --git a/cpp/src/arrow/json/from_string_test.cc b/cpp/src/arrow/json/from_string_test.cc new file mode 100644 index 00000000000..654450462e3 --- /dev/null +++ b/cpp/src/arrow/json/from_string_test.cc @@ -0,0 +1,1592 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/array/builder_decimal.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/array/builder_time.h" +#include "arrow/chunked_array.h" +#include "arrow/json/from_string.h" +#include "arrow/scalar.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bitmap_builders.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/float16.h" + +#if defined(_MSC_VER) +// "warning C4307: '+': integral constant overflow" +# pragma warning(disable : 4307) +#endif + +namespace arrow { + +using util::Float16; + +namespace json { + +using ::arrow::internal::BytesToBits; +using ::arrow::internal::checked_cast; +using ::arrow::internal::checked_pointer_cast; + +using ListAndListViewTypes = + ::testing::Types; + +// Avoid undefined behaviour on signed overflow +template +Signed SafeSignedAdd(Signed u, Signed v) { + using Unsigned = typename std::make_unsigned::type; + return static_cast(static_cast(u) + static_cast(v)); +} + +// Special case for 8-bit ints (must output their decimal value, not the +// corresponding ASCII character) +void JSONArrayInternal(std::ostream* ss, int8_t value) { + *ss << static_cast(value); +} + +void JSONArrayInternal(std::ostream* ss, uint8_t value) { + *ss << static_cast(value); +} + +template +void JSONArrayInternal(std::ostream* ss, Value&& value) { + *ss << value; +} + +template +void JSONArrayInternal(std::ostream* ss, Value&& value, Tail&&... tail) { + JSONArrayInternal(ss, std::forward(value)); + *ss << ", "; + JSONArrayInternal(ss, std::forward(tail)...); +} + +template +std::string JSONArray(Args&&... 
args) { + std::stringstream ss; + ss << "["; + JSONArrayInternal(&ss, std::forward(args)...); + ss << "]"; + return ss.str(); +} + +template +void AssertJSONArray(const std::shared_ptr& type, const std::string& json, + const std::vector& values) { + std::shared_ptr expected; + + ASSERT_OK_AND_ASSIGN(auto actual, ArrayFromJSONString(type, json)); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector(type, values, &expected); + AssertArraysEqual(*expected, *actual); +} + +template +void AssertJSONArray(const std::shared_ptr& type, const std::string& json, + const std::vector& is_valid, + const std::vector& values) { + std::shared_ptr expected; + + ASSERT_OK_AND_ASSIGN(auto actual, ArrayFromJSONString(type, json)); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector(type, is_valid, values, &expected); + AssertArraysEqual(*expected, *actual); +} + +void AssertJSONDictArray(const std::shared_ptr& index_type, + const std::shared_ptr& value_type, + const std::string& json, + const std::string& expected_indices_json, + const std::string& expected_values_json) { + auto type = dictionary(index_type, value_type); + + ASSERT_OK_AND_ASSIGN(auto expected_indices, + ArrayFromJSONString(index_type, expected_indices_json)); + ASSERT_OK_AND_ASSIGN(auto expected_values, + ArrayFromJSONString(value_type, expected_values_json)); + + ASSERT_OK_AND_ASSIGN(auto actual, ArrayFromJSONString(type, json)); + ASSERT_OK(actual->ValidateFull()); + + const auto& dict_array = checked_cast(*actual); + AssertArraysEqual(*expected_indices, *dict_array.indices()); + AssertArraysEqual(*expected_values, *dict_array.dictionary()); +} + +template +void AssertJSONScalar(const std::shared_ptr& type, const std::string& json, + const bool is_valid, const C_TYPE value) { + SCOPED_TRACE(json); + std::shared_ptr expected; + + ASSERT_OK_AND_ASSIGN(auto actual, ScalarFromJSONString(type, json)); + if (is_valid) { + ASSERT_OK_AND_ASSIGN(expected, MakeScalar(type, value)); + } else { + expected = 
MakeNullScalar(type); + } + AssertScalarsEqual(*expected, *actual, /*verbose=*/true); +} + +TEST(TestHelper, JSONArray) { + // Test the JSONArray helper func + std::string s = + JSONArray(123, -4.5, static_cast(-12), static_cast(34)); + ASSERT_EQ(s, "[123, -4.5, -12, 34]"); + s = JSONArray(9223372036854775807LL, 9223372036854775808ULL, -9223372036854775807LL - 1, + 18446744073709551615ULL); + ASSERT_EQ(s, + "[9223372036854775807, 9223372036854775808, -9223372036854775808, " + "18446744073709551615]"); +} + +TEST(TestHelper, SafeSignedAdd) { + ASSERT_EQ(0, SafeSignedAdd(-128, -128)); + ASSERT_EQ(1, SafeSignedAdd(-128, -127)); + ASSERT_EQ(-128, SafeSignedAdd(1, 127)); + ASSERT_EQ(-2147483648LL, SafeSignedAdd(1, 2147483647)); +} + +template +class TestIntegersFromString : public ::testing::Test { + public: + std::shared_ptr type() { return TypeTraits::type_singleton(); } +}; + +TYPED_TEST_SUITE_P(TestIntegersFromString); + +TYPED_TEST_P(TestIntegersFromString, Basics) { + using T = TypeParam; + using c_type = typename T::c_type; + + std::shared_ptr expected, actual; + auto type = this->type(); + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[4, 0, 5]", {4, 0, 5}); + AssertJSONArray(type, "[4, null, 5]", {true, false, true}, {4, 0, 5}); + + // Test limits + const auto min_val = std::numeric_limits::min(); + const auto max_val = std::numeric_limits::max(); + std::string json_string = JSONArray(0, 1, min_val); + AssertJSONArray(type, json_string, {0, 1, min_val}); + json_string = JSONArray(0, 1, max_val); + AssertJSONArray(type, json_string, {0, 1, max_val}); +} + +TYPED_TEST_P(TestIntegersFromString, Errors) { + std::shared_ptr array; + auto type = this->type(); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "0")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "{}")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0.0]")); + 
ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"0\"]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0]]")); +} + +TYPED_TEST_P(TestIntegersFromString, OutOfBounds) { + using T = TypeParam; + using c_type = typename T::c_type; + + std::shared_ptr array; + auto type = this->type(); + + if (type->id() == Type::UINT64) { + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[18446744073709551616]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[-1]")); + } else if (type->id() == Type::INT64) { + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[9223372036854775808]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[-9223372036854775809]")); + } else if (std::is_signed::value) { + const auto lower = SafeSignedAdd(std::numeric_limits::min(), -1); + const auto upper = SafeSignedAdd(std::numeric_limits::max(), +1); + auto json_string = JSONArray(lower); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, json_string)); + json_string = JSONArray(upper); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, json_string)); + } else { + const auto upper = static_cast(std::numeric_limits::max()) + 1; + auto json_string = JSONArray(upper); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, json_string)); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[-1]")); + } +} + +TYPED_TEST_P(TestIntegersFromString, Dictionary) { + std::shared_ptr array; + std::shared_ptr value_type = this->type(); + + if (value_type->id() == Type::HALF_FLOAT) { + // Unsupported, skip + return; + } + + AssertJSONDictArray(int8(), value_type, "[1, 2, 3, null, 3, 1]", + /*indices=*/"[0, 1, 2, null, 2, 0]", + /*values=*/"[1, 2, 3]"); +} + +REGISTER_TYPED_TEST_SUITE_P(TestIntegersFromString, Basics, Errors, OutOfBounds, + Dictionary); + +INSTANTIATE_TYPED_TEST_SUITE_P(TestInt8FromString, TestIntegersFromString, Int8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(TestInt16FromString, TestIntegersFromString, Int16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(TestInt32FromString, 
TestIntegersFromString, Int32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(TestInt64FromString, TestIntegersFromString, Int64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt8FromString, TestIntegersFromString, UInt8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt16FromString, TestIntegersFromString, UInt16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt32FromString, TestIntegersFromString, UInt32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt64FromString, TestIntegersFromString, UInt64Type); + +template +class TestStringsFromString : public ::testing::Test { + public: + std::shared_ptr type() const { + if constexpr (is_binary_view_like_type::value) { + return T::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } + } +}; + +TYPED_TEST_SUITE_P(TestStringsFromString); + +TYPED_TEST_P(TestStringsFromString, Basics) { + using T = TypeParam; + auto type = this->type(); + + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[\"\", \"foo\"]", {"", "foo"}); + AssertJSONArray(type, "[\"\", null]", {true, false}, {"", ""}); + // NUL character in string + std::string s = "some"; + s += '\x00'; + s += "char"; + AssertJSONArray(type, "[\"\", \"some\\u0000char\"]", {"", s}); + // UTF8 sequence in string + AssertJSONArray(type, "[\"\xc3\xa9\"]", {"\xc3\xa9"}); + + if (!T::is_utf8) { + // Arbitrary binary (non-UTF8) sequence in string + s = "\xff\x9f"; + AssertJSONArray(type, "[\"" + s + "\"]", {s}); + } + + // Bytes < 0x20 can be represented as JSON unicode escapes + s = '\x00'; + s += "\x1f"; + AssertJSONArray(type, "[\"\\u0000\\u001f\"]", {s}); +} + +TYPED_TEST_P(TestStringsFromString, Errors) { + auto type = this->type(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[]]")); +} + +TYPED_TEST_P(TestStringsFromString, Dictionary) { + auto value_type = this->type(); + + AssertJSONDictArray(int16(), value_type, 
R"(["foo", "bar", null, "bar", "foo"])", + /*indices=*/"[0, 1, null, 1, 0]", + /*values=*/R"(["foo", "bar"])"); +} + +REGISTER_TYPED_TEST_SUITE_P(TestStringsFromString, Basics, Errors, Dictionary); + +INSTANTIATE_TYPED_TEST_SUITE_P(TestStringFromString, TestStringsFromString, StringType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryFromString, TestStringsFromString, BinaryType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeStringFromString, TestStringsFromString, + LargeStringType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeBinaryFromString, TestStringsFromString, + LargeBinaryType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestStringViewFromString, TestStringsFromString, + StringViewType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryViewFromString, TestStringsFromString, + BinaryViewType); + +TEST(TestNullFromString, Basics) { + std::shared_ptr type = null(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[null, null]", {nullptr, nullptr}); +} + +TEST(TestNullFromString, Errors) { + std::shared_ptr type = null(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[NaN]")); +} + +TEST(TestBooleanFromString, Basics) { + std::shared_ptr type = boolean(); + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[false, true, false]", {false, true, false}); + AssertJSONArray(type, "[false, true, null]", {true, true, false}, + {false, true, false}); + // Supports integer literal casting + AssertJSONArray(type, "[0, 1, 0]", {false, true, false}); + AssertJSONArray(type, "[0, 1, null]", {true, true, false}, + {false, true, false}); +} + +TEST(TestBooleanFromString, Errors) { + std::shared_ptr type = boolean(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0.0]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"true\"]")); +} + 
+TEST(TestHalfFloatFromString, Basics) { + std::shared_ptr type = float16(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[1, -1, 2.5, -2.5, 3e4, 3e-4]", + {15360, 48128, 16640, 49408, 30547, 3306}); + AssertJSONArray(type, "[0.0, -0.0, Inf, -Inf, null]", + {true, true, true, true, false}, + {0, 32768, 31744, 64512, 0}); + + // Check NaN separately as AssertArraysEqual simply memcmp's array contents + // and NaNs can have many bit representations. + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[NaN]")); + ASSERT_OK(actual->ValidateFull()); + uint16_t value = checked_cast(*actual).Value(0); + ASSERT_TRUE(Float16::FromBits(value).is_nan()); +} + +TEST(TestHalfFloatFromString, Errors) { + std::shared_ptr type = float16(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[true]")); +} + +TEST(TestFloatFromString, Basics) { + std::shared_ptr type = float32(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0f, 2.5f, -3.0e4f}); + AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, + {-0.0f, INFINITY, -INFINITY, 0.0f}); + + // Check NaN separately as AssertArraysEqual simply memcmp's array contents + // and NaNs can have many bit representations. 
+ ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[NaN]")); + ASSERT_OK(actual->ValidateFull()); + float value = checked_cast(*actual).Value(0); + ASSERT_TRUE(std::isnan(value)); +} + +TEST(TestFloatFromString, Errors) { + std::shared_ptr type = float32(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[true]")); +} + +TEST(TestDoubleFromString, Basics) { + std::shared_ptr type = float64(); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[1, 2.5, -3e4]", {1.0, 2.5, -3.0e4}); + AssertJSONArray(type, "[-0.0, Inf, -Inf, null]", {true, true, true, false}, + {-0.0, INFINITY, -INFINITY, 0.0}); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[NaN]")); + ASSERT_OK(actual->ValidateFull()); + double value = checked_cast(*actual).Value(0); + ASSERT_TRUE(std::isnan(value)); +} + +TEST(TestDoubleFromString, Errors) { + std::shared_ptr type = float64(); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[true]")); +} + +TEST(TestTimestampFromString, Basics) { + // Timestamp type + auto type = timestamp(TimeUnit::SECOND); + AssertJSONArray( + type, R"(["1970-01-01","2000-02-29","3989-07-14","1900-02-28"])", + {0, 951782400, 63730281600LL, -2203977600LL}); + + type = timestamp(TimeUnit::NANO); + AssertJSONArray( + type, R"(["1970-01-01","2000-02-29","1900-02-28"])", + {0, 951782400000000000LL, -2203977600000000000LL}); +} + +TEST(TestDateFromString, Basics) { + auto type = date32(); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + type = date64(); + AssertJSONArray(type, R"([86400000, null, 172800000])", {true, false, true}, + {86400000, 0, 172800000}); +} + +TEST(TestTimeFromString, Basics) { + auto type = time32(TimeUnit::SECOND); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + type = time32(TimeUnit::MILLI); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + + 
type = time64(TimeUnit::MICRO); + AssertJSONArray(type, R"([1, null, 9999999999])", {true, false, true}, + {1, 0, 9999999999LL}); + type = time64(TimeUnit::NANO); + AssertJSONArray(type, R"([1, null, 9999999999999])", {true, false, true}, + {1, 0, 9999999999999LL}); +} + +TEST(TestDurationFromString, Basics) { + auto type = duration(TimeUnit::SECOND); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::MILLI); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::MICRO); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::NANO); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); +} + +TEST(TestMonthIntervalFromString, Basics) { + auto type = month_interval(); + AssertJSONArray(type, R"([123, -456, null])", {true, true, false}, + {123, -456, 0}); +} + +TEST(TestDayTimeIntervalFromString, Basics) { + auto type = day_time_interval(); + AssertJSONArray(type, R"([[1, -600], null])", {true, false}, + {{1, -600}, {}}); +} + +TEST(MonthDayNanoIntervalFromString, Basics) { + auto type = month_day_nano_interval(); + AssertJSONArray(type, R"([[1, -600, 5000], null])", + {true, false}, {{1, -600, 5000}, {}}); +} + +TEST(TestFixedSizeBinaryFromString, Basics) { + std::shared_ptr type = fixed_size_binary(3); + std::shared_ptr expected, actual; + + AssertJSONArray(type, "[]", {}); + AssertJSONArray(type, "[\"foo\", \"bar\"]", + {"foo", "bar"}); + AssertJSONArray(type, "[null, \"foo\"]", + {false, true}, {"", "foo"}); + // Arbitrary binary (non-UTF8) sequence in string + std::string s = "\xff\x9f\xcc"; + AssertJSONArray(type, "[\"" + s + "\"]", {s}); +} + 
+TEST(TestFixedSizeBinaryFromString, Errors) { + std::shared_ptr type = fixed_size_binary(3); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[]]")); + // Invalid length + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"\"]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"abcd\"]")); +} + +TEST(TestFixedSizeBinaryFromString, Dictionary) { + std::shared_ptr type = fixed_size_binary(3); + + AssertJSONDictArray(int8(), type, R"(["foo", "bar", "foo", null])", + /*indices=*/"[0, 1, 0, null]", + /*values=*/R"(["foo", "bar"])"); + + // Invalid length + std::shared_ptr array; + ASSERT_RAISES(Invalid, ArrayFromJSONString(dictionary(int8(), type), R"(["x"])")); +} + +template +void TestDecimalBasic(std::shared_ptr type) { + std::shared_ptr expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + { + DecimalBuilder builder(type); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[\"123.4567\", \"-78.9000\"]")); + ASSERT_OK(actual->ValidateFull()); + { + DecimalBuilder builder(type); + ASSERT_OK(builder.Append(DecimalValue(1234567))); + ASSERT_OK(builder.Append(DecimalValue(-789000))); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[\"123.4567\", null]")); + ASSERT_OK(actual->ValidateFull()); + { + DecimalBuilder builder(type); + ASSERT_OK(builder.Append(DecimalValue(1234567))); + ASSERT_OK(builder.AppendNull()); + ASSERT_OK(builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestDecimal32FromString, Basics) { + TestDecimalBasic(decimal32(8, 4)); +} + +TEST(TestDecimal64FromString, Basics) { + TestDecimalBasic(decimal64(10, 4)); +} + +TEST(TestDecimal128FromString, Basics) { + 
TestDecimalBasic(decimal128(10, 4)); +} + +TEST(TestDecimal256FromString, Basics) { + TestDecimalBasic(decimal256(10, 4)); +} + +TEST(TestDecimalFromString, Errors) { + for (std::shared_ptr type : + {decimal32(8, 4), decimal64(10, 4), decimal128(10, 4), decimal256(10, 4)}) { + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[12.3456]")); + // Bad scale + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"12.345\"]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"12.34560\"]")); + } +} + +TEST(TestDecimalFromString, Dictionary) { + for (std::shared_ptr type : + {decimal32(8, 2), decimal64(10, 2), decimal128(10, 2), decimal256(10, 2)}) { + AssertJSONDictArray(int32(), type, + R"(["123.45", "-78.90", "-78.90", null, "123.45"])", + /*indices=*/"[0, 1, 1, null, 0]", + /*values=*/R"(["123.45", "-78.90"])"); + } +} + +template +class TestVarLengthListFromString : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + using OffsetType = typename TypeTraits::OffsetType; + + static constexpr bool is_list_view_type = is_list_view(TypeClass::type_id); + + void TestIntegerList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr offsets, sizes, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + ArrayFromVector({}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + 
ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[[4, 5], [], [6]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 2, 2, 3}, &offsets); + ArrayFromVector({4, 5, 6}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 0, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[[], [null], [6, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + auto is_valid = std::vector{false, true, false}; + ArrayFromVector(is_valid, {0, 6, 0}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); + } + + void TestIntegerListErrors() { + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0.0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[9223372036854775808]]")); + } + + void TestNullList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(null()); + std::shared_ptr offsets, 
sizes, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + values = std::make_shared(0); + if constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[[], [null], [null, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + values = std::make_shared(3); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); + } + + void TestIntegerListList() { + auto pool = default_memory_pool(); + std::shared_ptr type = + std::make_shared(std::make_shared(uint8())); + std::shared_ptr offsets, sizes, values, nested, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, + ArrayFromJSONString(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({1, 2, 3}, &sizes); + 
ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } + ArrayFromVector({0, 2, 3}, &offsets); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } + ASSERT_EQ(actual->length(), 2); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN( + actual, ArrayFromJSONString(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 0, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } + ArrayFromVector({0, 0, 1, 4, 5}, &offsets); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 3, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } + ASSERT_EQ(actual->length(), 4); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[null, [null], [[null]]]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto& child_builder = checked_cast(*list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.AppendNull()); + ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.Append(true, 0)); + 
ASSERT_OK(list_builder.Finish(&expected)); + } + } +}; + +TYPED_TEST_SUITE(TestVarLengthListFromString, ListAndListViewTypes); + +TYPED_TEST(TestVarLengthListFromString, IntegerList) { this->TestIntegerList(); } + +TYPED_TEST(TestVarLengthListFromString, IntegerListErrors) { + this->TestIntegerListErrors(); +} + +TYPED_TEST(TestVarLengthListFromString, NullList) { this->TestNullList(); } + +TYPED_TEST(TestVarLengthListFromString, IntegerListList) { this->TestIntegerListList(); } + +TEST(TestMapFromString, IntegerToInteger) { + auto type = map(int16(), int16()); + std::shared_ptr expected, actual; + + const char* input = R"( +[ + [[0, 1], [1, 1], [2, 2], [3, 3], [4, 5], [5, 8]], + null, + [[0, null], [1, null], [2, 0], [3, 1], [4, null], [5, 2]], + [] + ] +)"; + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, input)); + + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder)); + auto& map_builder = checked_cast(*builder); + auto& key_builder = checked_cast(*map_builder.key_builder()); + auto& item_builder = checked_cast(*map_builder.item_builder()); + + ASSERT_OK(map_builder.Append()); + ASSERT_OK(key_builder.AppendValues({0, 1, 2, 3, 4, 5})); + ASSERT_OK(item_builder.AppendValues({1, 1, 2, 3, 5, 8})); + ASSERT_OK(map_builder.AppendNull()); + ASSERT_OK(map_builder.Append()); + ASSERT_OK(key_builder.AppendValues({0, 1, 2, 3, 4, 5})); + ASSERT_OK(item_builder.AppendValues({-1, -1, 0, 1, -1, 2}, {0, 0, 1, 1, 0, 1})); + ASSERT_OK(map_builder.Append()); + ASSERT_OK(map_builder.Finish(&expected)); + + ASSERT_ARRAYS_EQUAL(*actual, *expected); +} + +TEST(TestMapFromString, StringToInteger) { + auto type = map(utf8(), int32()); + const char* input = R"( +[ + [["joe", 0], ["mark", null]], + null, + [["cap", 8]], + [] + ] +)"; + ASSERT_OK_AND_ASSIGN(auto actual, ArrayFromJSONString(type, input)); + std::vector offsets = {0, 2, 2, 3, 3}; + ASSERT_OK_AND_ASSIGN(auto expected_keys, + ArrayFromJSONString(utf8(), R"(["joe", "mark", 
"cap"])")); + ASSERT_OK_AND_ASSIGN(auto expected_values, + ArrayFromJSONString(int32(), "[0, null, 8]")); + ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, + BytesToBits(std::vector({1, 0, 1, 1}))); + auto expected = + std::make_shared(type, 4, Buffer::Wrap(offsets), expected_keys, + expected_values, expected_null_bitmap, 1); + ASSERT_ARRAYS_EQUAL(*actual, *expected); +} + +TEST(TestMapFromString, Errors) { + auto type = map(int16(), int16()); + std::shared_ptr array; + + // list of pairs isn't an array + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0]")); + // pair isn't an array + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[null]]")); + // pair with length != 2 + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[[0]]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[[0, 1, 2]]]")); + // null key + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[[null, 0]]]")); + // key or value fails to convert + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[[0.0, 0]]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[[0, 0.0]]]")); +} + +TEST(TestMapFromString, IntegerMapToStringList) { + auto type = map(map(int16(), int16()), list(utf8())); + std::shared_ptr expected, actual; + + const char* input = R"( +[ + [ + [ + [], + [null, "empty"] + ], + [ + [[0, 1]], + null + ], + [ + [[0, 0], [1, 1]], + ["bootstrapping tautology?", "lispy", null, "i can see eternity"] + ] + ], + null + ] +)"; + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, input)); + + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder)); + auto& map_builder = checked_cast(*builder); + auto& key_builder = checked_cast(*map_builder.key_builder()); + auto& key_key_builder = checked_cast(*key_builder.key_builder()); + auto& key_item_builder = checked_cast(*key_builder.item_builder()); + auto& item_builder = checked_cast(*map_builder.item_builder()); + auto& 
item_value_builder = + checked_cast(*item_builder.value_builder()); + + ASSERT_OK(map_builder.Append()); + ASSERT_OK(key_builder.Append()); + ASSERT_OK(item_builder.Append()); + ASSERT_OK(item_value_builder.AppendNull()); + ASSERT_OK(item_value_builder.Append("empty")); + + ASSERT_OK(key_builder.Append()); + ASSERT_OK(item_builder.AppendNull()); + ASSERT_OK(key_key_builder.AppendValues({0})); + ASSERT_OK(key_item_builder.AppendValues({1})); + + ASSERT_OK(key_builder.Append()); + ASSERT_OK(item_builder.Append()); + ASSERT_OK(key_key_builder.AppendValues({0, 1})); + ASSERT_OK(key_item_builder.AppendValues({0, 1})); + ASSERT_OK(item_value_builder.Append("bootstrapping tautology?")); + ASSERT_OK(item_value_builder.Append("lispy")); + ASSERT_OK(item_value_builder.AppendNull()); + ASSERT_OK(item_value_builder.Append("i can see eternity")); + + ASSERT_OK(map_builder.AppendNull()); + + ASSERT_OK(map_builder.Finish(&expected)); + ASSERT_ARRAYS_EQUAL(*actual, *expected); +} + +TEST(TestFixedSizeListFromString, IntegerList) { + auto pool = default_memory_pool(); + auto type = fixed_size_list(int64(), 2); + std::shared_ptr values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({}, &values); + expected = std::make_shared(type, 0, values); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[[4, 5], [0, 0], [6, 7]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({4, 5, 0, 0, 6, 7}, &values); + expected = std::make_shared(type, 3, values); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, + ArrayFromJSONString(type, "[[null, null], [0, null], [6, null]]")); + ASSERT_OK(actual->ValidateFull()); + auto is_valid = std::vector{false, false, true, false, true, false}; + ArrayFromVector(is_valid, {0, 0, 0, 0, 6, 0}, &values); + expected = std::make_shared(type, 3, values); + AssertArraysEqual(*expected, 
*actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[null, [null, null], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto value_builder = checked_cast(list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(value_builder->AppendNull()); + ASSERT_OK(value_builder->AppendNull()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestFixedSizeListFromString, IntegerListErrors) { + std::shared_ptr type = fixed_size_list(int64(), 2); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0.0, 1.0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[9223372036854775808, 0]]")); +} + +TEST(TestFixedSizeListFromString, NullList) { + auto pool = default_memory_pool(); + std::shared_ptr type = fixed_size_list(null(), 2); + std::shared_ptr values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + values = std::make_shared(0); + expected = std::make_shared(type, 0, values); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN( + actual, ArrayFromJSONString(type, "[[null, null], [null, null], [null, null]]")); + ASSERT_OK(actual->ValidateFull()); + values = std::make_shared(6); + expected = std::make_shared(type, 3, values); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[null, [null, null], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto value_builder = 
checked_cast(list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(value_builder->AppendNull()); + ASSERT_OK(value_builder->AppendNull()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestFixedSizeListFromString, IntegerListList) { + auto pool = default_memory_pool(); + auto nested_type = fixed_size_list(uint8(), 2); + std::shared_ptr type = fixed_size_list(nested_type, 1); + std::shared_ptr values, nested, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, + ArrayFromJSONString(type, "[[[1, 4]], [[2, 5]], [[3, 6]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({1, 4, 2, 5, 3, 6}, &values); + nested = std::make_shared(nested_type, 3, values); + expected = std::make_shared(type, 3, nested); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[[[1, null]], [null], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto nested_builder = + checked_cast(list_builder.value_builder()); + auto value_builder = checked_cast(nested_builder->value_builder()); + + ASSERT_OK(list_builder.Append()); + ASSERT_OK(nested_builder->Append()); + ASSERT_OK(value_builder->Append(1)); + ASSERT_OK(value_builder->AppendNull()); + + ASSERT_OK(list_builder.Append()); + ASSERT_OK(nested_builder->AppendNull()); + + ASSERT_OK(list_builder.AppendNull()); + + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); +} + +TEST(TestStructFromString, SimpleStruct) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = struct_({field_a, field_b}); + std::shared_ptr a, b, expected, actual; + std::shared_ptr null_bitmap; + std::vector is_valid; + std::vector> children; + + // Trivial 
+ ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({}, &a); + ArrayFromVector({}, &b); + children.assign({a, b}); + expected = std::make_shared(type, 0, children); + AssertArraysEqual(*expected, *actual); + + // Non-empty + ArrayFromVector({5, 6}, &a); + ArrayFromVector({true, false}, &b); + children.assign({a, b}); + expected = std::make_shared(type, 2, children); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[[5, true], [6, false]]")); + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual); + ASSERT_OK_AND_ASSIGN( + actual, + ArrayFromJSONString(type, "[{\"a\": 5, \"b\": true}, {\"b\": false, \"a\": 6}]")); + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual); + + // With nulls + is_valid = {false, true, false, false}; + ArrayFromVector(is_valid, {0, 5, 6, 0}, &a); + is_valid = {false, false, true, false}; + ArrayFromVector(is_valid, {false, true, false, false}, &b); + children.assign({a, b}); + BitmapFromVector({false, true, true, true}, &null_bitmap); + expected = std::make_shared(type, 4, children, null_bitmap, 1); + + ASSERT_OK_AND_ASSIGN( + actual, + ArrayFromJSONString(type, "[null, [5, null], [null, false], [null, null]]")); + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual); + // When using object notation, null members can be omitted + ASSERT_OK_AND_ASSIGN( + actual, + ArrayFromJSONString(type, "[null, {\"a\": 5, \"b\": null}, {\"b\": false}, {}]")); + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestStructFromString, NestedStruct) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto field_c = field("c", float64()); + std::shared_ptr nested_type = struct_({field_a, field_b}); + auto field_nested = field("nested", nested_type); + std::shared_ptr type = struct_({field_nested, field_c}); + std::shared_ptr expected, actual; + 
std::shared_ptr null_bitmap; + std::vector is_valid; + std::vector> children(2); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSONString(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({}, &children[0]); + ArrayFromVector({}, &children[1]); + children[0] = std::make_shared(nested_type, 0, children); + ArrayFromVector({}, &children[1]); + expected = std::make_shared(type, 0, children); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN( + actual, ArrayFromJSONString(type, "[[[5, true], 1.5], [[6, false], -3e2]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({5, 6}, &children[0]); + ArrayFromVector({true, false}, &children[1]); + children[0] = std::make_shared(nested_type, 2, children); + ArrayFromVector({1.5, -300.0}, &children[1]); + expected = std::make_shared(type, 2, children); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN( + actual, ArrayFromJSONString(type, "[null, [[5, null], null], [null, -3e2]]")); + ASSERT_OK(actual->ValidateFull()); + is_valid = {false, true, false}; + ArrayFromVector(is_valid, {0, 5, 0}, &children[0]); + is_valid = {false, false, false}; + ArrayFromVector(is_valid, {false, false, false}, &children[1]); + BitmapFromVector({false, true, false}, &null_bitmap); + children[0] = std::make_shared(nested_type, 3, children, null_bitmap, 2); + is_valid = {false, false, true}; + ArrayFromVector(is_valid, {0.0, 0.0, -300.0}, &children[1]); + BitmapFromVector({false, true, true}, &null_bitmap); + expected = std::make_shared(type, 3, children, null_bitmap, 1); + AssertArraysEqual(*expected, *actual); +} + +TEST(TestStructFromString, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = struct_({field_a, field_b}); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[0, true]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0, true, 
1]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[true, 0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[{\"b\": 0, \"a\": true}]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[{\"c\": 0}]")); +} + +TEST(TestDenseUnionFromString, Basics) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + + auto type = dense_union({field_a, field_b}, {4, 8}); + ASSERT_OK_AND_ASSIGN( + auto array_parsed, + ArrayFromJSONString(type, + "[null, [4, 122], [8, true], [4, null], null, [8, false]]")); + auto array = checked_pointer_cast(array_parsed); + + ASSERT_OK_AND_ASSIGN(auto expected_types, + ArrayFromJSONString(int8(), "[4, 4, 8, 4, 4, 8]")); + ASSERT_OK_AND_ASSIGN(auto expected_offsets, + ArrayFromJSONString(int32(), "[0, 1, 0, 2, 3, 1]")); + ASSERT_OK_AND_ASSIGN(auto expected_a, + ArrayFromJSONString(int8(), "[null, 122, null, null]")); + ASSERT_OK_AND_ASSIGN(auto expected_b, ArrayFromJSONString(boolean(), "[true, false]")); + + ASSERT_OK_AND_ASSIGN( + auto expected, DenseUnionArray::Make(*expected_types, *expected_offsets, + {expected_a, expected_b}, {"a", "b"}, {4, 8})); + + ASSERT_ARRAYS_EQUAL(*expected, *array); + + // ensure that the array is as dense as we expect + ASSERT_TRUE(array->value_offsets()->Equals(*expected_offsets->data()->buffers[1])); + ASSERT_ARRAYS_EQUAL(*expected_a, *array->field(0)); + ASSERT_ARRAYS_EQUAL(*expected_b, *array->field(1)); +} + +TEST(TestSparseUnionFromString, Basics) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + + auto type = sparse_union({field_a, field_b}, {4, 8}); + ASSERT_OK_AND_ASSIGN( + auto array, + ArrayFromJSONString(type, "[[4, 122], [8, true], [4, null], null, [8, false]]")); + + ASSERT_OK_AND_ASSIGN(auto expected_types, + ArrayFromJSONString(int8(), "[4, 8, 4, 4, 8]")); + ASSERT_OK_AND_ASSIGN(auto expected_a, + ArrayFromJSONString(int8(), "[122, null, null, null, null]")); + ASSERT_OK_AND_ASSIGN(auto expected_b, + 
ArrayFromJSONString(boolean(), "[null, true, null, null, false]")); + + ASSERT_OK_AND_ASSIGN(auto expected, + SparseUnionArray::Make(*expected_types, {expected_a, expected_b}, + {"a", "b"}, {4, 8})); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnionFromString, ListOfUnion) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto union_type = dense_union({field_a, field_b}, {4, 8}); + auto list_type = list(union_type); + ASSERT_OK_AND_ASSIGN(auto parsed_array, + ArrayFromJSONString(list_type, + "[" + "[[4, 122], [8, true]]," + "[[4, null], null, [8, false]]" + "]")); + auto array = checked_pointer_cast(parsed_array); + + ASSERT_OK_AND_ASSIGN(auto expected_types, + ArrayFromJSONString(int8(), "[4, 8, 4, 4, 8]")); + ASSERT_OK_AND_ASSIGN(auto expected_offsets, + ArrayFromJSONString(int32(), "[0, 0, 1, 2, 1]")); + ASSERT_OK_AND_ASSIGN(auto expected_a, ArrayFromJSONString(int8(), "[122, null, null]")); + ASSERT_OK_AND_ASSIGN(auto expected_b, ArrayFromJSONString(boolean(), "[true, false]")); + + ASSERT_OK_AND_ASSIGN( + auto expected_values, + DenseUnionArray::Make(*expected_types, *expected_offsets, {expected_a, expected_b}, + {"a", "b"}, {4, 8})); + ASSERT_OK_AND_ASSIGN(auto expected_list_offsets, + ArrayFromJSONString(int32(), "[0, 2, 5]")); + ASSERT_OK_AND_ASSIGN(auto expected, + ListArray::FromArrays(*expected_list_offsets, *expected_values)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); + + // ensure that the array is as dense as we expect + auto array_values = checked_pointer_cast(array->values()); + ASSERT_TRUE(array_values->value_offsets()->Equals( + *checked_pointer_cast(expected_values)->value_offsets())); + ASSERT_ARRAYS_EQUAL(*expected_a, *array_values->field(0)); + ASSERT_ARRAYS_EQUAL(*expected_b, *array_values->field(1)); +} + +TEST(TestSparseUnionFromString, ListOfUnion) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + auto union_type = sparse_union({field_a, field_b}, {4, 8}); 
+ auto list_type = list(union_type); + ASSERT_OK_AND_ASSIGN(auto array, ArrayFromJSONString(list_type, + "[" + "[[4, 122], [8, true]]," + "[[4, null], null, [8, false]]" + "]")); + + ASSERT_OK_AND_ASSIGN(auto expected_types, + ArrayFromJSONString(int8(), "[4, 8, 4, 4, 8]")); + ASSERT_OK_AND_ASSIGN(auto expected_a, + ArrayFromJSONString(int8(), "[122, null, null, null, null]")); + ASSERT_OK_AND_ASSIGN(auto expected_b, + ArrayFromJSONString(boolean(), "[null, true, null, null, false]")); + + ASSERT_OK_AND_ASSIGN(auto expected_values, + SparseUnionArray::Make(*expected_types, {expected_a, expected_b}, + {"a", "b"}, {4, 8})); + ASSERT_OK_AND_ASSIGN(auto expected_list_offsets, + ArrayFromJSONString(int32(), "[0, 2, 5]")); + ASSERT_OK_AND_ASSIGN(auto expected, + ListArray::FromArrays(*expected_list_offsets, *expected_values)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnionFromString, UnionOfStructs) { + std::vector> fields = { + field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})), + field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), + field("foxtrot", list(int8()))})), + field("q", struct_({field("quebec", utf8())}))}; + auto type = dense_union(fields, {0, 23, 47}); + ASSERT_OK_AND_ASSIGN( + auto array_parsed, + ArrayFromJSONString(type, R"([[0, {"alpha": 0.0, "bravo": "charlie"}], + [23, {"whiskey": 99}], + [0, {"bravo": "mike"}], + null, + [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}] + ])")); + auto array = checked_pointer_cast(array_parsed); + + ASSERT_OK_AND_ASSIGN(auto expected_types, + ArrayFromJSONString(int8(), "[0, 23, 0, 0, 23]")); + ASSERT_OK_AND_ASSIGN(auto expected_offsets, + ArrayFromJSONString(int32(), "[0, 0, 1, 2, 1]")); + ASSERT_OK_AND_ASSIGN(auto expected_fields_0, ArrayFromJSONString(fields[0]->type(), R"([ + {"alpha": 0.0, "bravo": "charlie"}, + {"bravo": "mike"}, + null + ])")); + ASSERT_OK_AND_ASSIGN(auto expected_fields_1, ArrayFromJSONString(fields[1]->type(), R"([ + {"whiskey": 
99}, + {"tango": 8.25, "foxtrot": [0, 2, 3]} + ])")); + ASSERT_OK_AND_ASSIGN(auto expected_fields_2, + ArrayFromJSONString(fields[2]->type(), "[]")); + ArrayVector expected_fields = {expected_fields_0, expected_fields_1, expected_fields_2}; + + ASSERT_OK_AND_ASSIGN( + auto expected, + DenseUnionArray::Make(*expected_types, *expected_offsets, expected_fields, + {"ab", "wtf", "q"}, {0, 23, 47})); + + ASSERT_ARRAYS_EQUAL(*expected, *array); + + // ensure that the array is as dense as we expect + ASSERT_TRUE(array->value_offsets()->Equals(*expected_offsets->data()->buffers[1])); + for (int i = 0; i < type->num_fields(); ++i) { + ASSERT_ARRAYS_EQUAL(*checked_cast(*expected).field(i), + *array->field(i)); + } +} + +TEST(TestSparseUnionFromString, UnionOfStructs) { + std::vector> fields = { + field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})), + field("wtf", struct_({field("whiskey", int8()), field("tango", float64()), + field("foxtrot", list(int8()))})), + field("q", struct_({field("quebec", utf8())}))}; + auto type = sparse_union(fields, {0, 23, 47}); + ASSERT_OK_AND_ASSIGN(auto array, ArrayFromJSONString(type, R"([ + [0, {"alpha": 0.0, "bravo": "charlie"}], + [23, {"whiskey": 99}], + [0, {"bravo": "mike"}], + null, + [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}] + ])")); + + ASSERT_OK_AND_ASSIGN(auto expected_types, + ArrayFromJSONString(int8(), "[0, 23, 0, 0, 23]")); + ASSERT_OK_AND_ASSIGN(auto expected_fields_0, ArrayFromJSONString(fields[0]->type(), R"([ + {"alpha": 0.0, "bravo": "charlie"}, + null, + {"bravo": "mike"}, + null, + null + ])")); + ASSERT_OK_AND_ASSIGN(auto expected_fields_1, ArrayFromJSONString(fields[1]->type(), R"([ + null, + {"whiskey": 99}, + null, + null, + {"tango": 8.25, "foxtrot": [0, 2, 3]} + ])")); + ASSERT_OK_AND_ASSIGN( + auto expected_fields_2, + ArrayFromJSONString(fields[2]->type(), "[null, null, null, null, null]")) + ArrayVector expected_fields = {expected_fields_0, expected_fields_1, expected_fields_2}; + + 
ASSERT_OK_AND_ASSIGN(auto expected, + SparseUnionArray::Make(*expected_types, expected_fields, + {"ab", "wtf", "q"}, {0, 23, 47})); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDenseUnionFromString, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = dense_union({field_a, field_b}, {4, 8}); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"not a valid type_id\"]")); + ASSERT_RAISES(Invalid, + ArrayFromJSONString(type, "[[0, 99]]")); // 0 is not one of {4, 8} + ASSERT_RAISES(Invalid, + ArrayFromJSONString(type, "[[4, \"\"]]")); // "" is not a valid int8() + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"not a pair\"]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[8, true, 1]]")); +} + +TEST(TestSparseUnionFromString, Errors) { + auto field_a = field("a", int8()); + auto field_b = field("b", boolean()); + std::shared_ptr type = sparse_union({field_a, field_b}, {4, 8}); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"not a valid type_id\"]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0, 99]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[4, \"\"]]")); + + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[\"not a pair\"]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSONString(type, "[[8, true, 1]]")); +} + +TEST(TestNestedDictionaryFromString, ListOfDict) { + auto index_type = int8(); + auto value_type = utf8(); + auto dict_type = dictionary(index_type, value_type); + auto type = list(dict_type); + + std::shared_ptr array, expected, indices, values, dicts, offsets; + + ASSERT_OK_AND_ASSIGN( + array, ArrayFromJSONString(type, R"([["ab", "cd", null], null, ["cd", "cd"]])")); + ASSERT_OK(array->ValidateFull()); + + // Build expected array + ASSERT_OK_AND_ASSIGN(indices, 
ArrayFromJSONString(index_type, "[0, 1, null, 1, 1]")); + ASSERT_OK_AND_ASSIGN(values, ArrayFromJSONString(value_type, R"(["ab", "cd"])")); + ASSERT_OK_AND_ASSIGN(dicts, DictionaryArray::FromArrays(dict_type, indices, values)); + ASSERT_OK_AND_ASSIGN(offsets, ArrayFromJSONString(int32(), "[0, null, 3, 5]")); + ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *dicts)); + + AssertArraysEqual(*expected, *array, /*verbose=*/true); +} + +TEST(TestDictArrayFromJSONString, Basics) { + auto type = dictionary(int32(), utf8()); + auto array = + DictArrayFromJSON(type, "[null, 2, 1, 0]", R"(["whiskey", "tango", "foxtrot"])"); + + ASSERT_OK_AND_ASSIGN(auto expected_indices, + ArrayFromJSONString(int32(), "[null, 2, 1, 0]")); + ASSERT_OK_AND_ASSIGN(auto expected_dictionary, + ArrayFromJSONString(utf8(), R"(["whiskey", "tango", "foxtrot"])")); + + ASSERT_ARRAYS_EQUAL(DictionaryArray(type, expected_indices, expected_dictionary), + *array); +} + +TEST(TestDictArrayFromJSONString, Errors) { + auto type = dictionary(int32(), utf8()); + + ASSERT_RAISES(Invalid, + DictArrayFromJSONString(type, "[\"not a valid index\"]", "[\"\"]")); + ASSERT_RAISES(Invalid, DictArrayFromJSONString(type, "[0, 1]", + "[1]")); // dict value isn't string +} + +TEST(TestChunkedArrayFromJSONString, Basics) { + auto type = int32(); + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArrayFromJSONString(type, {})); + ASSERT_OK(chunked_array->ValidateFull()); + ASSERT_EQ(chunked_array->num_chunks(), 0); + AssertTypeEqual(type, chunked_array->type()); + + ASSERT_OK_AND_ASSIGN(auto chunked_array_two, + ChunkedArrayFromJSONString(type, {"[1, 2]", "[3, null, 4]"})); + ASSERT_OK(chunked_array_two->ValidateFull()); + ASSERT_EQ(chunked_array_two->num_chunks(), 2); + std::shared_ptr expected_chunk; + ASSERT_OK_AND_ASSIGN(expected_chunk, ArrayFromJSONString(type, "[1, 2]")); + AssertArraysEqual(*expected_chunk, *chunked_array_two->chunk(0), /*verbose=*/true); + ASSERT_OK_AND_ASSIGN(expected_chunk, 
ArrayFromJSONString(type, "[3, null, 4]")); + AssertArraysEqual(*expected_chunk, *chunked_array_two->chunk(1), /*verbose=*/true); +} + +TEST(TestScalarFromJSONString, Basics) { + // Sanity check for common types (not exhaustive) + AssertJSONScalar(int64(), "4", true, 4); + AssertJSONScalar(int64(), "null", false, 0); + AssertJSONScalar>(utf8(), R"("")", true, + Buffer::FromString("")); + AssertJSONScalar>(utf8(), R"("foo")", true, + Buffer::FromString("foo")); + AssertJSONScalar>(utf8(), R"(null)", false, + Buffer::FromString("")); + AssertJSONScalar(null(), "null", false, nullptr); + AssertJSONScalar(boolean(), "true", true, true); + AssertJSONScalar(boolean(), "false", true, false); + AssertJSONScalar(boolean(), "null", false, false); + AssertJSONScalar(boolean(), "0", true, false); + AssertJSONScalar(boolean(), "1", true, true); + AssertJSONScalar(float64(), "1.0", true, 1.0); + AssertJSONScalar(float64(), "-0.0", true, -0.0); + ASSERT_OK_AND_ASSIGN(auto nan_scalar, ScalarFromJSONString(float64(), "NaN")); + ASSERT_TRUE(std::isnan(checked_cast(*nan_scalar).value)); + ASSERT_OK_AND_ASSIGN(auto inf_scalar, ScalarFromJSONString(float64(), "Inf")); + ASSERT_TRUE(std::isinf(checked_cast(*inf_scalar).value)); +} + +TEST(TestScalarFromJSONString, Errors) { + ASSERT_RAISES(Invalid, ScalarFromJSONString(int64(), "[0]")); + ASSERT_RAISES(Invalid, ScalarFromJSONString(int64(), "[9223372036854775808]")); + ASSERT_RAISES(Invalid, ScalarFromJSONString(int64(), "[-9223372036854775809]")); + ASSERT_RAISES(Invalid, ScalarFromJSONString(uint64(), "[18446744073709551616]")); + ASSERT_RAISES(Invalid, ScalarFromJSONString(uint64(), "[-1]")); + ASSERT_RAISES(Invalid, ScalarFromJSONString(binary(), "0")); + ASSERT_RAISES(Invalid, ScalarFromJSONString(binary(), "[]")); + ASSERT_RAISES(Invalid, ScalarFromJSONString(boolean(), "0.0")); + ASSERT_RAISES(Invalid, ScalarFromJSONString(boolean(), "\"true\"")); +} + +TEST(TestDictScalarFromJSONString, Basics) { + auto type = 
dictionary(int32(), utf8()); + auto dict = R"(["whiskey", "tango", "foxtrot"])"; + ASSERT_OK_AND_ASSIGN(auto expected_dictionary, ArrayFromJSONString(utf8(), dict)); + + for (auto index : {"null", "2", "1", "0"}) { + auto scalar = DictScalarFromJSON(type, index, dict); + auto expected_index = ScalarFromJSON(int32(), index); + AssertScalarsEqual(*DictionaryScalar::Make(expected_index, expected_dictionary), + *scalar, /*verbose=*/true); + ASSERT_OK(scalar->ValidateFull()); + } +} + +TEST(TestDictScalarFromJSONString, Errors) { + auto type = dictionary(int32(), utf8()); + + ASSERT_RAISES(Invalid, + DictScalarFromJSONString(type, "\"not a valid index\"", "[\"\"]")); + ASSERT_RAISES(Invalid, + DictScalarFromJSONString(type, "0", "[1]")); // dict value isn't string +} + +} // namespace json +} // namespace arrow diff --git a/cpp/src/arrow/json/meson.build b/cpp/src/arrow/json/meson.build new file mode 100644 index 00000000000..bc1567df9ec --- /dev/null +++ b/cpp/src/arrow/json/meson.build @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +exc = executable( + 'arrow-json-test', + sources: [ + 'chunked_builder_test.cc', + 'chunker_test.cc', + 'converter_test.cc', + 'from_string_test.cc', + 'parser_test.cc', + 'reader_test.cc', + ], + dependencies: [arrow_test_dep, rapidjson_dep], +) +test('arrow-json-test', exc) + +exc = executable( + 'arrow-json-parser-benchmark', + sources: ['parser_benchmark.cc'], + dependencies: [arrow_benchmark_dep, rapidjson_dep], +) +benchmark('arrow-json-parser-benchmark', exc) + +install_headers( + [ + 'api.h', + 'chunked_builder.h', + 'chunker.h', + 'converter.h', + 'from_string.h', + 'object_parser.h', + 'object_writer.h', + 'options.h', + 'parser.h', + 'rapidjson_defs.h', + 'reader.h', + 'test_common.h', + 'type_fwd.h', + ], + subdir: 'arrow/json', +) + +arrow_json_dep = declare_dependency( + include_directories: include_directories('.'), + dependencies: arrow_dep, +) +meson.override_dependency('arrow-json', arrow_json_dep) + +pkg.generate( + filebase: 'arrow-json', + name: 'Apache Arrow JSON', + description: 'JSON reader module for Apache Arrow', + requires: ['arrow'], +) diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index dbc32489937..53f856d8012 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -35,10 +35,10 @@ #include "arrow/array/builder_binary.h" #include "arrow/buffer_builder.h" #include "arrow/type.h" -#include "arrow/util/bitset_stack.h" +#include "arrow/util/bitset_stack_internal.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging_internal.h" -#include "arrow/util/trie.h" +#include "arrow/util/trie_internal.h" #include "arrow/visit_type_inline.h" namespace arrow { diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index e9c338eac66..5590ba41c91 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -69,12 +69,14 @@ arrow_components = { 'compute/kernels/scalar_cast_numeric.cc', 'compute/kernels/scalar_cast_string.cc', 
'compute/kernels/scalar_cast_temporal.cc', + 'compute/kernels/temporal_internal.cc', 'compute/kernels/util_internal.cc', 'compute/kernels/vector_hash.cc', 'compute/kernels/vector_selection.cc', 'compute/kernels/vector_selection_filter_internal.cc', 'compute/kernels/vector_selection_internal.cc', 'compute/kernels/vector_selection_take_internal.cc', + 'compute/kernels/vector_swizzle.cc', ], }, 'arrow_io': { @@ -94,60 +96,6 @@ arrow_components = { 'include_dirs': [include_directories('../../thirdparty/hadoop/include')], 'dependencies': [dl_dep], }, - 'arrow_util': { - 'sources': [ - 'util/align_util.cc', - 'util/async_util.cc', - 'util/atfork_internal.cc', - 'util/basic_decimal.cc', - 'util/bit_block_counter.cc', - 'util/bit_run_reader.cc', - 'util/bit_util.cc', - 'util/bitmap.cc', - 'util/bitmap_builders.cc', - 'util/bitmap_ops.cc', - 'util/bpacking.cc', - 'util/byte_size.cc', - 'util/cancel.cc', - 'util/compression.cc', - 'util/counting_semaphore.cc', - 'util/cpu_info.cc', - 'util/crc32.cc', - 'util/debug.cc', - 'util/decimal.cc', - 'util/delimiting.cc', - 'util/dict_util.cc', - 'util/fixed_width_internal.cc', - 'util/float16.cc', - 'util/formatting.cc', - 'util/future.cc', - 'util/hashing.cc', - 'util/int_util.cc', - 'util/io_util.cc', - 'util/list_util.cc', - 'util/logger.cc', - 'util/logging.cc', - 'util/key_value_metadata.cc', - 'util/math_internal.cc', - 'util/memory.cc', - 'util/mutex.cc', - 'util/ree_util.cc', - 'util/string.cc', - 'util/string_builder.cc', - 'util/task_group.cc', - 'util/tdigest.cc', - 'util/thread_pool.cc', - 'util/time.cc', - 'util/tracing.cc', - 'util/trie.cc', - 'util/union_util.cc', - 'util/unreachable.cc', - 'util/uri.cc', - 'util/utf8.cc', - 'util/value_parsing.cc', - ], - 'dependencies': [threads_dep], - }, 'memory_pool': {'sources': ['memory_pool.cc']}, 'vendored': { 'sources': [ @@ -215,6 +163,105 @@ arrow_components = { }, } +arrow_util_srcs = [ + 'util/align_util.cc', + 'util/async_util.cc', + 'util/atfork_internal.cc', + 
'util/basic_decimal.cc', + 'util/bit_block_counter.cc', + 'util/bit_run_reader.cc', + 'util/bit_util.cc', + 'util/bitmap.cc', + 'util/bitmap_builders.cc', + 'util/bitmap_ops.cc', + 'util/bpacking.cc', + 'util/byte_size.cc', + 'util/byte_stream_split_internal.cc', + 'util/cancel.cc', + 'util/compression.cc', + 'util/counting_semaphore.cc', + 'util/cpu_info.cc', + 'util/crc32.cc', + 'util/debug.cc', + 'util/decimal.cc', + 'util/delimiting.cc', + 'util/dict_util.cc', + 'util/fixed_width_internal.cc', + 'util/float16.cc', + 'util/formatting.cc', + 'util/future.cc', + 'util/hashing.cc', + 'util/int_util.cc', + 'util/io_util.cc', + 'util/list_util.cc', + 'util/logger.cc', + 'util/logging.cc', + 'util/key_value_metadata.cc', + 'util/math_internal.cc', + 'util/memory.cc', + 'util/mutex.cc', + 'util/ree_util.cc', + 'util/secure_string.cc', + 'util/string.cc', + 'util/string_util.cc', + 'util/task_group.cc', + 'util/tdigest.cc', + 'util/thread_pool.cc', + 'util/time.cc', + 'util/tracing.cc', + 'util/trie.cc', + 'util/union_util.cc', + 'util/unreachable.cc', + 'util/uri.cc', + 'util/utf8.cc', + 'util/value_parsing.cc', +] + +arrow_util_deps = [threads_dep] + +if needs_brotli + arrow_util_srcs += ['util/compression_brotli.cc'] + arrow_util_deps += [dependency('libbrotlidec'), dependency('libbrotlienc')] +endif + +if needs_bz2 + arrow_util_srcs += ['util/compression_bz2.cc'] + bzip2 = cpp_compiler.find_library( + 'bz2', + has_headers: ['bzlib.h'], + required: false, + ) + if bzip2.found() + arrow_util_deps += bzip2 + else + arrow_util_deps += dependency('bzip2') + endif +endif + +if needs_lz4 + arrow_util_srcs += ['util/compression_lz4.cc'] + arrow_util_deps += dependency('liblz4') +endif + +if needs_snappy + arrow_util_srcs += ['util/compression_snappy.cc'] + arrow_util_deps += dependency('snappy', 'Snappy') +endif + +if needs_zlib + arrow_util_srcs += ['util/compression_zlib.cc'] + arrow_util_deps += dependency('zlib') +endif + +if needs_zstd + arrow_util_srcs += 
['util/compression_zstd.cc'] + arrow_util_deps += dependency('libzstd') +endif + +arrow_components += { + 'arrow_util': {'sources': arrow_util_srcs, 'dependencies': arrow_util_deps}, +} + arrow_testing_srcs = [ 'io/test_common.cc', 'ipc/test_common.cc', @@ -259,7 +306,10 @@ if needs_csv endif if needs_json or needs_integration - rapidjson_dep = dependency('rapidjson', include_type: 'system') + rapidjson_dep = dependency( + 'RapidJSON', + fallback: ['rapidjson', 'rapidjson_dep'], + ) else rapidjson_dep = disabler() endif @@ -280,30 +330,81 @@ if needs_filesystem if needs_azure arrow_filesystem_srcs += ['filesystem/azurefs.cc'] - cmake = import('cmake') - azure_opt = cmake.subproject_options() - azure_opt.add_cmake_defines( - {'BUILD_PERFORMANCE_TESTS': 'FALSE'}, - {'BUILD_SAMPLES': 'FALSE'}, - {'BUILD_TESTING': 'FALSE'}, - {'BUILD_WINDOWS_UWP': 'TRUE'}, - {'CMAKE_UNITY_BUILD': 'FALSE'}, - {'DISABLE_AZURE_CORE_OPENTELEMETRY': 'TRUE'}, - {'ENV{AZURE_SDK_DISABLE_AUTO_VCPKG}': 'TRUE'}, - {'WARNINGS_AS_ERRORS': 'FALSE'}, + + azure_core_dep = dependency( + 'azure-core-cpp', + allow_fallback: false, + modules: ['Azure::azure-core'], + required: false, ) - azure_opt.append_compile_args('cpp', '-fPIC') - azure_proj = cmake.subproject('azure', options: azure_opt) - - azure_dep = declare_dependency( - dependencies: [ - azure_proj.dependency('azure-core'), - azure_proj.dependency('azure-identity'), - azure_proj.dependency('azure-storage-blobs'), - azure_proj.dependency('azure-storage-common'), - azure_proj.dependency('azure-storage-files-datalake'), - ], + azure_identity_dep = dependency( + 'azure-identity-cpp', + allow_fallback: false, + modules: ['Azure::azure-identity'], + required: false, + ) + azure_storage_blobs_dep = dependency( + 'azure-storage-blobs-cpp', + allow_fallback: false, + modules: ['Azure::azure-storage-blobs'], + required: false, + ) + azure_storage_common_dep = dependency( + 'azure-storage-common-cpp', + allow_fallback: false, + modules: 
['Azure::azure-storage-common'], + required: false, + ) + azure_storage_files_datalake_dep = dependency( + 'azure-storage-files-datalake-cpp', + allow_fallback: false, + modules: ['Azure::azure-storage-files-datalake'], + required: false, + ) + + if ( + azure_core_dep.found() + and azure_identity_dep.found() + and azure_storage_blobs_dep.found() + and azure_storage_common_dep.found() + and azure_storage_files_datalake_dep.found() ) + azure_dep = declare_dependency( + dependencies: [ + azure_core_dep, + azure_identity_dep, + azure_storage_blobs_dep, + azure_storage_common_dep, + azure_storage_files_datalake_dep, + ], + ) + else + cmake = import('cmake') + azure_opt = cmake.subproject_options() + azure_opt.add_cmake_defines( + {'BUILD_PERFORMANCE_TESTS': 'FALSE'}, + {'BUILD_SAMPLES': 'FALSE'}, + {'BUILD_TESTING': 'FALSE'}, + {'BUILD_WINDOWS_UWP': 'TRUE'}, + {'CMAKE_UNITY_BUILD': 'FALSE'}, + {'DISABLE_AZURE_CORE_OPENTELEMETRY': 'TRUE'}, + {'ENV{AZURE_SDK_DISABLE_AUTO_VCPKG}': 'TRUE'}, + {'WARNINGS_AS_ERRORS': 'FALSE'}, + ) + azure_opt.append_compile_args('cpp', '-fPIC') + azure_proj = cmake.subproject('azure', options: azure_opt) + + azure_dep = declare_dependency( + dependencies: [ + azure_proj.dependency('azure-core'), + azure_proj.dependency('azure-identity'), + azure_proj.dependency('azure-storage-blobs'), + azure_proj.dependency('azure-storage-common'), + azure_proj.dependency('azure-storage-files-datalake'), + ], + ) + endif + arrow_filesystem_deps += [azure_dep] endif @@ -340,14 +441,12 @@ if needs_ipc 'ipc/writer.cc', ] - flatbuffers_dep = dependency('flatbuffers') + flatbuffers_incdir = include_directories( + '../../thirdparty/flatbuffers/include', + ) + flatbuffers_dep = declare_dependency(include_directories: flatbuffers_incdir) arrow_ipc_deps = [flatbuffers_dep] - if needs_json - arrow_ipc_srcs += 'ipc/json_simple.cc' - arrow_ipc_deps += rapidjson_dep - endif - arrow_components += { 'arrow_ipc': {'sources': arrow_ipc_srcs, 'dependencies': arrow_ipc_deps}, 
} @@ -363,6 +462,7 @@ if needs_json 'json/chunked_builder.cc', 'json/chunker.cc', 'json/converter.cc', + 'json/from_string.cc', 'json/object_parser.cc', 'json/object_writer.cc', 'json/parser.cc', @@ -389,12 +489,82 @@ arrow_lib = library( include_directories: arrow_includes, dependencies: arrow_deps, install: true, + gnu_symbol_visibility: 'inlineshidden', + cpp_shared_args: ['-DARROW_EXPORTING'], ) arrow_dep = declare_dependency( include_directories: [include_dir], link_with: arrow_lib, ) +meson.override_dependency('arrow', arrow_dep) + +if needs_compute + arrow_compute_lib_sources = [ + 'compute/initialize.cc', + 'compute/kernels/aggregate_basic.cc', + 'compute/kernels/aggregate_mode.cc', + 'compute/kernels/aggregate_pivot.cc', + 'compute/kernels/aggregate_quantile.cc', + 'compute/kernels/aggregate_tdigest.cc', + 'compute/kernels/aggregate_var_std.cc', + 'compute/kernels/hash_aggregate.cc', + 'compute/kernels/hash_aggregate_numeric.cc', + 'compute/kernels/hash_aggregate_pivot.cc', + 'compute/kernels/pivot_internal.cc', + 'compute/kernels/scalar_arithmetic.cc', + 'compute/kernels/scalar_boolean.cc', + 'compute/kernels/scalar_compare.cc', + 'compute/kernels/scalar_if_else.cc', + 'compute/kernels/scalar_nested.cc', + 'compute/kernels/scalar_random.cc', + 'compute/kernels/scalar_round.cc', + 'compute/kernels/scalar_set_lookup.cc', + 'compute/kernels/scalar_string_ascii.cc', + 'compute/kernels/scalar_string_utf8.cc', + 'compute/kernels/scalar_temporal_binary.cc', + 'compute/kernels/scalar_temporal_unary.cc', + 'compute/kernels/scalar_validity.cc', + 'compute/kernels/util_internal.cc', + 'compute/kernels/vector_array_sort.cc', + 'compute/kernels/vector_cumulative_ops.cc', + 'compute/kernels/vector_nested.cc', + 'compute/kernels/vector_pairwise.cc', + 'compute/kernels/vector_rank.cc', + 'compute/kernels/vector_replace.cc', + 'compute/kernels/vector_run_end_encode.cc', + 'compute/kernels/vector_select_k.cc', + 'compute/kernels/vector_sort.cc', + 
'compute/kernels/vector_statistics.cc', + 'compute/key_hash_internal.cc', + 'compute/key_map_internal.cc', + 'compute/light_array_internal.cc', + 'compute/row/encode_internal.cc', + 'compute/row/compare_internal.cc', + 'compute/row/grouper.cc', + 'compute/row/row_encoder_internal.cc', + 'compute/row/row_internal.cc', + 'compute/util.cc', + 'compute/util_internal.cc', + ] + + arrow_compute_lib = library( + 'arrow-compute', + sources: arrow_compute_lib_sources, + dependencies: arrow_dep, + install: true, + gnu_symbol_visibility: 'inlineshidden', + cpp_shared_args: ['-DARROW_COMPUTE_EXPORTING'], + ) + arrow_compute_dep = declare_dependency( + link_with: arrow_compute_lib, + include_directories: include_dir, + dependencies: arrow_dep, + ) + meson.override_dependency('arrow-compute', arrow_compute_dep) +else + arrow_compute_dep = disabler() +endif # Meson does not allow you to glob for headers to install. See also # https://mesonbuild.com/FAQ.html#why-cant-i-specify-target-files-with-a-wildcard @@ -468,19 +638,41 @@ else gmock_dep = disabler() endif -arrow_test_lib = static_library( - 'arrow_testing', - sources: arrow_testing_srcs, - dependencies: [arrow_dep, filesystem_dep, gmock_dep, gtest_main_dep], -) +if needs_testing + arrow_testing_lib = static_library( + 'arrow_testing', + sources: arrow_testing_srcs, + dependencies: [arrow_dep, filesystem_dep, gmock_dep, gtest_dep], + ) + + arrow_testing_dep = declare_dependency(link_with: [arrow_testing_lib]) + meson.override_dependency('arrow-testing', arrow_testing_dep) +else + arrow_testing_dep = disabler() +endif if needs_tests arrow_test_dep = declare_dependency( - link_with: [arrow_test_lib], - dependencies: [arrow_dep, filesystem_dep, gmock_dep, gtest_main_dep], + dependencies: [ + arrow_dep, + arrow_testing_dep, + filesystem_dep, + gmock_dep, + gtest_main_dep, + ], + ) + arrow_test_dep_no_main = declare_dependency( + dependencies: [ + arrow_dep, + arrow_testing_dep, + filesystem_dep, + gmock_dep, + gtest_dep, + ], ) 
else arrow_test_dep = disabler() + arrow_test_dep_no_main = disabler() endif arrow_tests = { @@ -544,13 +736,18 @@ endforeach if needs_benchmarks benchmark_main_dep = dependency( - 'benchmark-main', + 'benchmark_main', + fallback: ['google-benchmark', 'google_benchmark_main_dep'], default_options: {'tests': 'disabled'}, ) arrow_benchmark_dep = declare_dependency( - link_with: [arrow_test_lib], - dependencies: [arrow_dep, benchmark_main_dep, gtest_dep], + dependencies: [ + arrow_dep, + arrow_testing_dep, + benchmark_main_dep, + gtest_dep, + ], ) else arrow_benchmark_dep = disabler() @@ -607,10 +804,14 @@ pkg.generate( subdir('testing') +subdir('array') +subdir('compute') subdir('c') +subdir('extension') subdir('io') subdir('tensor') subdir('util') +subdir('vendored') gflags_dep = dependency('gflags', include_type: 'system') if needs_integration or needs_tests @@ -621,6 +822,26 @@ if needs_csv subdir('csv') endif +if needs_acero + subdir('acero') +endif + if needs_filesystem subdir('filesystem') endif + +if needs_flight + subdir('flight') +endif + +if needs_json + subdir('json') +endif + +if needs_ipc + subdir('ipc') +endif + +if get_option('tensorflow').enabled() + subdir('adapters/tensorflow') +endif diff --git a/cpp/src/arrow/pch.h b/cpp/src/arrow/pch.h deleted file mode 100644 index 31da37b824b..00000000000 --- a/cpp/src/arrow/pch.h +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Often-used headers, for precompiling. -// If updating this header, please make sure you check compilation speed -// before checking in. Adding headers which are not used extremely often -// may incur a slowdown, since it makes the precompiled header heavier to load. - -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/record_batch.h" -#include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index c5905d0c8c5..72344992859 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -59,7 +59,9 @@ class PrettyPrinter { : options_(options), indent_(options.indent), sink_(sink) {} inline void Write(std::string_view data); + inline void Write(std::string_view data, int max_chars); inline void WriteIndented(std::string_view data); + inline void WriteIndented(std::string_view data, int max_chars); inline void Newline(); inline void Indent(); inline void IndentAfterNewline(); @@ -104,11 +106,26 @@ void PrettyPrinter::CloseArray(const Array& array) { (*sink_) << options_.array_delimiters.close; } -void PrettyPrinter::Write(std::string_view data) { (*sink_) << data; } +void PrettyPrinter::Write(std::string_view data) { + Write(data, options_.element_size_limit); +} + +void PrettyPrinter::Write(std::string_view data, int max_chars) { + (*sink_) << data.substr(0, max_chars); + if (data.size() > static_cast(max_chars)) { + (*sink_) << " (... 
" << data.size() - static_cast(max_chars) + << " chars omitted)"; + } +} void PrettyPrinter::WriteIndented(std::string_view data) { Indent(); - Write(data); + Write(data, options_.element_size_limit); +} + +void PrettyPrinter::WriteIndented(std::string_view data, int max_chars) { + Indent(); + Write(data, max_chars); } void PrettyPrinter::Newline() { @@ -176,7 +193,7 @@ class ArrayPrinter : public PrettyPrinter { template Status WritePrimitiveValues(const ArrayType& array, Formatter* formatter) { - auto appender = [&](std::string_view v) { (*sink_) << v; }; + auto appender = [&](std::string_view v) { Write(v); }; auto format_func = [&](int64_t i) { (*formatter)(array.GetView(i), appender); return Status::OK(); @@ -222,19 +239,15 @@ class ArrayPrinter : public PrettyPrinter { return WritePrimitiveValues(array); } - Status WriteDataValues(const HalfFloatArray& array) { - // XXX do not know how to format half floats yet - StringFormatter formatter{array.type().get()}; - return WritePrimitiveValues(array, &formatter); - } - template enable_if_has_string_view WriteDataValues(const ArrayType& array) { return WriteValues(array, [&](int64_t i) { if constexpr (T::is_utf8) { - (*sink_) << "\"" << array.GetView(i) << "\""; + (*sink_) << "\""; + this->Write(array.GetView(i), options_.element_size_limit - 2); + (*sink_) << "\""; } else { - (*sink_) << HexEncode(array.GetView(i)); + this->Write(HexEncode(array.GetView(i))); } return Status::OK(); }); @@ -243,7 +256,7 @@ class ArrayPrinter : public PrettyPrinter { template enable_if_decimal WriteDataValues(const ArrayType& array) { return WriteValues(array, [&](int64_t i) { - (*sink_) << array.FormatValue(i); + this->Write(array.FormatValue(i)); return Status::OK(); }); } diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index ad68726716c..7e5eca4300b 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -58,14 +58,15 @@ struct ARROW_EXPORT PrettyPrintOptions { 
PrettyPrintOptions(int indent, // NOLINT runtime/explicit int window = 10, int indent_size = 2, std::string null_rep = "null", bool skip_new_lines = false, bool truncate_metadata = true, - int container_window = 2) + int container_window = 2, int element_size_limit = 100) : indent(indent), indent_size(indent_size), window(window), container_window(container_window), null_rep(std::move(null_rep)), skip_new_lines(skip_new_lines), - truncate_metadata(truncate_metadata) {} + truncate_metadata(truncate_metadata), + element_size_limit(element_size_limit) {} /// Create a PrettyPrintOptions instance with default values static PrettyPrintOptions Defaults() { return PrettyPrintOptions(); } @@ -99,6 +100,9 @@ struct ARROW_EXPORT PrettyPrintOptions { /// If true, display schema metadata when pretty-printing a Schema bool show_schema_metadata = true; + /// Limit each element to specified number of characters, defaults to 100 + int element_size_limit = 100; + /// Delimiters to use when printing an Array PrettyPrintDelimiters array_delimiters = PrettyPrintDelimiters::Defaults(); diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 108b212cca5..c90b03bbda3 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -19,12 +19,14 @@ #include +#include #include #include #include #include #include #include +#include #include #include "arrow/array.h" @@ -32,10 +34,13 @@ #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" namespace arrow { +using util::Float16; + class TestPrettyPrint : public ::testing::Test { public: void SetUp() {} @@ -47,37 +52,37 @@ class TestPrettyPrint : public ::testing::Test { }; template -void CheckStream(const T& obj, const PrettyPrintOptions& options, const char* expected) { +void CheckStream(const T& obj, const PrettyPrintOptions& options, + std::string_view expected) 
{ std::ostringstream sink; ASSERT_OK(PrettyPrint(obj, options, &sink)); std::string result = sink.str(); - ASSERT_EQ(std::string(expected, strlen(expected)), result); + ASSERT_EQ(expected, result); } -void CheckArray(const Array& arr, const PrettyPrintOptions& options, const char* expected, - bool check_operator = true) { +void CheckArray(const Array& arr, const PrettyPrintOptions& options, + std::string_view expected, bool check_operator = true) { ARROW_SCOPED_TRACE("For datatype: ", arr.type()->ToString()); CheckStream(arr, options, expected); - if (options.indent == 0 && check_operator) { + if (options.indent == 0 && options.element_size_limit == 100 && check_operator) { std::stringstream ss; ss << arr; - std::string result = std::string(expected, strlen(expected)); - ASSERT_EQ(result, ss.str()); + ASSERT_EQ(expected, ss.str()); } } template -void Check(const T& obj, const PrettyPrintOptions& options, const char* expected) { +void Check(const T& obj, const PrettyPrintOptions& options, std::string_view expected) { std::string result; ASSERT_OK(PrettyPrint(obj, options, &result)); - ASSERT_EQ(std::string(expected, strlen(expected)), result); + ASSERT_EQ(expected, result); } template void CheckPrimitive(const std::shared_ptr& type, const PrettyPrintOptions& options, const std::vector& is_valid, - const std::vector& values, const char* expected, + const std::vector& values, std::string_view expected, bool check_operator = true) { std::shared_ptr array; ArrayFromVector(type, is_valid, values, &array); @@ -86,7 +91,7 @@ void CheckPrimitive(const std::shared_ptr& type, template void CheckPrimitive(const PrettyPrintOptions& options, const std::vector& is_valid, - const std::vector& values, const char* expected, + const std::vector& values, std::string_view expected, bool check_operator = true) { CheckPrimitive(TypeTraits::type_singleton(), options, is_valid, values, expected, check_operator); @@ -158,12 +163,12 @@ TEST_F(TestPrettyPrint, PrimitiveType) { ])expected"; 
CheckPrimitive({2, 10}, is_valid, values2, ex2_in2); - std::vector values3 = {"foo", "bar", "", "baz", ""}; + std::vector values3 = {"foo", "bar", "", "a longer string", ""}; static const char* ex3 = R"expected([ "foo", "bar", null, - "baz", + "a longer string", null ])expected"; CheckPrimitive({0, 10}, is_valid, values3, ex3); @@ -172,11 +177,23 @@ TEST_F(TestPrettyPrint, PrimitiveType) { "foo", "bar", null, - "baz", + "a longer string", null ])expected"; CheckPrimitive({2, 10}, is_valid, values3, ex3_in2); CheckPrimitive({2, 10}, is_valid, values3, ex3_in2); + + PrettyPrintOptions options{2, 10}; + options.element_size_limit = 8; + static const char* ex3_in3 = R"expected( [ + "foo", + "bar", + null, + "a long (... 9 chars omitted)", + null + ])expected"; + CheckPrimitive(options, is_valid, values3, ex3_in3); + CheckPrimitive(options, is_valid, values3, ex3_in3); } TEST_F(TestPrettyPrint, PrimitiveTypeNoNewlines) { @@ -317,6 +334,37 @@ TEST_F(TestPrettyPrint, UInt64) { expected); } +TEST_F(TestPrettyPrint, HalfFloat) { + static const char* expected = R"expected([ + -inf, + -1234, + -0, + 0, + 1, + 1.2001953125, + 2.5, + 3.9921875, + 4.125, + 10000, + 12344, + inf, + nan, + null +])expected"; + + std::vector values = { + Float16(-1e10f).bits(), Float16(-1234.0f).bits(), Float16(-0.0f).bits(), + Float16(0.0f).bits(), Float16(1.0f).bits(), Float16(1.2f).bits(), + Float16(2.5f).bits(), Float16(3.9921875f).bits(), Float16(4.125f).bits(), + Float16(1e4f).bits(), Float16(12345.0f).bits(), Float16(1e5f).bits(), + Float16(NAN).bits(), Float16(6.10f).bits()}; + + std::vector is_valid(values.size(), true); + is_valid.back() = false; + + CheckPrimitive({0, 10}, is_valid, values, expected); +} + TEST_F(TestPrettyPrint, DateTimeTypes) { std::vector is_valid = {true, true, false, true, false}; @@ -772,6 +820,12 @@ TEST_F(TestPrettyPrint, BinaryNoNewlines) { options.window = 2; expected = "[666F6F,626172,...,,FF]"; CheckPrimitive(options, is_valid, values, expected, false); + + 
// With truncated element size + options.element_size_limit = 1; + expected = + "[6 (... 5 chars omitted),6 (... 5 chars omitted),...,,F (... 1 chars omitted)]"; + CheckPrimitive(options, is_valid, values, expected, false); } template @@ -1103,6 +1157,12 @@ TEST_F(TestPrettyPrint, FixedSizeBinaryType) { CheckArray(*array, {0, 10}, ex); static const char* ex_2 = " [\n 666F6F,\n ...\n 62617A\n ]"; CheckArray(*array, {2, 1}, ex_2); + + auto options = PrettyPrintOptions{2, 1}; + options.element_size_limit = 3; + static const char* ex_3 = + " [\n 666 (... 3 chars omitted),\n ...\n 626 (... 3 chars omitted)\n ]"; + CheckArray(*array, options, ex_3); } TEST_F(TestPrettyPrint, DecimalTypes) { @@ -1115,6 +1175,12 @@ TEST_F(TestPrettyPrint, DecimalTypes) { static const char* ex = "[\n 123.4567,\n 456.7891,\n null\n]"; CheckArray(*array, {0}, ex); + + auto options = PrettyPrintOptions(); + options.element_size_limit = 3; + static const char* ex_2 = + "[\n 123 (... 5 chars omitted),\n 456 (... 5 chars omitted),\n null\n]"; + CheckArray(*array, options, ex_2); } } @@ -1417,6 +1483,7 @@ lorem: 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla accumsan sapien commodo massa, vel volutpat orci nisi eu justo. Nulla non blandit sapien. 
Quisque pretium vestibulum urna eu vehicula.')"; options.truncate_metadata = false; + options.element_size_limit = 10000; Check(*my_schema, options, expected_verbose); // Metadata that exactly fits diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 8bffa808a4e..f39b40f02c7 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -32,13 +32,16 @@ #include "arrow/array/builder_nested.h" #include "arrow/array/builder_union.h" #include "arrow/array/concatenate.h" +#include "arrow/array/statistics.h" #include "arrow/array/validate.h" #include "arrow/c/abi.h" +#include "arrow/compare.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/iterator.h" #include "arrow/util/logging_internal.h" #include "arrow/util/vector.h" @@ -308,40 +311,66 @@ const std::string& RecordBatch::column_name(int i) const { return schema_->field(i)->name(); } -bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata, - const EqualOptions& opts) const { - if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { - return false; - } +namespace { - if (!schema_->Equals(*other.schema(), check_metadata)) { - return false; +bool ContainFloatType(const std::shared_ptr& type) { + if (is_floating(type->id())) { + return true; } - if (device_type() != other.device_type()) { - return false; + for (const auto& field : type->fields()) { + if (ContainFloatType(field->type())) { + return true; + } } - for (int i = 0; i < num_columns(); ++i) { - if (!column(i)->Equals(other.column(i), opts)) { - return false; + return false; +} + +bool ContainFloatType(const Schema& schema) { + for (auto& field : schema.fields()) { + if (ContainFloatType(field->type())) { + return true; } } - - return true; + return false; } -bool RecordBatch::ApproxEquals(const RecordBatch& other, const EqualOptions& opts) 
const { - if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { +bool CanIgnoreNaNInEquality(const RecordBatch& batch, const EqualOptions& opts) { + if (opts.nans_equal()) { + return true; + } else if (!ContainFloatType(*batch.schema())) { + return true; + } else { return false; } +} - if (device_type() != other.device_type()) { - return false; +} // namespace + +bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata, + const EqualOptions& opts) const { + return Equals(other, opts.use_metadata(check_metadata)); +} + +bool RecordBatch::Equals(const RecordBatch& other, const EqualOptions& opts) const { + if (this == &other) { + if (CanIgnoreNaNInEquality(*this, opts)) { + return true; + } + } else { + if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { + return false; + } else if (opts.use_schema() && + !schema_->Equals(*other.schema(), opts.use_metadata())) { + return false; + } else if (device_type() != other.device_type()) { + return false; + } } for (int i = 0; i < num_columns(); ++i) { - if (!column(i)->ApproxEquals(other.column(i), opts)) { + if (!column(i)->Equals(other.column(i), opts)) { return false; } } @@ -521,9 +550,45 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat if (column_statistics->distinct_count.has_value()) { statistics.nth_statistics++; - statistics.key = ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT; - statistics.type = int64(); - statistics.value = column_statistics->distinct_count.value(); + if (std::holds_alternative(column_statistics->distinct_count.value())) { + statistics.key = ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT; + statistics.type = int64(); + statistics.value = std::get(column_statistics->distinct_count.value()); + } else { + statistics.key = ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE; + statistics.type = float64(); + statistics.value = std::get(column_statistics->distinct_count.value()); + } + + 
RETURN_NOT_OK(on_statistics(statistics)); + statistics.start_new_column = false; + } + + if (column_statistics->max_byte_width.has_value()) { + statistics.nth_statistics++; + if (std::holds_alternative(column_statistics->max_byte_width.value())) { + statistics.key = ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT; + statistics.type = int64(); + statistics.value = std::get(column_statistics->max_byte_width.value()); + } else { + statistics.key = ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE; + statistics.type = float64(); + statistics.value = std::get(column_statistics->max_byte_width.value()); + } + + RETURN_NOT_OK(on_statistics(statistics)); + statistics.start_new_column = false; + } + + if (column_statistics->average_byte_width.has_value()) { + statistics.nth_statistics++; + if (column_statistics->is_average_byte_width_exact) { + statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT; + } else { + statistics.key = ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE; + } + statistics.type = float64(); + statistics.value = column_statistics->average_byte_width.value(); RETURN_NOT_OK(on_statistics(statistics)); statistics.start_new_column = false; } @@ -556,6 +621,21 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat } return Status::OK(); } +struct StringBuilderVisitor { + template + enable_if_has_string_view Visit(const DataType&, + ArrayBuilder* raw_builder, + const std::string& value) { + using Builder = typename TypeTraits::BuilderType; + auto builder = static_cast(raw_builder); + return builder->Append(value); + } + + Status Visit(const DataType& type, ArrayBuilder*, const std::string&) { + return Status::Invalid("Only string types are supported and the current type is ", + type.ToString()); + } +}; } // namespace Result> RecordBatch::MakeStatisticsArray( @@ -580,7 +660,7 @@ Result> RecordBatch::MakeStatisticsArray( RETURN_NOT_OK(EnumerateStatistics(*this, [&](const EnumeratedStatistics& statistics) { int8_t i = 0; for 
(const auto& field : values_types) { - if (field->type()->id() == statistics.type->id()) { + if (field->type()->Equals(statistics.type)) { break; } i++; @@ -654,8 +734,10 @@ Result> RecordBatch::MakeStatisticsArray( if (statistics.start_new_column) { RETURN_NOT_OK(builder.Append()); if (statistics.nth_column.has_value()) { + // Add Columns RETURN_NOT_OK(columns_builder->Append(statistics.nth_column.value())); } else { + // Add RecordBatch RETURN_NOT_OK(columns_builder->AppendNull()); } RETURN_NOT_OK(values_builder->Append()); @@ -680,8 +762,8 @@ Result> RecordBatch::MakeStatisticsArray( return static_cast(builder)->Append(value); } Status operator()(const std::string& value) { - return static_cast(builder)->Append( - value.data(), static_cast(value.size())); + StringBuilderVisitor visitor; + return VisitTypeInline(*builder->type(), &visitor, builder, value); } } visitor; visitor.builder = values_builders[values_type_index].get(); diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index a3cd4103853..0d1d2d4ac35 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -118,22 +118,32 @@ class ARROW_EXPORT RecordBatch { static Result> FromStructArray( const std::shared_ptr& array, MemoryPool* pool = default_memory_pool()); - /// \brief Determine if two record batches are exactly equal + /// \brief Determine if two record batches are equal /// /// \param[in] other the RecordBatch to compare with - /// \param[in] check_metadata if true, check that Schema metadata is the same + /// \param[in] check_metadata if true, the schema metadata will be compared, + /// regardless of the value set in \ref EqualOptions::use_metadata /// \param[in] opts the options for equality comparisons /// \return true if batches are equal bool Equals(const RecordBatch& other, bool check_metadata = false, const EqualOptions& opts = EqualOptions::Defaults()) const; + /// \brief Determine if two record batches are equal + /// + /// \param[in] other the 
RecordBatch to compare with + /// \param[in] opts the options for equality comparisons + /// \return true if batches are equal + bool Equals(const RecordBatch& other, const EqualOptions& opts) const; + /// \brief Determine if two record batches are approximately equal /// /// \param[in] other the RecordBatch to compare with /// \param[in] opts the options for equality comparisons /// \return true if batches are approximately equal bool ApproxEquals(const RecordBatch& other, - const EqualOptions& opts = EqualOptions::Defaults()) const; + const EqualOptions& opts = EqualOptions::Defaults()) const { + return Equals(other, opts.use_schema(false).use_atol(true)); + } /// \return the record batch's schema const std::shared_ptr& schema() const { return schema_; } @@ -371,8 +381,8 @@ class ARROW_EXPORT RecordBatchReader { using iterator_category = std::input_iterator_tag; using difference_type = std::ptrdiff_t; using value_type = std::shared_ptr; - using pointer = value_type const*; - using reference = value_type const&; + using pointer = const value_type*; + using reference = const value_type&; RecordBatchReaderIterator() : batch_(RecordBatchEnd()), reader_(NULLPTR) {} @@ -390,7 +400,7 @@ class ARROW_EXPORT RecordBatchReader { } Result> operator*() { - ARROW_RETURN_NOT_OK(batch_.status()); + ARROW_RETURN_NOT_OK(batch_); return batch_; } diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index a659f8798e5..5c5717ff3fc 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -26,16 +26,21 @@ #include #include #include +#include #include #include #include "arrow/array/array_base.h" #include "arrow/array/array_dict.h" #include "arrow/array/array_nested.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" #include "arrow/array/data.h" +#include "arrow/array/statistics.h" #include "arrow/array/util.h" #include "arrow/c/abi.h" #include "arrow/chunked_array.h" +#include 
"arrow/compare.h" #include "arrow/config.h" #include "arrow/status.h" #include "arrow/table.h" @@ -45,9 +50,11 @@ #include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/type_traits.h" #include "arrow/util/float16.h" #include "arrow/util/iterator.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/visit_type_inline.h" namespace arrow { @@ -58,6 +65,50 @@ class TestRecordBatch : public ::testing::Test {}; TEST_F(TestRecordBatch, Equals) { const int length = 10; + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8()); + auto f2 = field("f2", int16()); + + auto schema = ::arrow::schema({f0, f1, f2}); + auto schema_same = ::arrow::schema({f0, f1, f2}); + auto schema_fewer_fields = ::arrow::schema({f0, f1}); + + random::RandomArrayGenerator gen(42); + + auto a_f0 = gen.ArrayOf(int32(), length); + auto a_f1 = gen.ArrayOf(uint8(), length); + auto a_f2 = gen.ArrayOf(int16(), length); + auto a_f0_half = a_f0->Slice(0, length / 2); + auto a_f1_half = a_f1->Slice(0, length / 2); + auto a_f0_different = gen.ArrayOf(int32(), length); + auto a_f1_different = gen.ArrayOf(uint8(), length); + + auto b = RecordBatch::Make(schema, length, {a_f0, a_f1, a_f2}); + auto b_same = RecordBatch::Make(schema_same, length, {a_f0, a_f1, a_f2}); + auto b_fewer_fields = RecordBatch::Make(schema_fewer_fields, length, {a_f0, a_f1}); + auto b_fewer_fields_half = + RecordBatch::Make(schema_fewer_fields, length / 2, {a_f0_half, a_f1_half}); + auto b_fewer_fields_different = + RecordBatch::Make(schema_fewer_fields, length, {a_f0_different, a_f1_different}); + + // Same Values + ASSERT_TRUE(b->Equals(*b_same)); + + // Different number of columns + ASSERT_FALSE(b->Equals(*b_fewer_fields)); + + // Different number of rows + ASSERT_FALSE(b_fewer_fields->Equals(*b_fewer_fields_half)); + + // Different values + ASSERT_FALSE(b_fewer_fields->Equals(*b_fewer_fields_different)); +} + +class TestRecordBatchEqualOptions : public TestRecordBatch 
{}; + +TEST_F(TestRecordBatchEqualOptions, MetadataAndSchema) { + int length = 10; + auto f0 = field("f0", int32()); auto f1 = field("f1", uint8()); auto f2 = field("f2", int16()); @@ -65,37 +116,49 @@ TEST_F(TestRecordBatch, Equals) { auto metadata = key_value_metadata({"foo"}, {"bar"}); - std::vector> fields = {f0, f1, f2}; auto schema = ::arrow::schema({f0, f1, f2}); - auto schema2 = ::arrow::schema({f0, f1}); - auto schema3 = ::arrow::schema({f0, f1, f2}, metadata); - auto schema4 = ::arrow::schema({f0, f1, f2b}); + auto schema_with_metadata = ::arrow::schema({f0, f1, f2}, metadata); + auto schema_renamed_field = ::arrow::schema({f0, f1, f2b}); random::RandomArrayGenerator gen(42); - auto a0 = gen.ArrayOf(int32(), length); - auto a1 = gen.ArrayOf(uint8(), length); - auto a2 = gen.ArrayOf(int16(), length); + auto a_f0 = gen.ArrayOf(int32(), length); + auto a_f1 = gen.ArrayOf(uint8(), length); + auto a_f2 = gen.ArrayOf(int16(), length); + auto a_f2b = a_f2; - auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - auto b2 = RecordBatch::Make(schema3, length, {a0, a1, a2}); - auto b3 = RecordBatch::Make(schema2, length, {a0, a1}); - auto b4 = RecordBatch::Make(schema, length, {a0, a1, a1}); - auto b5 = RecordBatch::Make(schema4, length, {a0, a1, a2}); + // All RecordBatches have the same values but different schemas. 
+ auto b = RecordBatch::Make(schema, length, {a_f0, a_f1, a_f2}); + auto b_with_metadata = + RecordBatch::Make(schema_with_metadata, length, {a_f0, a_f1, a_f2}); + auto b_renamed_field = + RecordBatch::Make(schema_renamed_field, length, {a_f0, a_f1, a_f2b}); - ASSERT_TRUE(b1->Equals(*b1)); - ASSERT_FALSE(b1->Equals(*b3)); - ASSERT_FALSE(b1->Equals(*b4)); + auto options = EqualOptions::Defaults(); // Same values and types, but different field names - ASSERT_FALSE(b1->Equals(*b5)); + ASSERT_FALSE(b->Equals(*b_renamed_field)); + ASSERT_TRUE(b->Equals(*b_renamed_field, options.use_schema(false))); + ASSERT_TRUE(b->ApproxEquals(*b_renamed_field)); + ASSERT_TRUE(b->ApproxEquals(*b_renamed_field, options.use_schema(true))); // Different metadata - ASSERT_TRUE(b1->Equals(*b2)); - ASSERT_FALSE(b1->Equals(*b2, /*check_metadata=*/true)); + ASSERT_TRUE(b->Equals(*b_with_metadata)); + ASSERT_TRUE(b->Equals(*b_with_metadata, options)); + ASSERT_FALSE(b->Equals(*b_with_metadata, + /*check_metadata=*/true)); + ASSERT_FALSE(b->Equals(*b_with_metadata, + /*check_metadata=*/true, options.use_schema(true))); + ASSERT_TRUE(b->Equals(*b_with_metadata, + /*check_metadata=*/true, options.use_schema(false))); + ASSERT_TRUE(b->Equals(*b_with_metadata, options.use_schema(true).use_metadata(false))); + ASSERT_FALSE(b->Equals(*b_with_metadata, options.use_schema(true).use_metadata(true))); + ASSERT_TRUE(b->Equals(*b_with_metadata, options.use_schema(false).use_metadata(true))); + ASSERT_TRUE( + b->ApproxEquals(*b_with_metadata, options.use_schema(true).use_metadata(true))); } -TEST_F(TestRecordBatch, EqualOptions) { +TEST_F(TestRecordBatchEqualOptions, NaN) { int length = 2; auto f = field("f", float64()); @@ -108,13 +171,27 @@ TEST_F(TestRecordBatch, EqualOptions) { auto b1 = RecordBatch::Make(schema, length, {array1}); auto b2 = RecordBatch::Make(schema, length, {array2}); - EXPECT_FALSE(b1->Equals(*b2, /*check_metadata=*/false, - EqualOptions::Defaults().nans_equal(false))); - 
EXPECT_TRUE(b1->Equals(*b2, /*check_metadata=*/false, - EqualOptions::Defaults().nans_equal(true))); + EXPECT_FALSE(b1->Equals(*b2, EqualOptions::Defaults().nans_equal(false))); + EXPECT_TRUE(b1->Equals(*b2, EqualOptions::Defaults().nans_equal(true))); } -TEST_F(TestRecordBatch, ApproxEqualOptions) { +TEST_F(TestRecordBatchEqualOptions, SignedZero) { + int length = 2; + auto f = field("f", float64()); + + auto schema = ::arrow::schema({f}); + + std::shared_ptr array1, array2; + ArrayFromVector(float64(), {true, true}, {0.5, +0.0}, &array1); + ArrayFromVector(float64(), {true, true}, {0.5, -0.0}, &array2); + auto b1 = RecordBatch::Make(schema, length, {array1}); + auto b2 = RecordBatch::Make(schema, length, {array2}); + + ASSERT_FALSE(b1->Equals(*b2, EqualOptions::Defaults().signed_zeros_equal(false))); + ASSERT_TRUE(b1->Equals(*b2, EqualOptions::Defaults().signed_zeros_equal(true))); +} + +TEST_F(TestRecordBatchEqualOptions, Approx) { int length = 2; auto f = field("f", float64()); @@ -130,7 +207,98 @@ TEST_F(TestRecordBatch, ApproxEqualOptions) { EXPECT_FALSE(b1->ApproxEquals(*b2, EqualOptions::Defaults().nans_equal(false))); EXPECT_FALSE(b1->ApproxEquals(*b2, EqualOptions::Defaults().nans_equal(true))); - EXPECT_TRUE(b1->ApproxEquals(*b2, EqualOptions::Defaults().nans_equal(true).atol(0.1))); + auto options = EqualOptions::Defaults().nans_equal(true).atol(0.1); + EXPECT_FALSE(b1->Equals(*b2, options)); + EXPECT_TRUE(b1->Equals(*b2, options.use_atol(true))); + EXPECT_TRUE(b1->ApproxEquals(*b2, options)); +} + +class TestRecordBatchEqualsSameAddress : public TestRecordBatch {}; + +TEST_F(TestRecordBatchEqualsSameAddress, NonFloatType) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int64()); + + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); + auto a1 = ArrayFromJSON(f1->type(), "[0, 1, 2]"); + + auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); + auto b1 = b0; + + auto options = EqualOptions::Defaults(); 
+ + ASSERT_TRUE(b0->Equals(*b1, options)); + ASSERT_TRUE(b0->Equals(*b1, options.nans_equal(true))); + + ASSERT_TRUE(b0->ApproxEquals(*b1, options)); + ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); +} + +TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithoutFloatType) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", struct_({{"f2", int64()}, {"f3", int8()}})); + + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); + auto a1 = ArrayFromJSON( + f1->type(), R"([{"f2": 1, "f3": 4}, {"f2": 2, "f3": 5}, {"f2":3, "f3": 6}])"); + + auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); + auto b1 = b0; + + auto options = EqualOptions::Defaults(); + + ASSERT_TRUE(b0->Equals(*b1, options)); + ASSERT_TRUE(b0->Equals(*b1, options.nans_equal(true))); + + ASSERT_TRUE(b0->ApproxEquals(*b1, options)); + ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); +} + +TEST_F(TestRecordBatchEqualsSameAddress, FloatType) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", float64()); + + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); + auto a1 = ArrayFromJSON(f1->type(), "[0.0, 1.0, 2.0, NaN]"); + + auto b0 = RecordBatch::Make(schema, 3, {a0, a1}); + auto b1 = b0; + + auto options = EqualOptions::Defaults(); + + ASSERT_FALSE(b0->Equals(*b1, options)); + ASSERT_TRUE(b0->Equals(*b1, options.nans_equal(true))); + + ASSERT_FALSE(b0->ApproxEquals(*b1, options)); + ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); +} + +TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithFloatType) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", struct_({{"f2", int64()}, {"f3", float32()}})); + + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]"); + auto a1 = ArrayFromJSON( + f1->type(), R"([{"f2": 1, "f3": 4.0}, {"f2": 2, "f3": 4.0}, {"f2":3, "f3": NaN}])"); + + auto b0 = RecordBatch::Make(schema, 3, {a0, 
a1}); + auto b1 = b0; + + auto options = EqualOptions::Defaults(); + + ASSERT_FALSE(b0->Equals(*b1, options)); + ASSERT_TRUE(b0->Equals(*b1, options.nans_equal(true))); + + ASSERT_FALSE(b0->ApproxEquals(*b1, options)); + ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true))); } TEST_F(TestRecordBatch, Validate) { @@ -1033,15 +1201,32 @@ Result> BuildArray( } return builder.Finish(); } +struct StringBuilderVisitor { + template + enable_if_t::value, Status> Visit( + const DataType&, ArrayBuilder* raw_builder, + const std::vector& values) { + using Builder = typename TypeTraits::BuilderType; + auto builder = static_cast(raw_builder); + for (const auto& value : values) { + ARROW_RETURN_NOT_OK(builder->Append(value)); + } + return Status::OK(); + } -template > -Result> BuildArray(const std::vector& values) { - using BuilderType = typename TypeTraits::BuilderType; - BuilderType builder; - for (const auto& value : values) { - ARROW_RETURN_NOT_OK(builder.Append(value)); + Status Visit(const DataType& type, ArrayBuilder*, const std::vector&) { + return Status::Invalid("Only string types are supported and the current type is", + type.ToString()); } - return builder.Finish(); +}; +Result> BuildArray(const std::shared_ptr& string_type, + const std::vector& values) { + std::unique_ptr array_builder; + ARROW_RETURN_NOT_OK(MakeBuilder(default_memory_pool(), string_type, &array_builder)); + StringBuilderVisitor visitor; + ARROW_RETURN_NOT_OK( + VisitTypeInline(*string_type, &visitor, array_builder.get(), values)); + return array_builder->Finish(); } template @@ -1056,41 +1241,44 @@ std::vector StatisticsValuesToRawValues( template ::value>> -Result> BuildArray(const std::vector& values) { +Result> BuildArray(const std::vector& values, + const std::shared_ptr& array_type) { struct Builder { - const std::vector& values_; - explicit Builder(const std::vector& values) - : values_(values) {} + const std::vector& values; + const std::shared_ptr& array_type; + explicit 
Builder(const std::vector& values, + const std::shared_ptr& array_type) + : values(values), array_type(array_type) {} Result> operator()(const bool&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(raw_values); } Result> operator()(const int64_t&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(raw_values); } Result> operator()(const uint64_t&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(raw_values); } Result> operator()(const double&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(raw_values); } Result> operator()(const std::string&) { - auto values = StatisticsValuesToRawValues(values_); - return BuildArray(values); + auto raw_values = StatisticsValuesToRawValues(values); + return BuildArray(array_type, raw_values); } - } builder(values); + } builder(values, array_type); return std::visit(builder, values[0]); } Result> MakeStatisticsArray( const std::string& columns_json, const std::vector>& nested_statistics_keys, - const std::vector>& - nested_statistics_values) { + const std::vector>& nested_statistics_values, + const std::vector>& array_types = {}) { auto columns_type = int32(); auto columns_array = ArrayFromJSON(columns_type, columns_json); const auto n_columns = columns_array->length(); @@ -1137,6 +1325,7 @@ Result> MakeStatisticsArray( for (size_t i = 0; i < nested_statistics_keys.size(); ++i) { const auto& statistics_keys = nested_statistics_keys[i]; const auto& statistics_values = nested_statistics_values[i]; + const auto& array_type = (i < array_types.size()) ? 
array_types[i] : null(); statistics_offsets.push_back(offset); for (size_t j = 0; j < statistics_keys.size(); ++j) { const auto& key = statistics_keys[j]; @@ -1154,11 +1343,11 @@ Result> MakeStatisticsArray( } keys_indices.push_back(key_index); - auto values_type = ArrayStatistics::ValueToArrowType(value, arrow::null()); + auto values_type = ArrayStatistics::ValueToArrowType(value, array_type); int8_t values_type_code = 0; for (; values_type_code < static_cast(values_types.size()); ++values_type_code) { - if (values_types[values_type_code] == values_type) { + if (values_types[values_type_code]->Equals(values_type)) { break; } } @@ -1186,16 +1375,18 @@ Result> MakeStatisticsArray( struct_({field("column", columns_type), field("statistics", statistics_type)}); ARROW_ASSIGN_OR_RAISE(auto keys_indices_array, BuildArray(keys_indices)); - ARROW_ASSIGN_OR_RAISE(auto keys_dictionary_array, - BuildArray(keys_dictionary)); + // The statistics schema specifies the type of dictionary key is utf8(StringType) + ARROW_ASSIGN_OR_RAISE(auto keys_dictionary_array, BuildArray(utf8(), keys_dictionary)); ARROW_ASSIGN_OR_RAISE( auto keys_array, DictionaryArray::FromArrays(keys_type, keys_indices_array, keys_dictionary_array)); std::vector> values_arrays; - for (const auto& values : values_values) { + for (size_t i = 0; i < values_values.size(); ++i) { + const auto& values = values_values[i]; + const auto& array_type = (i < array_types.size()) ? 
array_types[i] : null(); ARROW_ASSIGN_OR_RAISE(auto values_array, - BuildArray(values)); + BuildArray(values, array_type)); values_arrays.push_back(values_array); } ARROW_ASSIGN_OR_RAISE(auto values_value_type_ids_array, @@ -1215,6 +1406,21 @@ Result> MakeStatisticsArray( std::move(statistics_array)}; return std::make_shared(struct_type, n_columns, struct_arrays); } + +std::shared_ptr GenerateString(const std::shared_ptr& data_type) { + if (data_type->id() == Type::FIXED_SIZE_BINARY) { + auto byte_width = data_type->byte_width(); + std::string a(byte_width, 'a'); + std::string b(byte_width, 'b'); + std::string c(byte_width, 'c'); + std::stringstream ss; + ss << R"([")" << a << R"(",")" << b << R"(",")" << c << R"("])"; + return ArrayFromJSON(data_type, ss.str()); + } else { + return ArrayFromJSON(data_type, R"(["a","b","c"])"); + } +} + }; // namespace TEST_F(TestRecordBatch, MakeStatisticsArrayRowCount) { @@ -1265,14 +1471,14 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCount) { +TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCountExact) { auto schema = ::arrow::schema({field("no-statistics", boolean()), field("int32", int32())}); auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); auto int32_array_data = ArrayFromJSON(int32(), "[1, null, -1]")->data()->Copy(); int32_array_data->statistics = std::make_shared(); int32_array_data->statistics->null_count = 1; - int32_array_data->statistics->distinct_count = 2; + int32_array_data->statistics->distinct_count = static_cast(2); auto int32_array = MakeArray(std::move(int32_array_data)); auto batch = RecordBatch::Make(schema, int32_array->length(), {no_statistics_array, int32_array}); @@ -1298,6 +1504,169 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCount) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } 
+TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCountApproximate) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("int32", int32())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto int32_array_data = ArrayFromJSON(int32(), "[1, null, -1]")->data()->Copy(); + int32_array_data->statistics = std::make_shared(); + int32_array_data->statistics->null_count = 1; + int32_array_data->statistics->distinct_count = 2.0; + auto int32_array = MakeArray(std::move(int32_array_data)); + auto batch = RecordBatch::Make(schema, int32_array->length(), + {no_statistics_array, int32_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_NULL_COUNT_EXACT, + ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{int64_t{1}}, + ArrayStatistics::ValueType{2.0}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayMaxByteWidthExact) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = ArrayFromJSON(utf8(), R"(["aa", null, "c"])")->data()->Copy(); + string_array_data->statistics = std::make_shared(); + string_array_data->statistics->null_count = 1; + string_array_data->statistics->max_byte_width = static_cast(2); + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, + 
MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_NULL_COUNT_EXACT, + ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{int64_t{1}}, + ArrayStatistics::ValueType{int64_t{2}}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayMaxByteWidthApproximate) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array_data = ArrayFromJSON(utf8(), R"(["aa", null, "c"])")->data()->Copy(); + string_array_data->statistics = std::make_shared(); + string_array_data->statistics->null_count = 1; + string_array_data->statistics->max_byte_width = 2.0; + auto string_array = MakeArray(std::move(string_array_data)); + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_NULL_COUNT_EXACT, + ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{int64_t{1}}, + ArrayStatistics::ValueType{2.0}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array = ArrayFromJSON(utf8(), R"(["aa", "bb", "ccc"])"); + string_array->data()->statistics = std::make_shared(); + 
string_array->data()->statistics->average_byte_width = 2.3; + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{2.3}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthExact) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("float64", float64())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto float_array = ArrayFromJSON(float64(), R"([1.0, 2.0, 3.0])"); + float_array->data()->statistics = std::make_shared(); + float_array->data()->statistics->average_byte_width = 8.0; + float_array->data()->statistics->is_average_byte_width_exact = true; + + auto batch = RecordBatch::Make(schema, float_array->length(), + {no_statistics_array, float_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{8.0}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + TEST_F(TestRecordBatch, MakeStatisticsArrayMinExact) { auto schema = ::arrow::schema({field("no-statistics", boolean()), field("uint32", uint32())}); @@ -1423,34 +1792,148 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayMaxApproximate) { AssertArraysEqual(*expected_statistics_array, 
*statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayString) { - auto schema = - ::arrow::schema({field("no-statistics", boolean()), field("string", utf8())}); - auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); - auto string_array_data = ArrayFromJSON(utf8(), "[\"a\", null, \"c\"]")->data()->Copy(); - string_array_data->statistics = std::make_shared(); - string_array_data->statistics->is_max_exact = true; - string_array_data->statistics->max = "c"; - auto string_array = MakeArray(std::move(string_array_data)); - auto batch = RecordBatch::Make(schema, string_array->length(), - {no_statistics_array, string_array}); +template +class TestRecordBatchMakeStatisticsArrayBinary : public ::testing::Test { + public: + void TestMaxApproximation() { + ArrayStatistics::ValueType max("c"); + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("string", type())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto string_array = GenerateString(type()); + string_array->data()->statistics = std::make_shared(); + string_array->data()->statistics->max = max; + + auto batch = RecordBatch::Make(schema, string_array->length(), + {no_statistics_array, string_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + max, + }}, + {null(), type()})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); + } + + std::shared_ptr<::arrow::DataType> type() { + if constexpr (std::is_same_v) { + return fixed_size_binary(1); + } else { + return TypeTraits::type_singleton(); + } + } +}; + +TYPED_TEST_SUITE(TestRecordBatchMakeStatisticsArrayBinary, + AllBinaryOrBinrayViewLikeArrowTypes); 
+TYPED_TEST(TestRecordBatchMakeStatisticsArrayBinary, MaxApproximation) { + this->TestMaxApproximation(); +} + +// Validates that the union array creates two distinct child arrays for two +// FixedSizeBinaryArrays with unequal byte widths. +TEST_F(TestRecordBatch, MakeStatisticsArrayDifferentSizeFixedSizeBinary) { + auto fixed_size_type1 = fixed_size_binary(1); + auto fixed_size_type2 = fixed_size_binary(2); + + auto fixed_size_array1 = GenerateString(fixed_size_type1); + fixed_size_array1->data()->statistics = std::make_shared(); + fixed_size_array1->data()->statistics->max = + std::string(fixed_size_type1->byte_width(), 'c'); + + auto fixed_size_array2 = GenerateString(fixed_size_type2); + fixed_size_array2->data()->statistics = std::make_shared(); + fixed_size_array2->data()->statistics->max = + std::string(fixed_size_type2->byte_width(), 'c'); + + auto schema = ::arrow::schema( + {field("fixed_size1", fixed_size_type1), field("fixed_size2", fixed_size_type2)}); + auto batch = RecordBatch::Make(schema, fixed_size_array1->length(), + {fixed_size_array1, fixed_size_array2}); ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 0, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{ + std::string(fixed_size_type1->byte_width(), 'c')}, + }, + { + ArrayStatistics::ValueType{ + std::string(fixed_size_type2->byte_width(), 'c')}, + }}, + {null(), fixed_size_type1, fixed_size_type2})); + + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + +// Validates that the union array creates a single child array for two +// FixedSizeBinaryArrays with equal byte widths. 
+TEST_F(TestRecordBatch, MakeStatisticsArraySameSizeFixedSizeBinary) { + auto fixed_size_type = fixed_size_binary(2); + ArrayStatistics::ValueType max(std::string(fixed_size_type->byte_width(), 'c')); + + auto fixed_size_array1 = GenerateString(fixed_size_type); + fixed_size_array1->data()->statistics = std::make_shared(); + fixed_size_array1->data()->statistics->max = max; + + ASSERT_OK_AND_ASSIGN(auto fixed_size_array2, + fixed_size_array1->CopyTo(default_cpu_memory_manager())); + + auto schema = ::arrow::schema( + {field("fixed_size1", fixed_size_type), field("fixed_size2", fixed_size_type)}); + auto batch = RecordBatch::Make(schema, fixed_size_array1->length(), + {fixed_size_array1, fixed_size_array2}); + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 0, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }, + { + ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + max, + }, + { + max, + }}, + {null(), fixed_size_type, fixed_size_type})); - ASSERT_OK_AND_ASSIGN(auto expected_statistics_array, - MakeStatisticsArray("[null, 1]", - {{ - ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, - }, - { - ARROW_STATISTICS_KEY_MAX_VALUE_EXACT, - }}, - {{ - ArrayStatistics::ValueType{int64_t{3}}, - }, - { - ArrayStatistics::ValueType{"c"}, - }})); AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index f8ae5b15d52..2b25de69486 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -231,7 +231,7 @@ class [[nodiscard]] Result : public util::EqualityComparable> { /// contents of a `Result`. `T` must be implicitly constructible from `U /// &&`. 
/// - /// Sets `other` to contain a non-OK status with a`StatusError::Invalid` + /// Sets `other` to contain a non-OK status with a `StatusError::Invalid` /// error code. /// /// \param other The Result object to move from and set to a non-OK status. @@ -377,6 +377,14 @@ class [[nodiscard]] Result : public util::EqualityComparable> { return MoveValueUnsafe(); } + /// Return a copy of the internally stored value or alternative if an error is stored. + T ValueOr(T alternative) const& { + if (!ok()) { + return alternative; + } + return ValueUnsafe(); + } + /// Retrieve the value if ok(), falling back to an alternative generated by the provided /// factory template @@ -489,19 +497,11 @@ class [[nodiscard]] Result : public util::EqualityComparable> { ARROW_ASSIGN_OR_RAISE_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \ lhs, rexpr); -namespace internal { - template -inline const Status& GenericToStatus(const Result& res) { - return res.status(); -} - -template -inline Status GenericToStatus(Result&& res) { - return std::move(res).status(); -} - -} // namespace internal +struct IntoStatus> { + static constexpr const Status& ToStatus(const Result& res) { return res.status(); } + static inline Status ToStatus(Result&& res) { return std::move(res).status(); } +}; template ::type> R ToResult(T t) { diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 7ef37301203..b96d930a444 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -37,6 +37,7 @@ #include "arrow/type_traits.h" #include "arrow/util/compare.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/visibility.h" #include "arrow/visit_type_inline.h" @@ -245,6 +246,12 @@ struct ARROW_EXPORT UInt64Scalar : public NumericScalar { struct ARROW_EXPORT HalfFloatScalar : public NumericScalar { using NumericScalar::NumericScalar; + + explicit HalfFloatScalar(util::Float16 value) + : NumericScalar(value.bits(), float16()) {} + + 
HalfFloatScalar(util::Float16 value, std::shared_ptr type) + : NumericScalar(value.bits(), std::move(type)) {} }; struct ARROW_EXPORT FloatScalar : public NumericScalar { @@ -969,6 +976,18 @@ struct MakeScalarImpl { return Status::OK(); } + // This isn't captured by the generic case above because `util::Float16` isn't implicity + // convertible to `uint16_t` (HalfFloat's ValueType) + template + std::enable_if_t, util::Float16> && + is_half_float_type::value, + Status> + Visit(const T& t) { + out_ = std::make_shared(static_cast(value_), + std::move(type_)); + return Status::OK(); + } + Status Visit(const ExtensionType& t) { ARROW_ASSIGN_OR_RAISE(auto storage, MakeScalar(t.storage_type(), static_cast(value_))); diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 6938bc0d887..4a34e5d13c2 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -39,6 +39,7 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type_traits.h" +#include "arrow/util/float16.h" namespace arrow { @@ -46,6 +47,7 @@ using compute::Cast; using compute::CastOptions; using internal::checked_cast; using internal::checked_pointer_cast; +using util::Float16; std::shared_ptr CheckMakeNullScalar(const std::shared_ptr& type) { const auto scalar = MakeNullScalar(type); @@ -201,22 +203,33 @@ TEST(TestScalar, IdentityCast) { */ } +template +using NumericArgType = std::conditional_t::value, Float16, + typename ArrowType::c_type>; + template class TestNumericScalar : public ::testing::Test { public: TestNumericScalar() = default; }; -TYPED_TEST_SUITE(TestNumericScalar, NumericArrowTypes); +using NumericArrowTypesPlusHalfFloat = + testing::Types; +TYPED_TEST_SUITE(TestNumericScalar, NumericArrowTypesPlusHalfFloat); TYPED_TEST(TestNumericScalar, Basics) { - using T = typename TypeParam::c_type; + using T = NumericArgType; using ScalarType = typename TypeTraits::ScalarType; T value = static_cast(1); auto scalar_val = 
std::make_shared(value); - ASSERT_EQ(value, scalar_val->value); + if constexpr (is_half_float_type::value) { + ASSERT_EQ(value, Float16::FromBits(scalar_val->value)); + } else { + ASSERT_EQ(value, scalar_val->value); + } ASSERT_TRUE(scalar_val->is_valid); ASSERT_OK(scalar_val->ValidateFull()); @@ -227,8 +240,13 @@ TYPED_TEST(TestNumericScalar, Basics) { auto scalar_other = std::make_shared(other_value); ASSERT_NE(*scalar_other, *scalar_val); - scalar_val->value = other_value; - ASSERT_EQ(other_value, scalar_val->value); + if constexpr (is_half_float_type::value) { + scalar_val->value = other_value.bits(); + ASSERT_EQ(other_value, Float16::FromBits(scalar_val->value)); + } else { + scalar_val->value = other_value; + ASSERT_EQ(other_value, scalar_val->value); + } ASSERT_EQ(*scalar_other, *scalar_val); ScalarType stack_val; @@ -255,72 +273,72 @@ TYPED_TEST(TestNumericScalar, Basics) { ASSERT_OK(two->ValidateFull()); ASSERT_TRUE(null->Equals(*null_value)); - ASSERT_TRUE(one->Equals(ScalarType(1))); - ASSERT_FALSE(one->Equals(ScalarType(2))); - ASSERT_TRUE(two->Equals(ScalarType(2))); - ASSERT_FALSE(two->Equals(ScalarType(3))); + ASSERT_TRUE(one->Equals(ScalarType(static_cast(1)))); + ASSERT_FALSE(one->Equals(ScalarType(static_cast(2)))); + ASSERT_TRUE(two->Equals(ScalarType(static_cast(2)))); + ASSERT_FALSE(two->Equals(ScalarType(static_cast(3)))); ASSERT_TRUE(null->ApproxEquals(*null_value)); - ASSERT_TRUE(one->ApproxEquals(ScalarType(1))); - ASSERT_FALSE(one->ApproxEquals(ScalarType(2))); - ASSERT_TRUE(two->ApproxEquals(ScalarType(2))); - ASSERT_FALSE(two->ApproxEquals(ScalarType(3))); + ASSERT_TRUE(one->ApproxEquals(ScalarType(static_cast(1)))); + ASSERT_FALSE(one->ApproxEquals(ScalarType(static_cast(2)))); + ASSERT_TRUE(two->ApproxEquals(ScalarType(static_cast(2)))); + ASSERT_FALSE(two->ApproxEquals(ScalarType(static_cast(3)))); } TYPED_TEST(TestNumericScalar, Hashing) { - using T = typename TypeParam::c_type; + using T = NumericArgType; using ScalarType = typename 
TypeTraits::ScalarType; std::unordered_set, Scalar::Hash, Scalar::PtrsEqual> set; set.emplace(std::make_shared()); - for (T i = 0; i < 10; ++i) { - set.emplace(std::make_shared(i)); + for (int i = 0; i < 10; ++i) { + ASSERT_TRUE(set.emplace(std::make_shared(static_cast(i))).second); } ASSERT_FALSE(set.emplace(std::make_shared()).second); - for (T i = 0; i < 10; ++i) { - ASSERT_FALSE(set.emplace(std::make_shared(i)).second); + for (int i = 0; i < 10; ++i) { + ASSERT_FALSE(set.emplace(std::make_shared(static_cast(i))).second); } } TYPED_TEST(TestNumericScalar, MakeScalar) { - using T = typename TypeParam::c_type; + using T = NumericArgType; using ScalarType = typename TypeTraits::ScalarType; auto type = TypeTraits::type_singleton(); std::shared_ptr three = MakeScalar(static_cast(3)); ASSERT_OK(three->ValidateFull()); - ASSERT_EQ(ScalarType(3), *three); + ASSERT_EQ(ScalarType(static_cast(3)), *three); - AssertMakeScalar(ScalarType(3), type, static_cast(3)); + AssertMakeScalar(ScalarType(static_cast(3)), type, static_cast(3)); - AssertParseScalar(type, "3", ScalarType(3)); + AssertParseScalar(type, "3", ScalarType(static_cast(3))); } template class TestRealScalar : public ::testing::Test { public: - using CType = typename T::c_type; + using ValueType = NumericArgType; using ScalarType = typename TypeTraits::ScalarType; void SetUp() { type_ = TypeTraits::type_singleton(); - scalar_val_ = std::make_shared(static_cast(1)); + scalar_val_ = std::make_shared(static_cast(1)); ASSERT_TRUE(scalar_val_->is_valid); - scalar_other_ = std::make_shared(static_cast(1.1)); + scalar_other_ = std::make_shared(static_cast(1.1)); ASSERT_TRUE(scalar_other_->is_valid); - scalar_zero_ = std::make_shared(static_cast(0.0)); - scalar_other_zero_ = std::make_shared(static_cast(0.0)); - scalar_neg_zero_ = std::make_shared(static_cast(-0.0)); + scalar_zero_ = std::make_shared(static_cast(0.0)); + scalar_other_zero_ = std::make_shared(static_cast(0.0)); + scalar_neg_zero_ = 
std::make_shared(static_cast(-0.0)); - const CType nan_value = std::numeric_limits::quiet_NaN(); + const auto nan_value = std::numeric_limits::quiet_NaN(); scalar_nan_ = std::make_shared(nan_value); ASSERT_TRUE(scalar_nan_->is_valid); - const CType other_nan_value = std::numeric_limits::quiet_NaN(); + const auto other_nan_value = std::numeric_limits::quiet_NaN(); scalar_other_nan_ = std::make_shared(other_nan_value); ASSERT_TRUE(scalar_other_nan_->is_valid); } @@ -387,6 +405,14 @@ class TestRealScalar : public ::testing::Test { ASSERT_FALSE(scalar_zero_->ApproxEquals(*scalar_neg_zero_, options)); } + void TestUseAtol() { + auto options = EqualOptions::Defaults().atol(0.2f); + + ASSERT_FALSE(scalar_val_->Equals(*scalar_other_, options)); + ASSERT_TRUE(scalar_val_->Equals(*scalar_other_, options.use_atol(true))); + ASSERT_TRUE(scalar_val_->ApproxEquals(*scalar_other_, options)); + } + void TestStructOf() { auto ty = struct_({field("float", type_)}); @@ -514,7 +540,9 @@ class TestRealScalar : public ::testing::Test { scalar_zero_, scalar_other_zero_, scalar_neg_zero_; }; -TYPED_TEST_SUITE(TestRealScalar, RealArrowTypes); +using RealArrowTypesPlusHalfFloat = + ::testing::Types; +TYPED_TEST_SUITE(TestRealScalar, RealArrowTypesPlusHalfFloat); TYPED_TEST(TestRealScalar, NanEquals) { this->TestNanEquals(); } @@ -522,6 +550,8 @@ TYPED_TEST(TestRealScalar, SignedZeroEquals) { this->TestSignedZeroEquals(); } TYPED_TEST(TestRealScalar, ApproxEquals) { this->TestApproxEquals(); } +TYPED_TEST(TestRealScalar, UseAtol) { this->TestUseAtol(); } + TYPED_TEST(TestRealScalar, StructOf) { this->TestStructOf(); } TYPED_TEST(TestRealScalar, ListOf) { this->TestListOf(); } @@ -795,8 +825,8 @@ TEST(TestFixedSizeBinaryScalar, MakeScalar) { AssertParseScalar(type, std::string_view(data), FixedSizeBinaryScalar(buf, type)); // Wrong length - ASSERT_RAISES(Invalid, MakeScalar(type, Buffer::FromString(data.substr(3))).status()); - ASSERT_RAISES(Invalid, Scalar::Parse(type, 
std::string_view(data).substr(3)).status()); + ASSERT_RAISES(Invalid, MakeScalar(type, Buffer::FromString(data.substr(3)))); + ASSERT_RAISES(Invalid, Scalar::Parse(type, std::string_view(data).substr(3))); } TEST(TestFixedSizeBinaryScalar, ValidateErrors) { @@ -1171,8 +1201,6 @@ TEST(TestDayTimeIntervalScalars, Basics) { ASSERT_TRUE(first->Equals(ts_val2)); } -// TODO test HalfFloatScalar - TYPED_TEST(TestNumericScalar, Cast) { auto type = TypeTraits::type_singleton(); @@ -1438,13 +1466,13 @@ TEST(TestStructScalar, FieldAccess) { ASSERT_OK_AND_ASSIGN(auto a, abc.field("a")); AssertScalarsEqual(*a, *abc.value[0]); - ASSERT_RAISES(Invalid, abc.field("b").status()); + ASSERT_RAISES(Invalid, abc.field("b")); ASSERT_OK_AND_ASSIGN(auto b, abc.field(1)); AssertScalarsEqual(*b, *abc.value[1]); - ASSERT_RAISES(Invalid, abc.field(5).status()); - ASSERT_RAISES(Invalid, abc.field("c").status()); + ASSERT_RAISES(Invalid, abc.field(5)); + ASSERT_RAISES(Invalid, abc.field("c")); ASSERT_OK_AND_ASSIGN(auto d, abc.field("d")); ASSERT_TRUE(d->Equals(*MakeNullScalar(int64()))); diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index 73c3659c106..b84070b3d28 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -301,7 +301,7 @@ Status ValidateSparseCSXIndex(const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, - char const* type_name) { + const char* type_name) { if (!is_integer(indptr_type->id())) { return Status::TypeError("Type of ", type_name, " indptr must be integer"); } @@ -325,7 +325,7 @@ void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, - char const* type_name) { + const char* type_name) { ARROW_CHECK_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape, indices_shape, type_name)); } diff --git 
a/cpp/src/arrow/sparse_tensor.h b/cpp/src/arrow/sparse_tensor.h index 4ec824dfa7d..5faae16bb25 100644 --- a/cpp/src/arrow/sparse_tensor.h +++ b/cpp/src/arrow/sparse_tensor.h @@ -208,14 +208,14 @@ Status ValidateSparseCSXIndex(const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, - char const* type_name); + const char* type_name); ARROW_EXPORT void CheckSparseCSXIndexValidity(const std::shared_ptr& indptr_type, const std::shared_ptr& indices_type, const std::vector& indptr_shape, const std::vector& indices_shape, - char const* type_name); + const char* type_name); template class SparseCSXIndex : public SparseIndexBase { @@ -344,7 +344,7 @@ class ARROW_EXPORT SparseCSRIndex internal::SparseCSXIndex; static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; - static constexpr char const* kTypeName = "SparseCSRIndex"; + static constexpr const char* kTypeName = "SparseCSRIndex"; using SparseCSXIndex::kCompressedAxis; using SparseCSXIndex::Make; @@ -375,7 +375,7 @@ class ARROW_EXPORT SparseCSCIndex internal::SparseMatrixCompressedAxis::COLUMN>; static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSC; - static constexpr char const* kTypeName = "SparseCSCIndex"; + static constexpr const char* kTypeName = "SparseCSCIndex"; using SparseCSXIndex::kCompressedAxis; using SparseCSXIndex::Make; @@ -398,7 +398,7 @@ class ARROW_EXPORT SparseCSCIndex class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase { public: static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF; - static constexpr char const* kTypeName = "SparseCSFIndex"; + static constexpr const char* kTypeName = "SparseCSFIndex"; /// \brief Make SparseCSFIndex from raw properties static Result> Make( diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 73477fef4a4..c9c28a11b1b 100644 --- 
a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -33,7 +33,7 @@ #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/logging_internal.h" -#include "arrow/util/sort.h" +#include "arrow/util/sort_internal.h" namespace arrow { diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 42e8929ce0b..8907d32ff7d 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -18,11 +18,12 @@ #include #include #include +#include #include #include "arrow/util/compare.h" #include "arrow/util/macros.h" -#include "arrow/util/string_builder.h" +#include "arrow/util/string_util.h" #include "arrow/util/visibility.h" #ifdef ARROW_EXTRA_ERROR_CONTEXT @@ -52,10 +53,10 @@ ARROW_RETURN_IF_(condition, status, ARROW_STRINGIFY(status)) /// \brief Propagate any non-successful Status to the caller -#define ARROW_RETURN_NOT_OK(status) \ - do { \ - ::arrow::Status __s = ::arrow::internal::GenericToStatus(status); \ - ARROW_RETURN_IF_(!__s.ok(), __s, ARROW_STRINGIFY(status)); \ +#define ARROW_RETURN_NOT_OK(status) \ + do { \ + ::arrow::Status __s = ::arrow::ToStatus(status); \ + ARROW_RETURN_IF_(!__s.ok(), __s, ARROW_STRINGIFY(status)); \ } while (false) /// \brief Given `expr` and `warn_msg`; log `warn_msg` if `expr` is a non-ok status @@ -67,15 +68,6 @@ } \ } while (false) -#define RETURN_NOT_OK_ELSE(s, else_) \ - do { \ - ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \ - if (!_s.ok()) { \ - else_; \ - return _s; \ - } \ - } while (false) - // This is an internal-use macro and should not be used in public headers. #ifndef RETURN_NOT_OK # define RETURN_NOT_OK(s) ARROW_RETURN_NOT_OK(s) @@ -83,8 +75,10 @@ namespace arrow { namespace internal { + class StatusConstant; -} + +} // namespace internal enum class StatusCode : char { OK = 0, @@ -124,6 +118,23 @@ class ARROW_EXPORT StatusDetail { } }; +/// \brief A type trait to declare a given type as Status-compatible. 
+/// +/// This trait structure can be implemented if a type (such as Result) embeds +/// error information that can be converted to the Status class. +/// It will make the given type usable directly in functions such as +/// Status::OrElse and error-checking macros such as ARROW_RETURN_NOT_OK. +template +struct IntoStatus; + +/// \brief Convert a Status-compatible object to Status +/// +/// This generic function delegates to the IntoStatus type trait. +template +constexpr decltype(auto) ToStatus(T&& t) { + return IntoStatus>::ToStatus(std::forward(t)); +} + /// \brief Status outcome object (success or error) /// /// The Status object is an object holding the outcome of an operation. @@ -170,13 +181,13 @@ class ARROW_EXPORT [[nodiscard]] Status : public util::EqualityComparable static Status FromArgs(StatusCode code, Args&&... args) { - return Status(code, util::StringBuilder(std::forward(args)...)); + return Status(code, internal::JoinToString(std::forward(args)...)); } template static Status FromDetailAndArgs(StatusCode code, std::shared_ptr detail, Args&&... args) { - return Status(code, util::StringBuilder(std::forward(args)...), + return Status(code, internal::JoinToString(std::forward(args)...), std::move(detail)); } @@ -350,6 +361,32 @@ class ARROW_EXPORT [[nodiscard]] Status : public util::EqualityComparable(args)...).WithDetail(detail()); } + /// \brief Apply a functor if the status indicates an error + /// + /// This can be used to execute fallback or cleanup actions. + /// + /// If the status indicates a success, it is returned as-is. + /// + /// If the status indicates an error, the given functor is called with the status + /// as argument. + /// If the functor returns a new Status, it is returned. + /// If the functor returns a Status-compatible object such as Result, it is + /// converted to Status and returned. + /// If the functor returns void, the original Status is returned. 
+ template + Status OrElse(OnError&& on_error) { + using RT = decltype(on_error(Status())); + if (ARROW_PREDICT_TRUE(ok())) { + return *this; + } + if constexpr (std::is_void_v) { + on_error(*this); + return *this; + } else { + return ToStatus(on_error(*this)); + } + } + void Warn() const; void Warn(const std::string& message) const; @@ -463,13 +500,10 @@ Status& Status::operator&=(Status&& s) noexcept { } /// \endcond -namespace internal { - -// Extract Status from Status or Result -// Useful for the status check macros such as RETURN_NOT_OK. -inline const Status& GenericToStatus(const Status& st) { return st; } -inline Status GenericToStatus(Status&& st) { return std::move(st); } - -} // namespace internal +template <> +struct IntoStatus { + static constexpr const Status& ToStatus(const Status& st) { return st; } + static constexpr Status&& ToStatus(Status&& st) { return std::move(st); } +}; } // namespace arrow diff --git a/cpp/src/arrow/status_test.cc b/cpp/src/arrow/status_test.cc index 005bdf665f5..39a52bd2bad 100644 --- a/cpp/src/arrow/status_test.cc +++ b/cpp/src/arrow/status_test.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/result.h" #include "arrow/status.h" #include "arrow/status_internal.h" #include "arrow/testing/gtest_util.h" @@ -37,6 +38,25 @@ class TestStatusDetail : public StatusDetail { } // namespace +namespace my_namespace { + +struct StatusLike { + int value; // ok if 42 +}; + +} // namespace my_namespace + +template <> +struct IntoStatus { + static inline Status ToStatus(my_namespace::StatusLike v) { + if (v.value == 42) { + return Status::OK(); + } else { + return Status::UnknownError("StatusLike: ", v.value); + } + } +}; + TEST(StatusTest, TestCodeAndMessage) { Status ok = Status::OK(); ASSERT_EQ(StatusCode::OK, ok.code()); @@ -234,4 +254,92 @@ TEST(StatusTest, TestDetailEquality) { ASSERT_NE(status_without_detail, status_with_detail); } +TEST(StatusTest, OrElse) { + int called = 0; + + auto or_else_returning_status = [&](Status 
st) { + ++called; + return st.WithMessage("Prefixed: ", st.message()); + }; + auto or_else_returning_result = [&](Status st) { + ++called; + return Result(st.WithMessage("Prefixed: ", st.message())); + }; + auto or_else_returning_user_class = [&](Status st) { + ++called; + return my_namespace::StatusLike{43}; + }; + auto or_else_returning_void = [&](auto) { ++called; }; + + auto ok_status = Status::OK(); + auto error_status = Status::IOError("some message"); + Status st; + + st = ok_status.OrElse(or_else_returning_status); + ASSERT_TRUE(st.ok()); + st = ok_status.OrElse(or_else_returning_result); + ASSERT_TRUE(st.ok()); + st = ok_status.OrElse(or_else_returning_void); + ASSERT_TRUE(st.ok()); + st = ok_status.OrElse(or_else_returning_user_class); + ASSERT_TRUE(st.ok()); + ASSERT_EQ(called, 0); + + st = error_status.OrElse(or_else_returning_status); + ASSERT_EQ(st.code(), StatusCode::IOError); + ASSERT_EQ(st.message(), "Prefixed: some message"); + ASSERT_EQ(called, 1); + st = error_status.OrElse(or_else_returning_result); + ASSERT_EQ(st.code(), StatusCode::IOError); + ASSERT_EQ(st.message(), "Prefixed: some message"); + ASSERT_EQ(called, 2); + st = error_status.OrElse(or_else_returning_void); + ASSERT_EQ(st.code(), StatusCode::IOError); + ASSERT_EQ(st.message(), "some message"); + ASSERT_EQ(called, 3); + st = error_status.OrElse(or_else_returning_user_class); + ASSERT_EQ(st.code(), StatusCode::UnknownError); + ASSERT_EQ(st.message(), "StatusLike: 43"); + ASSERT_EQ(called, 4); +} + +std::string StripContext(const std::string& message) { +#ifdef ARROW_EXTRA_ERROR_CONTEXT + auto pos = message.find_first_of('\n'); + if (pos != message.npos) { + return message.substr(0, pos); + } +#endif + return message; +} + +TEST(StatusTest, ReturnIfNotOk) { + auto f = [](auto v) { + RETURN_NOT_OK(v); + return Status::OK(); + }; + + auto ok_status = Status::OK(); + auto error_status = Status::IOError("some message"); + Status st; + + st = f(ok_status); + ASSERT_TRUE(st.ok()); + st = 
f(error_status); + ASSERT_EQ(st.code(), StatusCode::IOError); + ASSERT_EQ(StripContext(st.message()), error_status.message()); + + st = f(Result(42)); + ASSERT_TRUE(st.ok()); + st = f(Result(error_status)); + ASSERT_EQ(st.code(), StatusCode::IOError); + ASSERT_EQ(StripContext(st.message()), error_status.message()); + + st = f(my_namespace::StatusLike{42}); + ASSERT_TRUE(st.ok()); + st = f(my_namespace::StatusLike{43}); + ASSERT_EQ(st.code(), StatusCode::UnknownError); + ASSERT_EQ(StripContext(st.message()), "StatusLike: 43"); +} + } // namespace arrow diff --git a/cpp/src/arrow/stl.h b/cpp/src/arrow/stl.h index ae5462c661a..2a9e4bdf77e 100644 --- a/cpp/src/arrow/stl.h +++ b/cpp/src/arrow/stl.h @@ -92,7 +92,7 @@ using CBuilderType = template inline Status AppendListValues(CBuilderType& value_builder, Range&& cell_range) { - for (auto const& value : cell_range) { + for (const auto& value : cell_range) { ARROW_RETURN_NOT_OK(ConversionTraits::AppendRow(value_builder, value)); } return Status::OK(); @@ -441,12 +441,12 @@ Status TableFromTupleRange(MemoryPool* pool, Range&& rows, std::vector> builders(n_columns); ARROW_RETURN_NOT_OK(internal::CreateBuildersRecursive::Make(pool, &builders)); - for (auto const& row : rows) { + for (const auto& row : rows) { ARROW_RETURN_NOT_OK(internal::RowIterator::Append(builders, row)); } std::vector> arrays; - for (auto const& builder : builders) { + for (const auto& builder : builders) { std::shared_ptr array; ARROW_RETURN_NOT_OK(builder->Finish(&array)); arrays.emplace_back(array); diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 4b66f453ca3..68a8a1951f1 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -30,6 +30,7 @@ #include "arrow/array/concatenate.h" #include "arrow/array/util.h" #include "arrow/chunked_array.h" +#include "arrow/compare.h" #include "arrow/compute/cast.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" @@ -534,19 +535,52 @@ Result> PromoteTableToSchema(const 
std::shared_ptr
return Table::Make(schema, std::move(columns)); } -bool Table::Equals(const Table& other, bool check_metadata) const { - if (this == &other) { +namespace { + +bool ContainFloat(const std::shared_ptr& type) { + if (is_floating(type->id())) { return true; } - if (!schema_->Equals(*other.schema(), check_metadata)) { - return false; + + for (const auto& field : type->fields()) { + if (ContainFloat(field->type())) { + return true; + } + } + return false; +} + +bool CanIgnoreNan(const Schema& schema, const EqualOptions& opts) { + if (opts.nans_equal()) { + return true; + } + + for (auto& field : schema.fields()) { + if (ContainFloat(field->type())) { + return false; + } } - if (this->num_columns() != other.num_columns()) { - return false; + return true; +} + +} // namespace + +bool Table::Equals(const Table& other, const EqualOptions& opts) const { + if (this == &other) { + if (CanIgnoreNan(*schema_, opts)) { + return true; + } + } else { + if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { + return false; + } else if (opts.use_schema() && + !schema_->Equals(*other.schema(), opts.use_metadata())) { + return false; + } } for (int i = 0; i < this->num_columns(); i++) { - if (!this->column(i)->Equals(other.column(i))) { + if (!this->column(i)->Equals(other.column(i), opts)) { return false; } } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 79675fa92b1..dee6f6fdd3c 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -23,6 +23,7 @@ #include #include "arrow/chunked_array.h" // IWYU pragma: keep +#include "arrow/compare.h" #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/type.h" @@ -203,17 +204,33 @@ class ARROW_EXPORT Table { /// \brief Return the number of rows (equal to each column's logical length) int64_t num_rows() const { return num_rows_; } - /// \brief Determine if tables are equal + /// \brief Determine if two tables are equal /// - /// Two tables can be equal only if they have equal 
schemas. - /// However, they may be equal even if they have different chunkings. - bool Equals(const Table& other, bool check_metadata = false) const; + /// \param[in] other the table to compare with + /// \param[in] opts the options for equality comparisons + /// \return true if two tables are equal + bool Equals(const Table& other, const EqualOptions& opts) const; + + /// \brief Determine if two tables are equal + /// + /// \param[in] other the table to compare with + /// \param[in] check_metadata if true, the schema metadata will be compared, + /// regardless of the value set in \ref EqualOptions::use_metadata + /// \param[in] opts the options for equality comparisons + /// \return true if two tables are equal + bool Equals(const Table& other, bool check_metadata = false, + const EqualOptions& opts = EqualOptions::Defaults()) const { + return Equals(other, opts.use_metadata(check_metadata)); + } /// \brief Make a new table by combining the chunks this table has. /// /// All the underlying chunks in the ChunkedArray of each column are /// concatenated into zero or one chunk. /// + /// To avoid buffer overflow, binary columns may be combined into + /// multiple chunks. Chunks will have the maximum possible length. + /// /// \param[in] pool The pool for buffer allocations Result> CombineChunks( MemoryPool* pool = default_memory_pool()) const; @@ -269,11 +286,6 @@ class ARROW_EXPORT TableBatchReader : public RecordBatchReader { int64_t max_chunksize_; }; -/// \defgroup concat-tables ConcatenateTables function. -/// -/// ConcatenateTables function. -/// @{ - /// \brief Controls the behavior of ConcatenateTables(). 
struct ARROW_EXPORT ConcatenateTablesOptions { /// If true, the schemas of the tables will be first unified with fields of @@ -308,7 +320,6 @@ struct ARROW_EXPORT ConcatenateTablesOptions { /// \param[in] memory_pool MemoryPool to be used if null-filled arrays need to /// be created or if existing column chunks need to endure type conversion /// \return new Table - ARROW_EXPORT Result> ConcatenateTables( const std::vector>& tables, diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index 5f6905ce672..692671910b8 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -29,6 +29,7 @@ #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/chunked_array.h" +#include "arrow/compare.h" #include "arrow/compute/cast.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -152,38 +153,210 @@ TEST_F(TestTable, AllColumnsAndFields) { ASSERT_EQ(0, fields.size()); } -TEST_F(TestTable, Equals) { - const int length = 100; - MakeExample1(length); +TEST(TestTableEquality, Equals) { + const int32_t length = 10; - table_ = Table::Make(schema_, columns_); + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8()); + auto f2 = field("f2", int16()); - ASSERT_TRUE(table_->Equals(*table_)); - // Differing schema - auto f0 = field("f3", int32()); - auto f1 = field("f4", uint8()); - auto f2 = field("f5", int16()); - std::vector> fields = {f0, f1, f2}; - auto other_schema = std::make_shared(fields); - auto other = Table::Make(other_schema, columns_); - ASSERT_FALSE(table_->Equals(*other)); - // Differing columns - std::vector> other_columns = { - std::make_shared( - gen_.ArrayOf(int32(), length, /*null_probability=*/0.3)), - std::make_shared( - gen_.ArrayOf(uint8(), length, /*null_probability=*/0.3)), - std::make_shared( - gen_.ArrayOf(int16(), length, /*null_probability=*/0.3))}; - - other = Table::Make(schema_, other_columns); - ASSERT_FALSE(table_->Equals(*other)); - - // Differing schema metadata - 
other_schema = schema_->WithMetadata(::arrow::key_value_metadata({"key"}, {"value"})); - other = Table::Make(other_schema, columns_); - ASSERT_TRUE(table_->Equals(*other)); - ASSERT_FALSE(table_->Equals(*other, /*check_metadata=*/true)); + auto schema = ::arrow::schema({f0, f1, f2}); + auto schema_same = ::arrow::schema({f0, f1, f2}); + auto schema_fewer_fields = ::arrow::schema({f0, f1}); + + random::RandomArrayGenerator gen(42); + + auto a_f0 = gen.ArrayOf(int32(), length); + auto a_f1 = gen.ArrayOf(uint8(), length); + auto a_f2 = gen.ArrayOf(int16(), length); + auto a_f0_half = a_f0->Slice(0, length / 2); + auto a_f1_half = a_f1->Slice(0, length / 2); + auto a_f2_half = a_f2->Slice(0, length / 2); + auto a_f0_different = gen.ArrayOf(int32(), length); + auto a_f1_different = gen.ArrayOf(uint8(), length); + auto a_f2_different = gen.ArrayOf(uint16(), length); + + auto table = Table::Make(schema, {a_f0, a_f1, a_f2}, length); + auto table_same = Table::Make(schema_same, {a_f0, a_f1, a_f2}, length); + auto table_fewer_fields = Table::Make(schema_fewer_fields, {a_f0, a_f1}, length); + auto table_half = + Table::Make(schema_fewer_fields, {a_f0_half, a_f1_half, a_f2_half}, length / 2); + auto table_different = Table::Make( + schema_fewer_fields, {a_f0_different, a_f1_different, a_f2_different}, length); + + // Same Values + ASSERT_TRUE(table->Equals(*table_same)); + + // Different number of columns + ASSERT_FALSE(table->Equals(*table_fewer_fields)); + + // Different number of rows + ASSERT_FALSE(table->Equals(*table_half)); + + // Different values + ASSERT_FALSE(table->Equals(*table_different)); +} + +TEST(TestTableEquality, MetadataAndSchema) { + const int32_t length = 10; + + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8()); + auto f2 = field("f2", int16()); + auto f2_renamed = field("f2b", int16()); + + auto metadata = key_value_metadata({"foo"}, {"bar"}); + + auto schema = ::arrow::schema({f0, f1, f2}); + auto schema_with_metadata = 
schema->WithMetadata(metadata); + auto schema_renamed_field = ::arrow::schema({f0, f1, f2_renamed}); + + random::RandomArrayGenerator gen(42); + + auto a_f0 = gen.ArrayOf(int32(), length); + auto a_f1 = gen.ArrayOf(uint8(), length); + auto a_f2 = gen.ArrayOf(int16(), length); + auto a_f2_renamed = a_f2; + + // All Tables have the same values but different schemas. + auto table = Table::Make(schema, {a_f0, a_f1, a_f2}, length); + auto table_with_metadata = + Table::Make(schema_with_metadata, {a_f0, a_f1, a_f2}, length); + auto table_renamed_field = + Table::Make(schema_renamed_field, {a_f0, a_f1, a_f2_renamed}, length); + + auto options = EqualOptions::Defaults(); + + // Same values and types, but different field names + ASSERT_FALSE(table->Equals(*table_renamed_field)); + ASSERT_TRUE(table->Equals(*table_renamed_field, options.use_schema(false))); + + // Different metadata + ASSERT_TRUE(table->Equals(*table_with_metadata)); + ASSERT_TRUE(table->Equals(*table_with_metadata, options)); + ASSERT_FALSE(table->Equals(*table_with_metadata, + /*check_metadata=*/true)); + ASSERT_FALSE(table->Equals(*table_with_metadata, + /*check_metadata=*/true, options.use_schema(true))); + ASSERT_TRUE(table->Equals(*table_with_metadata, + /*check_metadata=*/true, options.use_schema(false))); + ASSERT_TRUE( + table->Equals(*table_with_metadata, options.use_schema(true).use_metadata(false))); + ASSERT_FALSE( + table->Equals(*table_with_metadata, options.use_schema(true).use_metadata(true))); + ASSERT_TRUE( + table->Equals(*table_with_metadata, options.use_schema(false).use_metadata(true))); +} + +TEST(TestTableEqualityFloatType, SameValue) { + auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); + auto table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": 6.0}])"}); + auto other_table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": 6.0}])"}); + + 
ASSERT_TRUE(table->Equals(*other_table)); +} + +TEST(TestTableEqualityFloatType, SingedZero) { + auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); + auto table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": -0.0}, {"f0": 3, "f1": 0.0}])"}); + auto other_table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 0.0}, {"f0": 3, "f1": -0.0}])"}); + auto options = EqualOptions::Defaults(); + + ASSERT_TRUE(table->Equals(*other_table, options)); + ASSERT_FALSE(table->Equals(*other_table, options.signed_zeros_equal(false))); +} + +TEST(TestTableEqualityFloatType, Infinity) { + auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); + auto table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": Inf}])"}); + auto table_different_inf = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": -Inf}])"}); + auto table_same_inf = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": Inf}])"}); + + ASSERT_FALSE(table->Equals(*table_different_inf)); + ASSERT_TRUE(table->Equals(*table_same_inf)); +} + +TEST(TestTableEqualityFloatType, NaN) { + auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); + auto table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"}); + auto other_table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"}); + auto options = EqualOptions::Defaults(); + + ASSERT_FALSE(table->Equals(*other_table, options)); + ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true))); +} + +TEST(TestTableEqualityFloatType, Approximate) { + auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); + auto table = TableFromJSON( + schema, + {R"([{"f0": 1, "f1": 4.0001}, {"f0": 2, "f1": 5.0001}, {"f0": 3, "f1": 6.0001}])"}); + 
auto other_table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": 5.0}, {"f0": 3, "f1": 6.0}])"}); + auto options = EqualOptions::Defaults(); + + ASSERT_FALSE(table->Equals(*other_table, options)); + + ASSERT_TRUE(table->Equals(*other_table, options.use_atol(true).atol(1e-3))); + + ASSERT_FALSE(table->Equals(*other_table, options.use_atol(true).atol(1e-5))); +} + +TEST(TestTableEqualitySameAddress, NonFloatType) { + auto schema = ::arrow::schema({field("f0", int32()), field("f1", uint8())}); + auto table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4}, {"f0": 2, "f1": 5}, {"f0": 3, "f1": 6}])"}); + auto other_table = table; + auto options = EqualOptions::Defaults(); + + ASSERT_TRUE(table->Equals(*other_table, options)); + ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true))); +} + +TEST(TestTableEqualitySameAddress, NestedTypesWithoutFloatType) { + auto schema = ::arrow::schema( + {field("f0", int32()), field("f1", struct_({{"f2", utf8()}, {"f3", int64()}}))}); + auto table = TableFromJSON( + schema, + {R"([{"f0": 1, "f1": {"f2": "4", "f3": 7}}, {"f0": 2, "f1": {"f2": "5", "f3": 8}}, {"f0": 3, "f1": {"f2" : "6", "f3": 9}}])"}); + auto other_table = table; + auto options = EqualOptions::Defaults(); + + ASSERT_TRUE(table->Equals(*other_table, options)); + ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true))); +} + +TEST(TestTableEqualitySameAddress, FloatType) { + auto schema = ::arrow::schema({field("f0", int32()), field("f1", float64())}); + auto table = TableFromJSON( + schema, {R"([{"f0": 1, "f1": 4.0}, {"f0": 2, "f1": NaN}, {"f0": 3, "f1": 6.0}])"}); + auto other_table = table; + auto options = EqualOptions::Defaults(); + + ASSERT_FALSE(table->Equals(*other_table, options)); + ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true))); +} + +TEST(TestTableEqualitySameAddress, NestedTypesWithFloatType) { + auto schema = ::arrow::schema( + {field("f0", int32()), field("f1", struct_({{"f2", utf8()}, {"f3", 
float64()}}))}); + auto table = TableFromJSON( + schema, + {R"([{"f0": 1, "f1": {"f2": "4", "f3": 7.0}}, {"f0": 2, "f1": {"f2": "5", "f3": NaN}}, {"f0": 3,"f1": {"f2" : "6", "f3": 9.0}}])"}); + auto other_table = table; + auto options = EqualOptions::Defaults(); + + ASSERT_FALSE(table->Equals(*other_table, options)); + ASSERT_TRUE(table->Equals(*other_table, options.nans_equal(true))); } TEST_F(TestTable, MakeEmpty) { diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index e69e5632bb1..8cdf7f82d26 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -485,12 +485,12 @@ namespace { template int64_t StridedTensorCountNonZero(int dim_index, int64_t offset, const Tensor& tensor) { using c_type = typename TYPE::c_type; - c_type const zero = c_type(0); + const c_type zero = c_type(0); int64_t nnz = 0; if (dim_index == tensor.ndim() - 1) { for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) { - auto const* ptr = tensor.raw_data() + offset + i * tensor.strides()[dim_index]; - auto& elem = *reinterpret_cast(ptr); + const auto* ptr = tensor.raw_data() + offset + i * tensor.strides()[dim_index]; + auto& elem = *reinterpret_cast(ptr); if (elem != zero) ++nnz; } return nnz; @@ -505,9 +505,9 @@ int64_t StridedTensorCountNonZero(int dim_index, int64_t offset, const Tensor& t template int64_t ContiguousTensorCountNonZero(const Tensor& tensor) { using c_type = typename TYPE::c_type; - auto* data = reinterpret_cast(tensor.raw_data()); + auto* data = reinterpret_cast(tensor.raw_data()); return std::count_if(data, data + tensor.size(), - [](c_type const& x) { return x != 0; }); + [](const c_type& x) { return x != 0; }); } template diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index dd3a21fae49..beb62a11bdc 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -33,7 +33,7 @@ namespace arrow { -static inline bool is_tensor_supported(Type::type type_id) { +constexpr bool is_tensor_supported(Type::type type_id) { switch 
(type_id) { case Type::UINT8: case Type::INT8: diff --git a/cpp/src/arrow/tensor/csf_converter.cc b/cpp/src/arrow/tensor/csf_converter.cc index 2d925ddbbb0..f6470e16b78 100644 --- a/cpp/src/arrow/tensor/csf_converter.cc +++ b/cpp/src/arrow/tensor/csf_converter.cc @@ -29,7 +29,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/sort.h" +#include "arrow/util/sort_internal.h" #include "arrow/visit_type_inline.h" namespace arrow { diff --git a/cpp/src/arrow/tensor/csx_converter.cc b/cpp/src/arrow/tensor/csx_converter.cc index f30e71f5c40..679c3a0f1ac 100644 --- a/cpp/src/arrow/tensor/csx_converter.cc +++ b/cpp/src/arrow/tensor/csx_converter.cc @@ -157,6 +157,8 @@ Status MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis axis, return Status::OK(); } +namespace { + Result> MakeTensorFromSparseCSXMatrix( SparseMatrixCompressedAxis axis, MemoryPool* pool, const std::shared_ptr& indptr, const std::shared_ptr& indices, @@ -211,6 +213,8 @@ Result> MakeTensorFromSparseCSXMatrix( dim_names); } +} // namespace + Result> MakeTensorFromSparseCSRMatrix( MemoryPool* pool, const SparseCSRMatrix* sparse_tensor) { const auto& sparse_index = diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index a4526e31c2b..9b4492a543a 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -132,6 +132,23 @@ class ARROW_TESTING_EXPORT DictExtensionType : public ExtensionType { std::string Serialize() const override { return "dict-extension-serialized"; } }; +class ARROW_TESTING_EXPORT BinaryViewExtensionType : public ExtensionType { + public: + BinaryViewExtensionType() : ExtensionType(binary_view()) {} + + std::string extension_name() const override { return "binary_view"; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + Result> Deserialize( + 
std::shared_ptr storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override { return "binary_view_serialized"; } +}; + // A minimal extension type that does not error when passed blank extension information class ARROW_TESTING_EXPORT MetadataOptionalExtensionType : public ExtensionType { public: @@ -190,6 +207,9 @@ std::shared_ptr list_extension_type(); ARROW_TESTING_EXPORT std::shared_ptr dict_extension_type(); +ARROW_TESTING_EXPORT +std::shared_ptr binary_view_extension_type(); + ARROW_TESTING_EXPORT std::shared_ptr complex128(); diff --git a/cpp/src/arrow/testing/generator.cc b/cpp/src/arrow/testing/generator.cc index b6bed40e5e6..e9c7406120c 100644 --- a/cpp/src/arrow/testing/generator.cc +++ b/cpp/src/arrow/testing/generator.cc @@ -92,6 +92,11 @@ std::shared_ptr ConstantArrayGenerator::Int64(int64_t size, int64_ return ConstantArray(size, value); } +std::shared_ptr ConstantArrayGenerator::Float16(int64_t size, + uint16_t value) { + return ConstantArray(size, value); +} + std::shared_ptr ConstantArrayGenerator::Float32(int64_t size, float value) { return ConstantArray(size, value); } @@ -148,6 +153,8 @@ std::shared_ptr ConstantArrayGenerator::Zeroes( EXPECT_OK_AND_ASSIGN(auto viewed, Int32(size)->View(type)); return viewed; } + case Type::HALF_FLOAT: + return Float16(size); case Type::FLOAT: return Float32(size); case Type::DOUBLE: diff --git a/cpp/src/arrow/testing/generator.h b/cpp/src/arrow/testing/generator.h index e90c125a7d4..05cb8621ab9 100644 --- a/cpp/src/arrow/testing/generator.h +++ b/cpp/src/arrow/testing/generator.h @@ -106,6 +106,14 @@ class ARROW_TESTING_EXPORT ConstantArrayGenerator { /// \return a generated Array static std::shared_ptr Int64(int64_t size, int64_t value = 0); + /// \brief Generates a constant Float16Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] value to repeat + /// + /// \return a generated Array + static std::shared_ptr Float16(int64_t size, 
uint16_t value = 0); + /// \brief Generates a constant Float32Array /// /// \param[in] size the size of the array to generate @@ -151,6 +159,8 @@ class ARROW_TESTING_EXPORT ConstantArrayGenerator { return UInt64(size, static_cast(value)); case Type::INT64: return Int64(size, static_cast(value)); + case Type::HALF_FLOAT: + return Float16(size, static_cast(value)); case Type::FLOAT: return Float32(size, static_cast(value)); case Type::DOUBLE: diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 9eeca32e721..1acc47a99d4 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -47,13 +47,12 @@ #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/compute/api_vector.h" #include "arrow/datum.h" #include "arrow/extension/json.h" #include "arrow/io/memory.h" -#include "arrow/ipc/json_simple.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" +#include "arrow/json/from_string.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/pretty_print.h" #include "arrow/record_batch.h" @@ -381,23 +380,21 @@ void AssertDatumsApproxEqual(const Datum& expected, const Datum& actual, bool ve std::shared_ptr ArrayFromJSON(const std::shared_ptr& type, std::string_view json) { - EXPECT_OK_AND_ASSIGN(auto out, ipc::internal::json::ArrayFromJSON(type, json)); + EXPECT_OK_AND_ASSIGN(auto out, json::ArrayFromJSONString(type, json)); return out; } std::shared_ptr DictArrayFromJSON(const std::shared_ptr& type, std::string_view indices_json, std::string_view dictionary_json) { - std::shared_ptr out; - ABORT_NOT_OK( - ipc::internal::json::DictArrayFromJSON(type, indices_json, dictionary_json, &out)); + EXPECT_OK_AND_ASSIGN( + auto out, json::DictArrayFromJSONString(type, indices_json, dictionary_json)); return out; } std::shared_ptr ChunkedArrayFromJSON(const std::shared_ptr& type, const std::vector& json) { - std::shared_ptr out; - 
ABORT_NOT_OK(ipc::internal::json::ChunkedArrayFromJSON(type, json, &out)); + EXPECT_OK_AND_ASSIGN(auto out, json::ChunkedArrayFromJSONString(type, json)); return out; } @@ -405,7 +402,7 @@ std::shared_ptr RecordBatchFromJSON(const std::shared_ptr& std::string_view json) { // Parse as a StructArray auto struct_type = struct_(schema->fields()); - std::shared_ptr struct_array = ArrayFromJSON(struct_type, json); + std::shared_ptr struct_array = arrow::ArrayFromJSON(struct_type, json); // Convert StructArray to RecordBatch return *RecordBatch::FromStructArray(struct_array); @@ -413,17 +410,15 @@ std::shared_ptr RecordBatchFromJSON(const std::shared_ptr& std::shared_ptr ScalarFromJSON(const std::shared_ptr& type, std::string_view json) { - std::shared_ptr out; - ABORT_NOT_OK(ipc::internal::json::ScalarFromJSON(type, json, &out)); + EXPECT_OK_AND_ASSIGN(auto out, json::ScalarFromJSONString(type, json)); return out; } std::shared_ptr DictScalarFromJSON(const std::shared_ptr& type, std::string_view index_json, std::string_view dictionary_json) { - std::shared_ptr out; - ABORT_NOT_OK( - ipc::internal::json::DictScalarFromJSON(type, index_json, dictionary_json, &out)); + EXPECT_OK_AND_ASSIGN(auto out, + json::DictScalarFromJSONString(type, index_json, dictionary_json)); return out; } @@ -440,7 +435,7 @@ std::shared_ptr TensorFromJSON(const std::shared_ptr& type, std::string_view data, std::string_view shape, std::string_view strides, std::string_view dim_names) { - std::shared_ptr array = ArrayFromJSON(type, data); + std::shared_ptr array = arrow::ArrayFromJSON(type, data); rj::Document json_shape; json_shape.Parse(shape.data(), shape.length()); @@ -469,35 +464,10 @@ std::shared_ptr TensorFromJSON(const std::shared_ptr& type, const std::vector& shape, const std::vector& strides, const std::vector& dim_names) { - std::shared_ptr array = ArrayFromJSON(type, data); + std::shared_ptr array = arrow::ArrayFromJSON(type, data); return *Tensor::Make(type, array->data()->buffers[1], 
shape, strides, dim_names); } -Result> RunEndEncodeTableColumns( - const Table& table, const std::vector& column_indices) { - const int num_columns = table.num_columns(); - std::vector> encoded_columns; - encoded_columns.reserve(num_columns); - std::vector> encoded_fields; - encoded_fields.reserve(num_columns); - for (int i = 0; i < num_columns; i++) { - const auto& field = table.schema()->field(i); - if (std::find(column_indices.begin(), column_indices.end(), i) != - column_indices.end()) { - ARROW_ASSIGN_OR_RAISE(auto run_end_encoded, compute::RunEndEncode(table.column(i))); - DCHECK_EQ(run_end_encoded.kind(), Datum::CHUNKED_ARRAY); - encoded_columns.push_back(run_end_encoded.chunked_array()); - auto encoded_type = arrow::run_end_encoded(arrow::int32(), field->type()); - encoded_fields.push_back(field->WithType(encoded_type)); - } else { - encoded_columns.push_back(table.column(i)); - encoded_fields.push_back(field); - } - } - auto updated_schema = arrow::schema(std::move(encoded_fields)); - return Table::Make(std::move(updated_schema), std::move(encoded_columns)); -} - Result> PrintArrayDiff(const ChunkedArray& expected, const ChunkedArray& actual) { if (actual.Equals(expected)) { @@ -972,6 +942,30 @@ Result> DictExtensionType::Deserialize( return std::make_shared(); } +bool BinaryViewExtensionType::ExtensionEquals(const ExtensionType& other) const { + return (other.extension_name() == this->extension_name()); +} + +std::shared_ptr BinaryViewExtensionType::MakeArray( + std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("binary_view", + static_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> BinaryViewExtensionType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized) const { + if (serialized != "binary_view_serialized") { + return Status::Invalid("Type identifier did not match: '", serialized, "'"); + } + if (!storage_type->Equals(*int16())) { + return 
Status::Invalid("Invalid storage type for BinaryViewExtensionType: ", + storage_type->ToString()); + } + return std::make_shared(); +} + bool Complex128Type::ExtensionEquals(const ExtensionType& other) const { return (other.extension_name() == this->extension_name()); } @@ -1004,6 +998,10 @@ std::shared_ptr list_extension_type() { return std::make_shared(); } +std::shared_ptr binary_view_extension_type() { + return std::make_shared(); +} + std::shared_ptr dict_extension_type() { return std::make_shared(); } @@ -1020,19 +1018,19 @@ std::shared_ptr MakeComplex128(const std::shared_ptr& real, } std::shared_ptr ExampleUuid() { - auto arr = ArrayFromJSON( + auto arr = arrow::ArrayFromJSON( fixed_size_binary(16), "[null, \"abcdefghijklmno0\", \"abcdefghijklmno1\", \"abcdefghijklmno2\"]"); return ExtensionType::WrapArray(uuid(), arr); } std::shared_ptr ExampleSmallint() { - auto arr = ArrayFromJSON(int16(), "[-32768, null, 1, 2, 3, 4, 32767]"); + auto arr = arrow::ArrayFromJSON(int16(), "[-32768, null, 1, 2, 3, 4, 32767]"); return ExtensionType::WrapArray(smallint(), arr); } std::shared_ptr ExampleTinyint() { - auto arr = ArrayFromJSON(int8(), "[-128, null, 1, 2, 3, 4, 127]"); + auto arr = arrow::ArrayFromJSON(int8(), "[-128, null, 1, 2, 3, 4, 127]"); return ExtensionType::WrapArray(tinyint(), arr); } @@ -1043,8 +1041,8 @@ std::shared_ptr ExampleDictExtension() { } std::shared_ptr ExampleComplex128() { - auto arr = ArrayFromJSON(struct_({field("", float64()), field("", float64())}), - "[[1.0, -2.5], null, [3.0, -4.5]]"); + auto arr = arrow::ArrayFromJSON(struct_({field("", float64()), field("", float64())}), + "[[1.0, -2.5], null, [3.0, -4.5]]"); return ExtensionType::WrapArray(complex128(), arr); } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 89a986097f8..62bf907a2d8 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -40,7 +40,7 @@ #include "arrow/type_fwd.h" #include 
"arrow/type_traits.h" #include "arrow/util/macros.h" -#include "arrow/util/string_builder.h" +#include "arrow/util/string_util.h" #include "arrow/util/type_fwd.h" // NOTE: failing must be inline in the macros below, to get correct file / line number @@ -49,8 +49,7 @@ // NOTE: using a for loop for this macro allows extra failure messages to be // appended with operator<< #define ASSERT_RAISES(ENUM, expr) \ - for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); \ - !_st.Is##ENUM();) \ + for (::arrow::Status _st = ::arrow::ToStatus((expr)); !_st.Is##ENUM();) \ FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ ENUM) ", but got " \ << _st.ToString() @@ -58,7 +57,7 @@ #define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr) \ do { \ auto _res = (expr); \ - ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ + ::arrow::Status _st = ::arrow::ToStatus(_res); \ if (!_st.Is##ENUM()) { \ FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ ENUM) ", but got " \ @@ -70,7 +69,7 @@ #define EXPECT_RAISES_WITH_MESSAGE_THAT(ENUM, matcher, expr) \ do { \ auto _res = (expr); \ - ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ + ::arrow::Status _st = ::arrow::ToStatus(_res); \ EXPECT_TRUE(_st.Is##ENUM()) << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " \ << ARROW_STRINGIFY(ENUM) ", but got " << _st.ToString(); \ EXPECT_THAT(_st.ToStringWithoutContextLines(), (matcher)); \ @@ -79,13 +78,13 @@ #define EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(code, matcher, expr) \ do { \ auto _res = (expr); \ - ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ + ::arrow::Status _st = ::arrow::ToStatus(_res); \ EXPECT_EQ(_st.CodeAsString(), Status::CodeAsString(code)); \ EXPECT_THAT(_st.ToStringWithoutContextLines(), (matcher)); \ } while (false) -#define ASSERT_OK(expr) \ - for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); !_st.ok();) \ +#define 
ASSERT_OK(expr) \ + for (::arrow::Status _st = ::arrow::ToStatus((expr)); !_st.ok();) \ FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString() #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) @@ -93,22 +92,26 @@ #define ARROW_EXPECT_OK(expr) \ do { \ auto _res = (expr); \ - ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ + ::arrow::Status _st = ::arrow::ToStatus(_res); \ EXPECT_TRUE(_st.ok()) << "'" ARROW_STRINGIFY(expr) "' failed with " \ << _st.ToString(); \ } while (false) -#define ASSERT_NOT_OK(expr) \ - for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); _st.ok();) \ +#define EXPECT_OK ARROW_EXPECT_OK + +#define EXPECT_OK_NO_THROW(expr) EXPECT_NO_THROW(EXPECT_OK(expr)) + +#define ASSERT_NOT_OK(expr) \ + for (::arrow::Status _st = ::arrow::ToStatus((expr)); _st.ok();) \ FAIL() << "'" ARROW_STRINGIFY(expr) "' did not failed" << _st.ToString() -#define ABORT_NOT_OK(expr) \ - do { \ - auto _res = (expr); \ - ::arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ - if (ARROW_PREDICT_FALSE(!_st.ok())) { \ - _st.Abort(); \ - } \ +#define ABORT_NOT_OK(expr) \ + do { \ + auto _res = (expr); \ + ::arrow::Status _st = ::arrow::ToStatus(_res); \ + if (ARROW_PREDICT_FALSE(!_st.ok())) { \ + _st.Abort(); \ + } \ } while (false); #define ASSIGN_OR_HANDLE_ERROR_IMPL(handle_error, status_name, lhs, rexpr) \ @@ -139,7 +142,7 @@ // A generalized version of GTest's SCOPED_TRACE that takes arbitrary arguments. // ARROW_SCOPED_TRACE("some variable = ", some_variable, ...) -#define ARROW_SCOPED_TRACE(...) SCOPED_TRACE(::arrow::util::StringBuilder(__VA_ARGS__)) +#define ARROW_SCOPED_TRACE(...) 
SCOPED_TRACE(::arrow::internal::JoinToString(__VA_ARGS__)) namespace arrow { @@ -182,6 +185,9 @@ using BaseBinaryArrowTypes = using BaseBinaryOrBinaryViewLikeArrowTypes = ::testing::Types; +using AllBinaryOrBinrayViewLikeArrowTypes = + ::testing::Types; using BinaryArrowTypes = ::testing::Types; @@ -370,10 +376,6 @@ std::shared_ptr TensorFromJSON(const std::shared_ptr& type, const std::vector& strides = {}, const std::vector& dim_names = {}); -ARROW_TESTING_EXPORT -Result> RunEndEncodeTableColumns( - const Table& table, const std::vector& column_indices); - // Given an array, return a new identical array except for one validity bit // set to a new value. // This is useful to force the underlying "value" of null entries to otherwise diff --git a/cpp/src/arrow/testing/gtest_util_test.cc b/cpp/src/arrow/testing/gtest_util_test.cc index 8dc496fa500..663d1549be1 100644 --- a/cpp/src/arrow/testing/gtest_util_test.cc +++ b/cpp/src/arrow/testing/gtest_util_test.cc @@ -287,22 +287,12 @@ TEST(TestWithinUlp, Float) { TEST(AssertTestWithinUlp, Basics) { AssertWithinUlp(123.4567, 123.45670000000015, 11); AssertWithinUlp(123.456f, 123.456085f, 11); +#ifndef _WIN32 + // GH-47442 EXPECT_FATAL_FAILURE(AssertWithinUlp(123.4567, 123.45670000000015, 10), "not within 10 ulps"); EXPECT_FATAL_FAILURE(AssertWithinUlp(123.456f, 123.456085f, 10), "not within 10 ulps"); +#endif } -TEST(RunEndEncodeGtestUtilTest, SchemaTypeIsModified) { - std::shared_ptr
table = - arrow::TableFromJSON(arrow::schema({arrow::field("col", arrow::utf8())}), {R"([ - {"col": "a"}, - {"col": "b"}, - {"col": "c"}, - {"col": "d"} - ])"}); - ASSERT_OK_AND_ASSIGN(std::shared_ptr
ree_table, - RunEndEncodeTableColumns(*table, {0})); - ASSERT_TRUE(ree_table->schema()->field(0)->type()->Equals( - arrow::run_end_encoded(arrow::int32(), arrow::utf8()))); -} } // namespace arrow diff --git a/cpp/src/arrow/testing/matchers.h b/cpp/src/arrow/testing/matchers.h index b4625b3922e..0e1bae47381 100644 --- a/cpp/src/arrow/testing/matchers.h +++ b/cpp/src/arrow/testing/matchers.h @@ -75,7 +75,7 @@ class AnyOfJSONMatcher { "AnyOfJSON only supported for std::shared_ptr"); Impl(std::shared_ptr type, std::string array_json) : type_(std::move(type)), array_json_(std::move(array_json)) { - array = ArrayFromJSON(type_, array_json_); + array = arrow::ArrayFromJSON(type_, array_json_); } void DescribeTo(std::ostream* os) const override { *os << "matches at least one scalar from "; @@ -250,7 +250,7 @@ class ErrorMatcher { bool MatchAndExplain(const Res& maybe_value, testing::MatchResultListener* listener) const override { - const Status& status = internal::GenericToStatus(maybe_value); + const Status& status = ToStatus(maybe_value); testing::StringMatchResultListener value_listener; bool match = status.code() == code_; @@ -294,7 +294,7 @@ class OkMatcher { bool MatchAndExplain(const Res& maybe_value, testing::MatchResultListener* listener) const override { - const Status& status = internal::GenericToStatus(maybe_value); + const Status& status = ToStatus(maybe_value); const bool match = status.ok(); *listener << "whose " << (match ? 
"non-error matches" : "error doesn't match"); @@ -415,7 +415,7 @@ DataEqMatcher DataEq(Data&& dat) { /// Constructs an array with ArrayFromJSON against which arguments are matched inline DataEqMatcher DataEqArray(const std::shared_ptr& type, std::string_view json) { - return DataEq(ArrayFromJSON(type, json)); + return DataEq(arrow::ArrayFromJSON(type, json)); } /// Constructs an array from a vector of optionals against which arguments are matched diff --git a/cpp/src/arrow/testing/meson.build b/cpp/src/arrow/testing/meson.build index 8b03176aa42..4d323319be5 100644 --- a/cpp/src/arrow/testing/meson.build +++ b/cpp/src/arrow/testing/meson.build @@ -28,7 +28,6 @@ install_headers( 'gtest_util.h', 'matchers.h', 'math.h', - 'pch.h', 'process.h', 'random.h', 'uniform_real.h', diff --git a/cpp/src/arrow/testing/pch.h b/cpp/src/arrow/testing/pch.h deleted file mode 100644 index e544ad806ad..00000000000 --- a/cpp/src/arrow/testing/pch.h +++ /dev/null @@ -1,25 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Often-used headers, for precompiling. -// If updating this header, please make sure you check compilation speed -// before checking in. 
Adding headers which are not used extremely often -// may incur a slowdown, since it makes the precompiled header heavier to load. - -#include "arrow/pch.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/testing/util.h" diff --git a/cpp/src/arrow/testing/process.cc b/cpp/src/arrow/testing/process.cc index b1c4fa55505..40538bd9862 100644 --- a/cpp/src/arrow/testing/process.cc +++ b/cpp/src/arrow/testing/process.cc @@ -17,10 +17,10 @@ #include "arrow/testing/process.h" #include "arrow/result.h" +#include "arrow/util/config.h" -#define BOOST_PROCESS_AVAILABLE -#ifdef __EMSCRIPTEN__ -# undef BOOST_PROCESS_AVAILABLE +#ifdef ARROW_ENABLE_THREADING +# define BOOST_PROCESS_AVAILABLE #endif #ifdef BOOST_PROCESS_AVAILABLE @@ -39,11 +39,18 @@ # ifdef __APPLE__ # include # endif -# include # include -# else -# include # endif +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include # include # else // We need BOOST_USE_WINDOWS_H definition with MinGW when we use @@ -169,7 +176,7 @@ class Process::Impl { for (const auto& kv : process::environment::current()) { env[kv.key()] = process::environment::value(kv.value()); } - env["PATH"] = process::environment::value(current_exe.parent_path()); + env["PATH"] = process::environment::value(current_exe.parent_path().string()); executable_ = process::environment::find_executable(name, env); # else executable_ = process::search_path(name, {current_exe.parent_path()}); @@ -274,6 +281,8 @@ class Process::Impl { # if defined(__linux__) path = filesystem::canonical("/proc/self/exe", error_code); +# elif defined(__FreeBSD__) + path = filesystem::canonical("/proc/curproc/file", error_code); # elif defined(__APPLE__) char buf[PATH_MAX + 1]; uint32_t bufsize = sizeof(buf); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 40a67b40d76..5f95638b7d6 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -43,6 +43,7 @@ 
#include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging_internal.h" #include "arrow/util/pcg_random.h" @@ -54,55 +55,85 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; using internal::ToChars; +using util::Float16; namespace random { namespace { +template +struct GeneratorFactory { + GeneratorFactory(ValueType min, ValueType max) : min_(min), max_(max) {} + + auto operator()(pcg32_fast* rng) const { + return [dist = DistributionType(min_, max_), rng]() mutable { + return static_cast(dist(*rng)); + }; + } + + private: + ValueType min_; + ValueType max_; +}; + +template +struct GeneratorFactory { + GeneratorFactory(Float16 min, Float16 max) : min_(min.ToFloat()), max_(max.ToFloat()) {} + + auto operator()(pcg32_fast* rng) const { + return [dist = DistributionType(min_, max_), rng]() mutable { + return Float16(dist(*rng)).bits(); + }; + } + + private: + float min_; + float max_; +}; + template struct GenerateOptions { + static constexpr bool kIsHalfFloat = std::is_same_v; + using PhysicalType = std::conditional_t; + using FactoryType = GeneratorFactory; + GenerateOptions(SeedType seed, ValueType min, ValueType max, double probability, double nan_probability = 0.0) - : min_(min), - max_(max), + : generator_factory_(FactoryType(min, max)), seed_(seed), probability_(probability), nan_probability_(nan_probability) {} void GenerateData(uint8_t* buffer, size_t n) { - GenerateTypedData(reinterpret_cast(buffer), n); + GenerateTypedData(reinterpret_cast(buffer), n); } template - typename std::enable_if::value>::type GenerateTypedData( - V* data, size_t n) { + typename std::enable_if && !kIsHalfFloat>::type + GenerateTypedData(V* data, size_t n) { GenerateTypedDataNoNan(data, n); } template - typename std::enable_if::value>::type GenerateTypedData( - V* data, size_t n) { + typename 
std::enable_if || kIsHalfFloat>::type + GenerateTypedData(V* data, size_t n) { if (nan_probability_ == 0.0) { GenerateTypedDataNoNan(data, n); return; } pcg32_fast rng(seed_++); - DistributionType dist(min_, max_); + auto gen = generator_factory_(&rng); ::arrow::random::bernoulli_distribution nan_dist(nan_probability_); - const ValueType nan_value = std::numeric_limits::quiet_NaN(); + const PhysicalType nan_value = get_nan(); - // A static cast is required due to the int16 -> int8 handling. - std::generate(data, data + n, [&] { - return nan_dist(rng) ? nan_value : static_cast(dist(rng)); - }); + std::generate(data, data + n, [&] { return nan_dist(rng) ? nan_value : gen(); }); } - void GenerateTypedDataNoNan(ValueType* data, size_t n) { + void GenerateTypedDataNoNan(PhysicalType* data, size_t n) { pcg32_fast rng(seed_++); - DistributionType dist(min_, max_); + auto gen = generator_factory_(&rng); - // A static cast is required due to the int16 -> int8 handling. - std::generate(data, data + n, [&] { return static_cast(dist(rng)); }); + std::generate(data, data + n, [&] { return gen(); }); } void GenerateBitmap(uint8_t* buffer, size_t n, int64_t* null_count) { @@ -121,13 +152,49 @@ struct GenerateOptions { if (null_count != nullptr) *null_count = count; } - ValueType min_; - ValueType max_; + static constexpr PhysicalType get_nan() { + if constexpr (kIsHalfFloat) { + return std::numeric_limits::quiet_NaN().bits(); + } else { + return std::numeric_limits::quiet_NaN(); + } + } + + FactoryType generator_factory_; SeedType seed_; double probability_; double nan_probability_; }; +void GenerateFullDayMillisNoNan(uint8_t* buffer, size_t n) { + int64_t* data = reinterpret_cast(buffer); + constexpr int64_t kFullDayMillis = 1000 * 60 * 60 * 24; + std::for_each(data, data + n, [&](int64_t& v) { return v *= kFullDayMillis; }); +} + +template +std::shared_ptr> GenerateNumericArray(int64_t size, + OptionType options, + int64_t alignment, + MemoryPool* memory_pool) { + using CType = 
typename ArrowType::c_type; + auto type = TypeTraits::type_singleton(); + BufferVector buffers{2}; + + int64_t null_count = 0; + buffers[0] = *AllocateEmptyBitmap(size, alignment, memory_pool); + options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count); + + buffers[1] = *AllocateBuffer(sizeof(CType) * size, alignment, memory_pool); + options.GenerateData(buffers[1]->mutable_data(), size); + if (std::is_same::value) { + GenerateFullDayMillisNoNan(buffers[1]->mutable_data(), size); + } + + auto array_data = ArrayData::Make(type, size, buffers, null_count); + return std::make_shared>(array_data); +} + } // namespace std::shared_ptr RandomArrayGenerator::NullBitmap(int64_t size, @@ -176,33 +243,6 @@ std::shared_ptr RandomArrayGenerator::Boolean(int64_t size, return std::make_shared(array_data); } -void GenerateFullDayMillisNoNan(uint8_t* buffer, size_t n) { - int64_t* data = reinterpret_cast(buffer); - constexpr int64_t kFullDayMillis = 1000 * 60 * 60 * 24; - std::for_each(data, data + n, [&](int64_t& v) { return v *= kFullDayMillis; }); -} - -template -static std::shared_ptr> GenerateNumericArray( - int64_t size, OptionType options, int64_t alignment, MemoryPool* memory_pool) { - using CType = typename ArrowType::c_type; - auto type = TypeTraits::type_singleton(); - BufferVector buffers{2}; - - int64_t null_count = 0; - buffers[0] = *AllocateEmptyBitmap(size, alignment, memory_pool); - options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count); - - buffers[1] = *AllocateBuffer(sizeof(CType) * size, alignment, memory_pool); - options.GenerateData(buffers[1]->mutable_data(), size); - if (std::is_same::value) { - GenerateFullDayMillisNoNan(buffers[1]->mutable_data(), size); - } - - auto array_data = ArrayData::Make(type, size, buffers, null_count); - return std::make_shared>(array_data); -} - #define PRIMITIVE_RAND_IMPL(Name, CType, ArrowType, Distribution) \ std::shared_ptr RandomArrayGenerator::Name( \ int64_t size, CType min, CType max, double 
probability, int64_t alignment, \ @@ -226,8 +266,6 @@ PRIMITIVE_RAND_INTEGER_IMPL(UInt32, uint32_t, UInt32Type) PRIMITIVE_RAND_INTEGER_IMPL(Int32, int32_t, Int32Type) PRIMITIVE_RAND_INTEGER_IMPL(UInt64, uint64_t, UInt64Type) PRIMITIVE_RAND_INTEGER_IMPL(Int64, int64_t, Int64Type) -// Generate 16bit values for half-float -PRIMITIVE_RAND_INTEGER_IMPL(Float16, int16_t, HalfFloatType) std::shared_ptr RandomArrayGenerator::Date64(int64_t size, int64_t min, int64_t max, double null_probability, @@ -239,6 +277,25 @@ std::shared_ptr RandomArrayGenerator::Date64(int64_t size, int64_t min, memory_pool); } +std::shared_ptr RandomArrayGenerator::Float16(int64_t size, uint16_t min, + uint16_t max, + double null_probability, + int64_t alignment, + MemoryPool* memory_pool) { + return this->Float16(size, Float16::FromBits(min), Float16::FromBits(max), + null_probability, /*nan_probability=*/0, alignment, memory_pool); +} + +std::shared_ptr RandomArrayGenerator::Float16( + int64_t size, util::Float16 min, util::Float16 max, double null_probability, + double nan_probability, int64_t alignment, MemoryPool* memory_pool) { + using OptionType = + GenerateOptions>; + OptionType options(seed(), min, max, null_probability, nan_probability); + return GenerateNumericArray(size, options, alignment, + memory_pool); +} + std::shared_ptr RandomArrayGenerator::Float32(int64_t size, float min, float max, double null_probability, double nan_probability, @@ -1087,10 +1144,23 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t GENERATE_INTEGRAL_CASE(Int32Type); GENERATE_INTEGRAL_CASE(UInt64Type); GENERATE_INTEGRAL_CASE(Int64Type); - GENERATE_INTEGRAL_CASE_VIEW(Int16Type, HalfFloatType); GENERATE_FLOATING_CASE(FloatType, Float32); GENERATE_FLOATING_CASE(DoubleType, Float64); + case Type::type::HALF_FLOAT: { + using ValueType = util::Float16; + const ValueType min_value = GetMetadata( + field.metadata().get(), "min", std::numeric_limits::min()); + const ValueType max_value = 
GetMetadata( + field.metadata().get(), "max", std::numeric_limits::max()); + const double nan_probability = + GetMetadata(field.metadata().get(), "nan_probability", 0); + VALIDATE_MIN_MAX(min_value, max_value); + VALIDATE_RANGE(nan_probability, 0.0, 1.0); + return Float16(length, min_value, max_value, null_probability, nan_probability, + alignment, memory_pool); + } + case Type::type::STRING: case Type::type::BINARY: { const auto min_length = diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index ad87b121059..d9122915a09 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -28,6 +28,7 @@ #include "arrow/testing/uniform_real.h" #include "arrow/testing/visibility.h" #include "arrow/type.h" +#include "arrow/util/float16.h" namespace arrow { @@ -198,11 +199,33 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// \param[in] memory_pool memory pool to allocate memory from /// /// \return a generated Array - std::shared_ptr Float16(int64_t size, int16_t min, int16_t max, + /// + /// \deprecated Deprecated in 22.0.0. Use the other Float16() method that accepts + /// nan_probability as a parameter + ARROW_DEPRECATED( + "Deprecated in 22.0.0. 
Use the other Float16() method that accepts nan_probability " + "as a parameter") + std::shared_ptr Float16(int64_t size, uint16_t min, uint16_t max, double null_probability = 0, int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random HalfFloatArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a value being null + /// \param[in] nan_probability the probability of a value being NaN + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr Float16(int64_t size, util::Float16 min, util::Float16 max, + double null_probability = 0, double nan_probability = 0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random FloatArray /// /// \param[in] size the size of the array to generate @@ -281,8 +304,9 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { return Int64(size, static_cast(min), static_cast(max), null_probability, alignment, memory_pool); case Type::HALF_FLOAT: - return Float16(size, static_cast(min), static_cast(max), - null_probability, alignment, memory_pool); + return Float16(size, util::Float16::FromBits(static_cast(min)), + util::Float16::FromBits(static_cast(max)), + null_probability, /*nan_probability=*/0, alignment, memory_pool); case Type::FLOAT: return Float32(size, static_cast(min), static_cast(max), null_probability, /*nan_probability=*/0, alignment, memory_pool); diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 6f8621f8e99..279fb6dc91f 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ 
-26,12 +26,14 @@ #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/float16.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/pcg_random.h" namespace arrow { using internal::checked_cast; +using util::Float16; namespace random { @@ -242,8 +244,14 @@ TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { auto array = this->Downcast(batch->column(0)); for (auto slot : *array) { if (!slot.has_value()) continue; - ASSERT_GE(slot, typename TypeParam::c_type(0)); - ASSERT_LE(slot, typename TypeParam::c_type(127)); + if constexpr (is_half_float_type::value) { + const auto f16_slot = Float16::FromBits(*slot); + ASSERT_GE(f16_slot, Float16(0)); + ASSERT_LE(f16_slot, Float16(127)); + } else { + ASSERT_GE(slot, typename TypeParam::c_type(0)); + ASSERT_LE(slot, typename TypeParam::c_type(127)); + } } } @@ -256,7 +264,11 @@ TYPED_TEST(RandomNumericArrayTest, EmptyRange) { auto array = this->Downcast(batch->column(0)); for (auto slot : *array) { if (!slot.has_value()) continue; - ASSERT_EQ(slot, typename TypeParam::c_type(42)); + if constexpr (is_half_float_type::value) { + ASSERT_EQ(Float16::FromBits(*slot), Float16(42)); + } else { + ASSERT_EQ(slot, typename TypeParam::c_type(42)); + } } } @@ -359,6 +371,18 @@ TEST(TypeSpecificTests, DictionaryValues) { ASSERT_EQ(16, array->dictionary()->length()); } +TEST(TypeSpecificTests, Float16Nan) { + auto field = arrow::field("float16", float16(), + key_value_metadata({{"nan_probability", "1.0"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast>(base_array); + ASSERT_OK(array->ValidateFull()); + for (const auto& value : *array) { + ASSERT_TRUE(!value.has_value() || Float16::FromBits(*value).is_nan()); + } +} + TEST(TypeSpecificTests, Float32Nan) { auto field = arrow::field("float32", float32(), 
key_value_metadata({{"nan_probability", "1.0"}})); diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc index e5e53801df9..b0c8deae36c 100644 --- a/cpp/src/arrow/testing/util.cc +++ b/cpp/src/arrow/testing/util.cc @@ -90,6 +90,16 @@ void random_ascii(int64_t n, uint32_t seed, uint8_t* out) { rand_uniform_int(n, seed, static_cast('A'), static_cast('z'), out); } +void random_alnum(int64_t n, uint32_t seed, uint8_t* out) { + static const char charset[] = + "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + pcg32_fast gen(seed); + std::uniform_int_distribution d(0, sizeof(charset) - 2); + std::generate(out, out + n, [&d, &gen] { return charset[d(gen)]; }); +} + int64_t CountNulls(const std::vector& valid_bytes) { return static_cast(std::count(valid_bytes.cbegin(), valid_bytes.cend(), '\0')); } diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index 8cc28a8b073..c2d6ca4d156 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -64,6 +64,7 @@ ARROW_TESTING_EXPORT void random_bytes(int64_t n, uint32_t seed, uint8_t* out); ARROW_TESTING_EXPORT std::string random_string(int64_t n, uint32_t seed); ARROW_TESTING_EXPORT int32_t DecimalSize(int32_t precision); ARROW_TESTING_EXPORT void random_ascii(int64_t n, uint32_t seed, uint8_t* out); +ARROW_TESTING_EXPORT void random_alnum(int64_t n, uint32_t seed, uint8_t* out); ARROW_TESTING_EXPORT int64_t CountNulls(const std::vector& valid_bytes); ARROW_TESTING_EXPORT Status MakeRandomByteBuffer(int64_t length, MemoryPool* pool, diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 86cd2842c78..2e9d860a8d6 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1329,9 +1329,9 @@ bool RunEndEncodedType::RunEndTypeValid(const DataType& run_end_type) { namespace { -std::unordered_multimap CreateNameToIndexMap( +std::unordered_multimap CreateNameToIndexMap( const FieldVector& fields) { - std::unordered_multimap 
name_to_index; + std::unordered_multimap name_to_index; for (size_t i = 0; i < fields.size(); ++i) { name_to_index.emplace(fields[i]->name(), static_cast(i)); } @@ -1339,8 +1339,8 @@ std::unordered_multimap CreateNameToIndexMap( } template -int LookupNameIndex(const std::unordered_multimap& name_to_index, - const std::string& name) { +int LookupNameIndex(const std::unordered_multimap& name_to_index, + std::string_view name) { auto p = name_to_index.equal_range(name); auto it = p.first; if (it == p.second) { @@ -1362,7 +1362,7 @@ class StructType::Impl { explicit Impl(const FieldVector& fields) : name_to_index_(CreateNameToIndexMap(fields)) {} - const std::unordered_multimap name_to_index_; + const std::unordered_multimap name_to_index_; }; StructType::StructType(const FieldVector& fields) @@ -2150,7 +2150,7 @@ std::vector FieldRef::FindAll(const FieldVector& fields) const { auto maybe_field = FieldPathGetImpl::Get(&path, FieldSelector(fields_), &out_of_range_depth); - DCHECK_OK(maybe_field.status()); + DCHECK_OK(maybe_field); if (maybe_field.ValueOrDie() != nullptr) { return {path}; @@ -2188,7 +2188,7 @@ std::vector FieldRef::FindAll(const FieldVector& fields) const { void Add(const FieldPath& prefix, const FieldPath& suffix, const FieldVector& fields) { auto maybe_field = suffix.Get(fields); - DCHECK_OK(maybe_field.status()); + DCHECK_OK(maybe_field); referents.push_back(std::move(maybe_field).ValueOrDie()); std::vector concatenated_indices(prefix.indices().size() + @@ -2279,7 +2279,7 @@ class Schema::Impl { FieldVector fields_; Endianness endianness_; - std::unordered_multimap name_to_index_; + std::unordered_multimap name_to_index_; std::shared_ptr metadata_; }; @@ -2363,16 +2363,16 @@ bool Schema::Equals(const std::shared_ptr& other, bool check_metadata) c return Equals(*other, check_metadata); } -std::shared_ptr Schema::GetFieldByName(const std::string& name) const { +std::shared_ptr Schema::GetFieldByName(std::string_view name) const { int i = 
GetFieldIndex(name); return i == -1 ? nullptr : impl_->fields_[i]; } -int Schema::GetFieldIndex(const std::string& name) const { +int Schema::GetFieldIndex(std::string_view name) const { return LookupNameIndex(impl_->name_to_index_, name); } -std::vector Schema::GetAllFieldIndices(const std::string& name) const { +std::vector Schema::GetAllFieldIndices(std::string_view name) const { std::vector result; auto p = impl_->name_to_index_.equal_range(name); for (auto it = p.first; it != p.second; ++it) { @@ -2384,7 +2384,7 @@ std::vector Schema::GetAllFieldIndices(const std::string& name) const { return result; } -Status Schema::CanReferenceFieldByName(const std::string& name) const { +Status Schema::CanReferenceFieldByName(std::string_view name) const { if (GetFieldByName(name) == nullptr) { return Status::Invalid("Field named '", name, "' not found or not unique in the schema."); @@ -2399,7 +2399,7 @@ Status Schema::CanReferenceFieldsByNames(const std::vector& names) return Status::OK(); } -FieldVector Schema::GetAllFieldsByName(const std::string& name) const { +FieldVector Schema::GetAllFieldsByName(std::string_view name) const { FieldVector result; auto p = impl_->name_to_index_.equal_range(name); for (auto it = p.first; it != p.second; ++it) { @@ -2580,7 +2580,7 @@ class SchemaBuilder::Impl { private: FieldVector fields_; - std::unordered_multimap name_to_index_; + std::unordered_multimap name_to_index_; std::shared_ptr metadata_; ConflictPolicy policy_; Field::MergeOptions field_merge_options_; @@ -2641,7 +2641,8 @@ Status SchemaBuilder::AddSchemas(const std::vector>& sch } Status SchemaBuilder::AddMetadata(const KeyValueMetadata& metadata) { - impl_->metadata_ = metadata.Copy(); + impl_->metadata_ = + impl_->metadata_ ? 
impl_->metadata_->Merge(metadata) : metadata.Copy(); return Status::OK(); } @@ -3225,12 +3226,6 @@ std::shared_ptr map(std::shared_ptr key_type, keys_sorted); } -std::shared_ptr map(std::shared_ptr key_field, - std::shared_ptr item_field, bool keys_sorted) { - return std::make_shared(std::move(key_field), std::move(item_field), - keys_sorted); -} - std::shared_ptr fixed_size_list(std::shared_ptr value_type, int32_t list_size) { return std::make_shared(std::move(value_type), list_size); @@ -3288,6 +3283,8 @@ std::shared_ptr dense_union(FieldVector child_fields, return std::make_shared(std::move(child_fields), std::move(type_codes)); } +namespace { + FieldVector FieldsFromArraysAndNames(std::vector names, const ArrayVector& arrays) { FieldVector fields(arrays.size()); @@ -3307,6 +3304,8 @@ FieldVector FieldsFromArraysAndNames(std::vector names, return fields; } +} // namespace + std::shared_ptr sparse_union(const ArrayVector& children, std::vector field_names, std::vector type_codes) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 0dd1d56c652..f68d2dcb619 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -178,7 +178,7 @@ class ARROW_EXPORT DataType : public std::enable_shared_from_this, virtual DataTypeLayout layout() const = 0; /// \brief Return the type category - Type::type id() const { return id_; } + constexpr Type::type id() const { return id_; } /// \brief Return the type category of the storage type virtual Type::type storage_id() const { return id_; } @@ -292,6 +292,7 @@ std::ostream& operator<<(std::ostream& os, const TypeHolder& type); /// - if a `PhysicalType` alias exists in the concrete type class, return /// an instance of `PhysicalType`. /// - otherwise, return the input type itself. 
+ARROW_EXPORT std::shared_ptr GetPhysicalType(const std::shared_ptr& type); /// \brief Base class for all fixed-width data types @@ -2366,19 +2367,19 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, std::vector field_names() const; /// Returns null if name not found - std::shared_ptr GetFieldByName(const std::string& name) const; + std::shared_ptr GetFieldByName(std::string_view name) const; /// \brief Return the indices of all fields having this name in sorted order - FieldVector GetAllFieldsByName(const std::string& name) const; + FieldVector GetAllFieldsByName(std::string_view name) const; /// Returns -1 if name not found - int GetFieldIndex(const std::string& name) const; + int GetFieldIndex(std::string_view name) const; /// Return the indices of all fields having this name - std::vector GetAllFieldIndices(const std::string& name) const; + std::vector GetAllFieldIndices(std::string_view name) const; /// Indicate if field named `name` can be found unambiguously in the schema. - Status CanReferenceFieldByName(const std::string& name) const; + Status CanReferenceFieldByName(std::string_view name) const; /// Indicate if fields named `names` can be found unambiguously in the schema. 
Status CanReferenceFieldsByNames(const std::vector& names) const; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 5a2fbde0232..be26c40dc1f 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -46,6 +46,7 @@ class Future; namespace util { class Codec; class CodecOptions; +class Float16; } // namespace util class Buffer; @@ -242,12 +243,17 @@ _NUMERIC_TYPE_DECL(UInt8) _NUMERIC_TYPE_DECL(UInt16) _NUMERIC_TYPE_DECL(UInt32) _NUMERIC_TYPE_DECL(UInt64) -_NUMERIC_TYPE_DECL(HalfFloat) _NUMERIC_TYPE_DECL(Float) _NUMERIC_TYPE_DECL(Double) #undef _NUMERIC_TYPE_DECL +class HalfFloatType; +using HalfFloatArray = NumericArray; +class HalfFloatBuilder; +struct HalfFloatScalar; +using HalfFloatTensor = NumericTensor; + enum class DateUnit : char { DAY = 0, MILLI = 1 }; class DateType; diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 7610be8a47f..cb17f1ac3fc 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -746,25 +746,31 @@ TEST(TestSchemaBuilder, WithMetadata) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto metadata = key_value_metadata({{"foo", "bar"}}); + auto metadata2 = key_value_metadata({{"foo2", "bar2"}}); + auto merged_metadata = metadata->Merge(*metadata2); SchemaBuilder builder; ASSERT_OK(builder.AddMetadata(*metadata)); ASSERT_OK_AND_ASSIGN(auto schema, builder.Finish()); AssertSchemaEqual(schema, ::arrow::schema({})->WithMetadata(metadata)); + ASSERT_OK(builder.AddMetadata(*metadata2)); + ASSERT_OK_AND_ASSIGN(schema, builder.Finish()); + AssertSchemaEqual(schema, ::arrow::schema({})->WithMetadata(merged_metadata)); + ASSERT_OK(builder.AddField(f0)); ASSERT_OK_AND_ASSIGN(schema, builder.Finish()); - AssertSchemaEqual(schema, ::arrow::schema({f0})->WithMetadata(metadata)); + AssertSchemaEqual(schema, ::arrow::schema({f0})->WithMetadata(merged_metadata)); - SchemaBuilder other_builder{::arrow::schema({})->WithMetadata(metadata)}; + SchemaBuilder 
other_builder{::arrow::schema({})->WithMetadata(merged_metadata)}; ASSERT_OK(other_builder.AddField(f1)); ASSERT_OK_AND_ASSIGN(schema, other_builder.Finish()); - AssertSchemaEqual(schema, ::arrow::schema({f1})->WithMetadata(metadata)); + AssertSchemaEqual(schema, ::arrow::schema({f1})->WithMetadata(merged_metadata)); other_builder.Reset(); - ASSERT_OK(other_builder.AddField(f1->WithMetadata(metadata))); + ASSERT_OK(other_builder.AddField(f1->WithMetadata(merged_metadata))); ASSERT_OK_AND_ASSIGN(schema, other_builder.Finish()); - AssertSchemaEqual(schema, ::arrow::schema({f1->WithMetadata(metadata)})); + AssertSchemaEqual(schema, ::arrow::schema({f1->WithMetadata(merged_metadata)})); } TEST(TestSchemaBuilder, IncrementalConstruction) { diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 6ed495dcb2b..1b7a02e1085 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -316,6 +316,11 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return float16(); } }; +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = HalfFloatType; +}; + template <> struct TypeTraits { using ArrayType = Decimal32Array; @@ -1237,6 +1242,22 @@ constexpr bool is_binary(Type::type type_id) { return false; } +/// \brief Check for a binary or binary view (non-string) type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a binary type one +constexpr bool is_binary_or_binary_view(Type::type type_id) { + switch (type_id) { + case Type::BINARY: + case Type::LARGE_BINARY: + case Type::BINARY_VIEW: + return true; + default: + break; + } + return false; +} + /// \brief Check for a string type /// /// \param[in] type_id the type-id to check @@ -1252,6 +1273,22 @@ constexpr bool is_string(Type::type type_id) { return false; } +/// \brief Check for a string or string view type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a string type one +constexpr 
bool is_string_or_string_view(Type::type type_id) { + switch (type_id) { + case Type::STRING: + case Type::LARGE_STRING: + case Type::STRING_VIEW: + return true; + default: + break; + } + return false; +} + /// \brief Check for a binary-view-like type (i.e. string view and binary view) /// /// \param[in] type_id the type-id to check @@ -1491,7 +1528,7 @@ constexpr bool is_union(Type::type type_id) { /// /// For Type::FIXED_SIZE_BINARY, you will instead need to inspect the concrete /// DataType to get this information. -static inline int bit_width(Type::type type_id) { +constexpr int bit_width(Type::type type_id) { switch (type_id) { case Type::BOOL: return 1; @@ -1547,7 +1584,7 @@ static inline int bit_width(Type::type type_id) { /// /// \param[in] type_id the type-id to check /// \return the offsets bit width, or 0 if the type does not have offsets -static inline int offset_bit_width(Type::type type_id) { +constexpr int offset_bit_width(Type::type type_id) { switch (type_id) { case Type::STRING: case Type::BINARY: @@ -1596,7 +1633,7 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index); /// \return whether type is an integer type /// /// Convenience for checking using the type's id -static inline bool is_integer(const DataType& type) { return is_integer(type.id()); } +constexpr bool is_integer(const DataType& type) { return is_integer(type.id()); } /// \brief Check for a signed integer type /// @@ -1604,7 +1641,7 @@ static inline bool is_integer(const DataType& type) { return is_integer(type.id( /// \return whether type is a signed integer type /// /// Convenience for checking using the type's id -static inline bool is_signed_integer(const DataType& type) { +constexpr bool is_signed_integer(const DataType& type) { return is_signed_integer(type.id()); } @@ -1614,7 +1651,7 @@ static inline bool is_signed_integer(const DataType& type) { /// \return whether type is an unsigned integer type /// /// Convenience for checking using the type's id 
-static inline bool is_unsigned_integer(const DataType& type) { +constexpr bool is_unsigned_integer(const DataType& type) { return is_unsigned_integer(type.id()); } @@ -1624,7 +1661,7 @@ static inline bool is_unsigned_integer(const DataType& type) { /// \return whether type is a floating point type /// /// Convenience for checking using the type's id -static inline bool is_floating(const DataType& type) { return is_floating(type.id()); } +constexpr bool is_floating(const DataType& type) { return is_floating(type.id()); } /// \brief Check for a numeric type (number except boolean type) /// @@ -1632,7 +1669,7 @@ static inline bool is_floating(const DataType& type) { return is_floating(type.i /// \return whether type is a numeric type /// /// Convenience for checking using the type's id -static inline bool is_numeric(const DataType& type) { return is_numeric(type.id()); } +constexpr bool is_numeric(const DataType& type) { return is_numeric(type.id()); } /// \brief Check for a decimal type /// @@ -1640,7 +1677,7 @@ static inline bool is_numeric(const DataType& type) { return is_numeric(type.id( /// \return whether type is a decimal type /// /// Convenience for checking using the type's id -static inline bool is_decimal(const DataType& type) { return is_decimal(type.id()); } +constexpr bool is_decimal(const DataType& type) { return is_decimal(type.id()); } /// \brief Check for a primitive type /// @@ -1648,7 +1685,7 @@ static inline bool is_decimal(const DataType& type) { return is_decimal(type.id( /// \return whether type is a primitive type /// /// Convenience for checking using the type's id -static inline bool is_primitive(const DataType& type) { return is_primitive(type.id()); } +constexpr bool is_primitive(const DataType& type) { return is_primitive(type.id()); } /// \brief Check for a binary or string-like type (except fixed-size binary) /// @@ -1656,7 +1693,7 @@ static inline bool is_primitive(const DataType& type) { return is_primitive(type /// \return whether 
type is a binary or string-like type /// /// Convenience for checking using the type's id -static inline bool is_base_binary_like(const DataType& type) { +constexpr bool is_base_binary_like(const DataType& type) { return is_base_binary_like(type.id()); } @@ -1666,9 +1703,7 @@ static inline bool is_base_binary_like(const DataType& type) { /// \return whether type is a binary-like type /// /// Convenience for checking using the type's id -static inline bool is_binary_like(const DataType& type) { - return is_binary_like(type.id()); -} +constexpr bool is_binary_like(const DataType& type) { return is_binary_like(type.id()); } /// \brief Check for a large-binary-like type /// @@ -1676,7 +1711,7 @@ static inline bool is_binary_like(const DataType& type) { /// \return whether type is a large-binary-like type /// /// Convenience for checking using the type's id -static inline bool is_large_binary_like(const DataType& type) { +constexpr bool is_large_binary_like(const DataType& type) { return is_large_binary_like(type.id()); } @@ -1686,7 +1721,7 @@ static inline bool is_large_binary_like(const DataType& type) { /// \return whether type is a binary type /// /// Convenience for checking using the type's id -static inline bool is_binary(const DataType& type) { return is_binary(type.id()); } +constexpr bool is_binary(const DataType& type) { return is_binary(type.id()); } /// \brief Check for a string type /// @@ -1694,7 +1729,7 @@ static inline bool is_binary(const DataType& type) { return is_binary(type.id()) /// \return whether type is a string type /// /// Convenience for checking using the type's id -static inline bool is_string(const DataType& type) { return is_string(type.id()); } +constexpr bool is_string(const DataType& type) { return is_string(type.id()); } /// \brief Check for a binary-view-like type /// @@ -1702,7 +1737,7 @@ static inline bool is_string(const DataType& type) { return is_string(type.id()) /// \return whether type is a binary-view-like type /// /// 
Convenience for checking using the type's id -static inline bool is_binary_view_like(const DataType& type) { +constexpr bool is_binary_view_like(const DataType& type) { return is_binary_view_like(type.id()); } @@ -1712,7 +1747,7 @@ static inline bool is_binary_view_like(const DataType& type) { /// \return whether type is a temporal type /// /// Convenience for checking using the type's id -static inline bool is_temporal(const DataType& type) { return is_temporal(type.id()); } +constexpr bool is_temporal(const DataType& type) { return is_temporal(type.id()); } /// \brief Check for an interval type /// @@ -1720,7 +1755,7 @@ static inline bool is_temporal(const DataType& type) { return is_temporal(type.i /// \return whether type is a interval type /// /// Convenience for checking using the type's id -static inline bool is_interval(const DataType& type) { return is_interval(type.id()); } +constexpr bool is_interval(const DataType& type) { return is_interval(type.id()); } /// \brief Check for a dictionary type /// @@ -1728,9 +1763,7 @@ static inline bool is_interval(const DataType& type) { return is_interval(type.i /// \return whether type is a dictionary type /// /// Convenience for checking using the type's id -static inline bool is_dictionary(const DataType& type) { - return is_dictionary(type.id()); -} +constexpr bool is_dictionary(const DataType& type) { return is_dictionary(type.id()); } /// \brief Check for a fixed-size-binary type /// @@ -1738,7 +1771,7 @@ static inline bool is_dictionary(const DataType& type) { /// \return whether type is a fixed-size-binary type /// /// Convenience for checking using the type's id -static inline bool is_fixed_size_binary(const DataType& type) { +constexpr bool is_fixed_size_binary(const DataType& type) { return is_fixed_size_binary(type.id()); } @@ -1748,9 +1781,7 @@ static inline bool is_fixed_size_binary(const DataType& type) { /// \return whether type is a fixed-width type /// /// Convenience for checking using the type's 
id -static inline bool is_fixed_width(const DataType& type) { - return is_fixed_width(type.id()); -} +constexpr bool is_fixed_width(const DataType& type) { return is_fixed_width(type.id()); } /// \brief Check for a variable-length list type /// @@ -1758,7 +1789,7 @@ static inline bool is_fixed_width(const DataType& type) { /// \return whether type is a variable-length list type /// /// Convenience for checking using the type's id -static inline bool is_var_length_list(const DataType& type) { +constexpr bool is_var_length_list(const DataType& type) { return is_var_length_list(type.id()); } @@ -1768,7 +1799,7 @@ static inline bool is_var_length_list(const DataType& type) { /// \return whether type is a list-like type /// /// Convenience for checking using the type's id -static inline bool is_list_like(const DataType& type) { return is_list_like(type.id()); } +constexpr bool is_list_like(const DataType& type) { return is_list_like(type.id()); } /// \brief Check for a var-length list or list-view like type /// @@ -1776,7 +1807,7 @@ static inline bool is_list_like(const DataType& type) { return is_list_like(type /// \return whether type is a var-length list or list-view like type /// /// Convenience for checking using the type's id -static inline bool is_var_length_list_like(const DataType& type) { +constexpr bool is_var_length_list_like(const DataType& type) { return is_var_length_list_like(type.id()); } @@ -1786,7 +1817,7 @@ static inline bool is_var_length_list_like(const DataType& type) { /// \return whether type is a list-view type /// /// Convenience for checking using the type's id -static inline bool is_list_view(const DataType& type) { return is_list_view(type.id()); } +constexpr bool is_list_view(const DataType& type) { return is_list_view(type.id()); } /// \brief Check for a nested type /// @@ -1794,7 +1825,7 @@ static inline bool is_list_view(const DataType& type) { return is_list_view(type /// \return whether type is a nested type /// /// Convenience for 
checking using the type's id -static inline bool is_nested(const DataType& type) { return is_nested(type.id()); } +constexpr bool is_nested(const DataType& type) { return is_nested(type.id()); } /// \brief Check for a union type /// @@ -1802,7 +1833,7 @@ static inline bool is_nested(const DataType& type) { return is_nested(type.id()) /// \return whether type is a union type /// /// Convenience for checking using the type's id -static inline bool is_union(const DataType& type) { return is_union(type.id()); } +constexpr bool is_union(const DataType& type) { return is_union(type.id()); } /// @} diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 17eea5532cc..56545f6aa79 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -72,6 +72,7 @@ add_arrow_test(utility-test ree_util_test.cc reflection_test.cc rows_to_batches_test.cc + secure_string_test.cc small_vector_test.cc span_test.cc stl_util_test.cc @@ -97,6 +98,7 @@ add_arrow_test(bit-utility-test SOURCES bit_block_counter_test.cc bit_util_test.cc + bpacking_test.cc rle_encoding_test.cc) add_arrow_test(threading-utility-test @@ -116,6 +118,7 @@ add_arrow_test(crc32-test add_arrow_benchmark(bit_block_counter_benchmark) add_arrow_benchmark(bit_util_benchmark) +add_arrow_benchmark(bpacking_benchmark) add_arrow_benchmark(bitmap_reader_benchmark) add_arrow_benchmark(cache_benchmark) add_arrow_benchmark(compression_benchmark) diff --git a/cpp/src/arrow/util/align_util.cc b/cpp/src/arrow/util/align_util.cc index a327afa7a5c..4cc7675ab8c 100644 --- a/cpp/src/arrow/util/align_util.cc +++ b/cpp/src/arrow/util/align_util.cc @@ -19,6 +19,7 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" +#include "arrow/extension_type.h" #include "arrow/record_batch.h" #include "arrow/table.h" #include "arrow/type_fwd.h" @@ -28,6 +29,8 @@ namespace arrow { +using internal::checked_cast; + namespace util { bool CheckAlignment(const Buffer& buffer, int64_t 
alignment) { @@ -44,9 +47,13 @@ namespace { Type::type GetTypeForBuffers(const ArrayData& array) { Type::type type_id = array.type->storage_id(); if (type_id == Type::DICTIONARY) { - return ::arrow::internal::checked_pointer_cast(array.type) - ->index_type() - ->id(); + // return index type id, provided by the DictionaryType array.type or + // array.type->storage_type() if array.type is an ExtensionType + DataType* dict_type = array.type.get(); + if (array.type->id() == Type::EXTENSION) { + dict_type = checked_cast(dict_type)->storage_type().get(); + } + return checked_cast(dict_type)->index_type()->id(); } return type_id; } diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc index 46825c35da0..f8b979a3f56 100644 --- a/cpp/src/arrow/util/async_util.cc +++ b/cpp/src/arrow/util/async_util.cc @@ -316,15 +316,11 @@ class ThrottledAsyncTaskSchedulerImpl #endif queue_->Push(std::move(task)); lk.unlock(); - maybe_backoff->AddCallback( - [weak_self = std::weak_ptr( - shared_from_this())](const Status& st) { - if (st.ok()) { - if (auto self = weak_self.lock()) { - self->ContinueTasks(); - } - } - }); + maybe_backoff->AddCallback([weak_self = weak_from_this()](const Status& st) { + if (auto self = weak_self.lock(); self && st.ok()) { + self->ContinueTasks(); + } + }); return true; } else { lk.unlock(); @@ -350,8 +346,9 @@ class ThrottledAsyncTaskSchedulerImpl self = shared_from_this()]() mutable -> Result> { ARROW_ASSIGN_OR_RAISE(Future<> inner_fut, (*inner_task)()); if (!inner_fut.TryAddCallback([&] { - return [latched_cost, self = std::move(self)](const Status& st) -> void { - if (st.ok()) { + return [latched_cost, + weak_self = self->weak_from_this()](const Status& st) -> void { + if (auto self = weak_self.lock(); self && st.ok()) { self->throttle_->Release(latched_cost); self->ContinueTasks(); } @@ -360,6 +357,7 @@ class ThrottledAsyncTaskSchedulerImpl // If the task is already finished then don't run ContinueTasks // if we are already 
running it so we can avoid stack overflow self->throttle_->Release(latched_cost); + inner_task.reset(); if (!in_continue) { self->ContinueTasks(); } @@ -377,8 +375,8 @@ class ThrottledAsyncTaskSchedulerImpl if (maybe_backoff) { lk.unlock(); if (!maybe_backoff->TryAddCallback([&] { - return [self = shared_from_this()](const Status& st) { - if (st.ok()) { + return [weak_self = weak_from_this()](const Status& st) { + if (auto self = weak_self.lock(); self && st.ok()) { self->ContinueTasks(); } }; diff --git a/cpp/src/arrow/util/bit_run_reader.h b/cpp/src/arrow/util/bit_run_reader.h index a436a50b86f..ed7be940a54 100644 --- a/cpp/src/arrow/util/bit_run_reader.h +++ b/cpp/src/arrow/util/bit_run_reader.h @@ -52,6 +52,8 @@ inline bool operator!=(const BitRun& lhs, const BitRun& rhs) { class BitRunReaderLinear { public: + BitRunReaderLinear() = default; + BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length) : reader_(bitmap, start_offset, length) {} @@ -74,6 +76,8 @@ class BitRunReaderLinear { /// in a bitmap. class ARROW_EXPORT BitRunReader { public: + BitRunReader() = default; + /// \brief Constructs new BitRunReader. /// /// \param[in] bitmap source data @@ -457,6 +461,26 @@ using ReverseSetBitRunReader = BaseSetBitRunReader; // Functional-style bit run visitors. +template +inline Status VisitBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length, + Visit&& visit) { + if (bitmap == NULLPTR) { + // Assuming all set (as in a null bitmap) + return visit(static_cast(0), length, true); + } + BitRunReader reader(bitmap, offset, length); + int64_t position = 0; + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + ARROW_RETURN_NOT_OK(visit(position, run.length, run.set)); + position += run.length; + } + return Status::OK(); +} + // XXX: Try to make this function small so the compiler can inline and optimize // the `visit` function, which is normally a hot loop with vectorizable code. 
// - don't inline SetBitRunReader constructor, it doesn't hurt performance diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index d59e88a3449..1f3b699e1ac 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -22,15 +22,16 @@ #include #include #include +#include #include "arrow/util/bit_util.h" -#include "arrow/util/bpacking.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" -namespace arrow { -namespace bit_util { +namespace arrow::bit_util { /// Utility class to write bit/byte streams. This class can write data to either be /// bit packed or byte aligned (and a single stream that has a mix of both). @@ -72,19 +73,14 @@ class BitWriter { /// room. The value is written byte aligned. /// For more details on vlq: /// en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(uint32_t v); + template + bool PutVlqInt(Int v); - // Writes an int zigzag encoded. - bool PutZigZagVlqInt(int32_t v); - - /// Write a Vlq encoded int64 to the buffer. Returns false if there was not enough - /// room. The value is written byte aligned. - /// For more details on vlq: - /// en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(uint64_t v); - - // Writes an int64 zigzag encoded. - bool PutZigZagVlqInt(int64_t v); + /// Writes a zigzag encoded signed integer. + /// Zigzag encoding is used to encode possibly negative numbers by alternating positive + /// and negative ones. + template + bool PutZigZagVlqInt(Int v); /// Get a pointer to the next aligned byte and advance the underlying buffer /// by num_bytes. @@ -127,14 +123,14 @@ inline uint64_t ReadLittleEndianWord(const uint8_t* buffer, int bytes_remaining) /// bytes in one read (e.g. encoded int). 
class BitReader { public: - BitReader() = default; + BitReader() noexcept = default; /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. BitReader(const uint8_t* buffer, int buffer_len) : BitReader() { Reset(buffer, buffer_len); } - void Reset(const uint8_t* buffer, int buffer_len) { + void Reset(const uint8_t* buffer, int buffer_len) noexcept { buffer_ = buffer; max_bytes_ = buffer_len; byte_offset_ = 0; @@ -168,18 +164,14 @@ class BitReader { /// Reads a vlq encoded int from the stream. The encoded int must start at /// the beginning of a byte. Return false if there were not enough bytes in /// the buffer. - bool GetVlqInt(uint32_t* v); - - // Reads a zigzag encoded int `into` v. - bool GetZigZagVlqInt(int32_t* v); - - /// Reads a vlq encoded int64 from the stream. The encoded int must start at - /// the beginning of a byte. Return false if there were not enough bytes in - /// the buffer. - bool GetVlqInt(uint64_t* v); + template + bool GetVlqInt(Int* v); - // Reads a zigzag encoded int64 `into` v. - bool GetZigZagVlqInt(int64_t* v); + /// Reads a zigzag encoded integer into a signed integer output v. + /// Zigzag encoding is used to decode possibly negative numbers by alternating positive + /// and negative ones. + template + bool GetZigZagVlqInt(Int* v); /// Returns the number of bytes left in the stream, not including the current /// byte (i.e., there may be an additional fraction of a byte). 
@@ -188,12 +180,6 @@ class BitReader { (byte_offset_ + static_cast(bit_util::BytesForBits(bit_offset_))); } - /// Maximum byte length of a vlq encoded int - static constexpr int kMaxVlqByteLength = 5; - - /// Maximum byte length of a vlq encoded int64 - static constexpr int kMaxVlqByteLengthForInt64 = 10; - private: const uint8_t* buffer_; int max_bytes_; @@ -339,8 +325,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { if (sizeof(T) == 4) { int num_unpacked = - internal::unpack32(reinterpret_cast(buffer + byte_offset), - reinterpret_cast(v + i), batch_size - i, num_bits); + internal::unpack32(buffer + byte_offset, reinterpret_cast(v + i), + batch_size - i, num_bits); i += num_unpacked; byte_offset += num_unpacked * num_bits / 8; } else if (sizeof(T) == 8 && num_bits > 32) { @@ -360,8 +346,7 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { while (i < batch_size) { int unpack_size = std::min(buffer_size, batch_size - i); int num_unpacked = - internal::unpack32(reinterpret_cast(buffer + byte_offset), - unpack_buffer, unpack_size, num_bits); + internal::unpack32(buffer + byte_offset, unpack_buffer, unpack_size, num_bits); if (num_unpacked == 0) { break; } @@ -439,91 +424,92 @@ inline bool BitReader::Advance(int64_t num_bits) { return true; } -inline bool BitWriter::PutVlqInt(uint32_t v) { - bool result = true; - while ((v & 0xFFFFFF80UL) != 0UL) { - result &= PutAligned(static_cast((v & 0x7F) | 0x80), 1); - v >>= 7; - } - result &= PutAligned(static_cast(v & 0x7F), 1); - return result; -} +template +inline bool BitWriter::PutVlqInt(Int v) { + static_assert(std::is_integral_v); -inline bool BitReader::GetVlqInt(uint32_t* v) { - uint32_t tmp = 0; + constexpr auto kBufferSize = kMaxLEB128ByteLenFor; - for (int i = 0; i < kMaxVlqByteLength; i++) { - uint8_t byte = 0; - if (ARROW_PREDICT_FALSE(!GetAligned(1, &byte))) { + uint8_t buffer[kBufferSize] = {}; + const auto bytes_written = WriteLEB128(v, buffer, kBufferSize); + 
ARROW_DCHECK_LE(bytes_written, kBufferSize); + if constexpr (std::is_signed_v) { + // Can fail if negative + if (ARROW_PREDICT_FALSE(!bytes_written == 0)) { return false; } - tmp |= static_cast(byte & 0x7F) << (7 * i); + } else { + // Cannot fail since we gave max space + ARROW_DCHECK_GT(bytes_written, 0); + } - if ((byte & 0x80) == 0) { - *v = tmp; - return true; + for (int i = 0; i < bytes_written; ++i) { + const bool success = PutAligned(buffer[i], 1); + if (ARROW_PREDICT_FALSE(!success)) { + return false; } } - return false; -} - -inline bool BitWriter::PutZigZagVlqInt(int32_t v) { - uint32_t u_v = ::arrow::util::SafeCopy(v); - u_v = (u_v << 1) ^ static_cast(v >> 31); - return PutVlqInt(u_v); -} - -inline bool BitReader::GetZigZagVlqInt(int32_t* v) { - uint32_t u; - if (!GetVlqInt(&u)) return false; - u = (u >> 1) ^ (~(u & 1) + 1); - *v = ::arrow::util::SafeCopy(u); return true; } -inline bool BitWriter::PutVlqInt(uint64_t v) { - bool result = true; - while ((v & 0xFFFFFFFFFFFFFF80ULL) != 0ULL) { - result &= PutAligned(static_cast((v & 0x7F) | 0x80), 1); - v >>= 7; +template +inline bool BitReader::GetVlqInt(Int* v) { + static_assert(std::is_integral_v); + + // The data that we will pass to the LEB128 parser + // In all case, we read a byte-aligned value, skipping remaining bits + const uint8_t* data = NULLPTR; + int max_size = 0; + + // Number of bytes left in the buffered values, not including the current + // byte (i.e., there may be an additional fraction of a byte). 
+ const int bytes_left_in_cache = + sizeof(buffered_values_) - static_cast(bit_util::BytesForBits(bit_offset_)); + + // If there are clearly enough bytes left we can try to parse from the cache + if (bytes_left_in_cache >= kMaxLEB128ByteLenFor) { + max_size = bytes_left_in_cache; + data = reinterpret_cast(&buffered_values_) + + bit_util::BytesForBits(bit_offset_); + // Otherwise, we try straight from buffer (ignoring few bytes that may be cached) + } else { + max_size = bytes_left(); + data = buffer_ + (max_bytes_ - max_size); } - result &= PutAligned(static_cast(v & 0x7F), 1); - return result; -} -inline bool BitReader::GetVlqInt(uint64_t* v) { - uint64_t tmp = 0; - - for (int i = 0; i < kMaxVlqByteLengthForInt64; i++) { - uint8_t byte = 0; - if (ARROW_PREDICT_FALSE(!GetAligned(1, &byte))) { - return false; - } - tmp |= static_cast(byte & 0x7F) << (7 * i); - - if ((byte & 0x80) == 0) { - *v = tmp; - return true; - } + const auto bytes_read = bit_util::ParseLeadingLEB128(data, max_size, v); + if (ARROW_PREDICT_FALSE(bytes_read == 0)) { + // Corrupt LEB128 + return false; } - return false; + // Advance for the bytes we have read + the bits we skipped + return Advance((8 * bytes_read) + (bit_offset_ % 8)); } -inline bool BitWriter::PutZigZagVlqInt(int64_t v) { - uint64_t u_v = ::arrow::util::SafeCopy(v); - u_v = (u_v << 1) ^ static_cast(v >> 63); +template +inline bool BitWriter::PutZigZagVlqInt(Int v) { + static_assert(std::is_integral_v); + static_assert(std::is_signed_v); + using UInt = std::make_unsigned_t; + constexpr auto kBitSize = 8 * sizeof(Int); + + UInt u_v = ::arrow::util::SafeCopy(v); + u_v = (u_v << 1) ^ static_cast(v >> (kBitSize - 1)); return PutVlqInt(u_v); } -inline bool BitReader::GetZigZagVlqInt(int64_t* v) { - uint64_t u; +template +inline bool BitReader::GetZigZagVlqInt(Int* v) { + static_assert(std::is_integral_v); + static_assert(std::is_signed_v); + + std::make_unsigned_t u; if (!GetVlqInt(&u)) return false; u = (u >> 1) ^ (~(u & 1) + 1); - 
*v = ::arrow::util::SafeCopy(u); + *v = ::arrow::util::SafeCopy(u); return true; } -} // namespace bit_util -} // namespace arrow +} // namespace arrow::bit_util diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index e7eb3f833ea..8d4811ede79 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -333,7 +333,7 @@ void ClearBitmap(uint8_t* data, int64_t offset, int64_t length); /// ex: /// ref: https://stackoverflow.com/a/59523400 template -constexpr Word PrecedingWordBitmask(unsigned int const i) { +constexpr Word PrecedingWordBitmask(const unsigned int i) { return static_cast(static_cast(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1; @@ -365,5 +365,126 @@ void PackBits(const uint32_t* values, uint8_t* out) { } } +constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 7); } + +template +constexpr int64_t kMaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8); + +/// Write a integer as LEB128 +/// +/// Write the input value as LEB128 into the outptut buffer and return the number of bytes +/// written. +/// If the output buffer size is insufficient, return 0 but the output may have been +/// written to. +/// The input value can be a signed integer, but must be non negative. 
+/// +/// \see https://en.wikipedia.org/wiki/LEB128 +/// \see MaxLEB128ByteLenFor +template +constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { + constexpr Int kLow7Mask = Int(0x7F); + constexpr Int kHigh7Mask = ~kLow7Mask; + constexpr uint8_t kContinuationBit = 0x80; + + // This encoding does not work for negative values + if constexpr (std::is_signed_v) { + if (ARROW_PREDICT_FALSE(value < 0)) { + return 0; + } + } + + const auto out_first = out; + + // Write as many bytes as we could be for the given input + while ((value & kHigh7Mask) != Int(0)) { + // We do not have enough room to write the LEB128 + if (ARROW_PREDICT_FALSE(out - out_first >= max_out_size)) { + return 0; + } + + // Write the encoded byte with continuation bit + *out = static_cast(value & kLow7Mask) | kContinuationBit; + ++out; + // Shift remaining data + value >>= 7; + } + + // We do not have enough room to write the LEB128 + if (ARROW_PREDICT_FALSE(out - out_first >= max_out_size)) { + return 0; + } + + // Write last non-continuing byte + *out = static_cast(value & kLow7Mask); + ++out; + + return static_cast(out - out_first); +} + +/// Parse a leading LEB128 +/// +/// Take as input a data pointer and the maximum number of bytes that can be read from it +/// (typically the array size). +/// When a valid LEB128 is found at the start of the data, the function writes it to the +/// out pointer and return the number of bytes read. +/// Otherwise, the out pointer is unmodified and zero is returned. +/// +/// \see https://en.wikipedia.org/wiki/LEB128 +/// \see MaxLEB128ByteLenFor +template +constexpr int32_t ParseLeadingLEB128(const uint8_t* data, int32_t max_data_size, + Int* out) { + constexpr auto kMaxBytes = static_cast(kMaxLEB128ByteLenFor); + static_assert(kMaxBytes >= 1); + constexpr uint8_t kLow7Mask = 0x7F; + constexpr uint8_t kContinuationBit = 0x80; + constexpr int32_t kSignBitCount = std::is_signed_v ? 
1 : 0; + // Number of bits allowed for encoding data on the last byte to avoid overflow + constexpr uint8_t kHighBitCount = (8 * sizeof(Int) - kSignBitCount) % 7; + // kHighBitCount least significant `0` bits and the rest with `1` + constexpr uint8_t kHighForbiddenMask = ~((1 << kHighBitCount) - 1); + + // Iteratively building the value + std::make_unsigned_t value = 0; + + // Read as many bytes as we could be for the given output. + for (int32_t i = 0; i < kMaxBytes - 1; i++) { + // We have not finished reading a valid LEB128, yet we run out of data + if (ARROW_PREDICT_FALSE(i >= max_data_size)) { + return 0; + } + + // Read the byte and set its 7 LSB to in the final value + const uint8_t byte = data[i]; + value |= static_cast(byte & kLow7Mask) << (7 * i); + + // Check for lack of continuation flag in MSB + if ((byte & kContinuationBit) == 0) { + *out = value; + return i + 1; + } + } + + // Process the last index avoiding overflowing + constexpr int32_t last = kMaxBytes - 1; + + // We have not finished reading a valid LEB128, yet we run out of data + if (ARROW_PREDICT_FALSE(last >= max_data_size)) { + return 0; + } + + const uint8_t byte = data[last]; + + // Need to check if there are bits that would overflow the output. + // Also checks that there is no continuation. 
+ if (ARROW_PREDICT_FALSE((byte & kHighForbiddenMask) != 0)) { + return 0; + } + + // No longer need to mask since we ensured + value |= static_cast(byte) << (7 * last); + *out = value; + return last + 1; +} } // namespace bit_util } // namespace arrow diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 02f583e0110..e8cee340ded 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -50,7 +50,7 @@ #include "arrow/util/bitmap_reader.h" #include "arrow/util/bitmap_visit.h" #include "arrow/util/bitmap_writer.h" -#include "arrow/util/bitset_stack.h" +#include "arrow/util/bitset_stack_internal.h" #include "arrow/util/endian.h" #include "arrow/util/ubsan.h" @@ -1997,11 +1997,189 @@ TEST(BitUtil, RoundUpToPowerOf2) { #undef U64 #undef S64 +/// Test the maximum number of bytes needed to write a LEB128 of a give size. +TEST(LEB128, MaxLEB128ByteLenFor) { + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 2); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 2); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 3); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 3); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 5); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 5); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 10); + EXPECT_EQ(bit_util::kMaxLEB128ByteLenFor, 10); +} + +/// Utility function to test LEB128 encoding with known input value and expected byte +/// array +template +void TestLEB128Encode(Int input_value, const std::vector& expected_data, + std::size_t buffer_size) { + std::vector buffer(buffer_size); + auto bytes_written = bit_util::WriteLEB128(input_value, buffer.data(), + static_cast(buffer.size())); + + EXPECT_EQ(bytes_written, expected_data.size()); + // Encoded data + for (std::size_t i = 0; i < expected_data.size(); ++i) { + EXPECT_EQ(buffer.at(i), expected_data.at(i)); + } + + // When the value is successfully encoded, the remaining of the buffer is untouched + if (bytes_written > 0) { + for (std::size_t i = 
bytes_written; i < buffer.size(); ++i) { + EXPECT_EQ(buffer.at(i), 0); + } + } +} + +/// Test encoding to known LEB128 byte sequences with edge cases parameters. +/// \see LEB128.KnownSuccessfulValues for other known values tested. +TEST(LEB128, WriteEdgeCases) { + // Single byte value 0 + TestLEB128Encode(0U, {0x00}, 1); + // Single byte value 127 + TestLEB128Encode(127U, {0x7F}, 1); + // Three byte value 16384, encoded in larger buffer + TestLEB128Encode(16384U, {0x80, 0x80, 0x01}, 10); + // Two byte boundary values + TestLEB128Encode(128U, {0x80, 0x01}, 2); + TestLEB128Encode(129U, {0x81, 0x01}, 2); + TestLEB128Encode(16383U, {0xFF, 0x7F}, 2); + // Error case: Buffer too small for value 128 (needs 2 bytes but only 1 provided) + TestLEB128Encode(128U, {}, 1); + // Error case: Buffer too small for uint32_t max (needs 5 bytes but only 4 provided) + TestLEB128Encode(4294967295U, {}, 4); + // Error case: Zero buffer size + TestLEB128Encode(52U, {}, 0); + // Error case: Negative value + TestLEB128Encode(-3, {}, 1); +} + +/// Utility function to test LEB128 decoding with known byte array and expected result +template +void TestLEB128Decode(const std::vector& data, Int expected_value, + int32_t expected_bytes_read) { + Int result = 0; + auto bytes_read = bit_util::ParseLeadingLEB128( + data.data(), static_cast(data.size()), &result); + EXPECT_EQ(bytes_read, expected_bytes_read); + if (expected_bytes_read > 0) { + EXPECT_EQ(result, expected_value); + } +} + +/// Test decoding from known LEB128 byte sequences with edge case parameters. +/// \see LEB128.KnownSuccessfulValues for other known values tested. 
+TEST(LEB128, ReadEdgeCases) { + // Single byte value 0 + TestLEB128Decode({0x00}, 0U, 1); + // Single byte value 127 + TestLEB128Decode({0x7F}, 127U, 1); + // Three byte value 16384, with remaining data + TestLEB128Decode({0x80, 0x80, 0x01, 0x80, 0x00}, 16384U, 3); + // Four byte value 268435455 + TestLEB128Decode({0xFF, 0xFF, 0xFF, 0x7F}, 268435455U, 4); + // Error case: Truncated sequence (continuation bit set but no more data) + TestLEB128Decode({0x80}, 0U, 0); + // Error case: Input has exactly the maximum number of bytes for a int32_t (5), + // but the decoded value overflows nonetheless (7 * 5 = 35 bits of data). + TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0x7F}, int32_t{}, 0); + // Error case: Oversized sequence for uint32_t (too many bytes) + TestLEB128Decode({0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}, 0U, 0); +} + +struct KnownLEB128Encoding { + uint64_t value; + std::vector bytes; +}; + +static const std::vector KnownLEB128EncodingValues{ + {0, {0x00}}, + {1, {0x01}}, + {63, {0x3F}}, + {64, {0x40}}, + {127U, {0x7F}}, + {128, {0x80, 0x01}}, + {300, {0xAC, 0x02}}, + {16384, {0x80, 0x80, 0x01}}, + {268435455, {0xFF, 0xFF, 0xFF, 0x7F}}, + {static_cast(std::numeric_limits::max()), {0xFF, 0x01}}, + {static_cast(std::numeric_limits::max()), {0x7F}}, + {static_cast(std::numeric_limits::max()), {0xFF, 0xFF, 0x03}}, + {static_cast(std::numeric_limits::max()), {0xFF, 0xFF, 0x01}}, + {static_cast(std::numeric_limits::max()), + {0xFF, 0xFF, 0xFF, 0xFF, 0x0F}}, + {static_cast(std::numeric_limits::max()), + {0xFF, 0xFF, 0xFF, 0xFF, 0x7}}, + {static_cast(std::numeric_limits::max()), + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01}}, + {static_cast(std::numeric_limits::max()), + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F}}, +}; + +/// Test encoding and decoding to known LEB128 byte sequences with all possible +/// integer sizes and signess. 
+TEST(LEB128, KnownSuccessfulValues) { + for (const auto& data : KnownLEB128EncodingValues) { + SCOPED_TRACE("Testing value " + std::to_string(data.value)); + + // 8 bits + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); + } + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); + } + + // 16 bits + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); + } + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); + } + + // 32 bits + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); + } + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); + } + + // 64 bits + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); + } + if (data.value <= static_cast(std::numeric_limits::max())) { + const auto val = static_cast(data.value); + TestLEB128Encode(val, 
data.bytes, data.bytes.size()); + TestLEB128Decode(data.bytes, val, static_cast(data.bytes.size())); + } + } +} + static void TestZigZag(int32_t v, std::array buffer_expect) { - uint8_t buffer[bit_util::BitReader::kMaxVlqByteLength] = {}; + uint8_t buffer[bit_util::kMaxLEB128ByteLenFor] = {}; bit_util::BitWriter writer(buffer, sizeof(buffer)); - bit_util::BitReader reader(buffer, sizeof(buffer)); writer.PutZigZagVlqInt(v); + // WARNING: The reader reads and caches the input when created, so it must be created + // after the data is written in the buffer. + bit_util::BitReader reader(buffer, sizeof(buffer)); EXPECT_THAT(buffer, testing::ElementsAreArray(buffer_expect)); int32_t result; EXPECT_TRUE(reader.GetZigZagVlqInt(&result)); @@ -2020,10 +2198,12 @@ TEST(BitStreamUtil, ZigZag) { } static void TestZigZag64(int64_t v, std::array buffer_expect) { - uint8_t buffer[bit_util::BitReader::kMaxVlqByteLengthForInt64] = {}; + uint8_t buffer[bit_util::kMaxLEB128ByteLenFor] = {}; bit_util::BitWriter writer(buffer, sizeof(buffer)); - bit_util::BitReader reader(buffer, sizeof(buffer)); writer.PutZigZagVlqInt(v); + // WARNING: The reader reads and caches the input when created, so it must be created + // after the data is written in the buffer. 
+ bit_util::BitReader reader(buffer, sizeof(buffer)); EXPECT_THAT(buffer, testing::ElementsAreArray(buffer_expect)); int64_t result = 0; EXPECT_TRUE(reader.GetZigZagVlqInt(&result)); diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h index 4750e697fc7..141d558c3a8 100644 --- a/cpp/src/arrow/util/bitmap.h +++ b/cpp/src/arrow/util/bitmap.h @@ -37,7 +37,7 @@ #include "arrow/util/endian.h" #include "arrow/util/functional.h" #include "arrow/util/span.h" -#include "arrow/util/string_builder.h" +#include "arrow/util/string_util.h" #include "arrow/util/visibility.h" namespace arrow { diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc index e623e65911b..c27cfd52655 100644 --- a/cpp/src/arrow/util/bitmap_ops.cc +++ b/cpp/src/arrow/util/bitmap_ops.cc @@ -101,6 +101,8 @@ int64_t CountAndSetBits(const uint8_t* left_bitmap, int64_t left_offset, return count; } +namespace { + enum class TransferMode : bool { Copy, Invert }; // Reverse all bits from entire byte(uint8) @@ -213,6 +215,8 @@ void ReverseBlockOffsets(const uint8_t* data, int64_t offset, int64_t length, } } +} // namespace + template Result> TransferBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t length, diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h index 5526c87dbca..d95fd921f48 100644 --- a/cpp/src/arrow/util/bitmap_reader.h +++ b/cpp/src/arrow/util/bitmap_reader.h @@ -31,6 +31,8 @@ namespace internal { class BitmapReader { public: + BitmapReader() = default; + BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length) : bitmap_(bitmap), position_(0), length_(length) { current_byte_ = 0; diff --git a/cpp/src/arrow/util/bitset_stack.h b/cpp/src/arrow/util/bitset_stack.h deleted file mode 100644 index 9b334b3605e..00000000000 --- a/cpp/src/arrow/util/bitset_stack.h +++ /dev/null @@ -1,89 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license 
agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/memory_pool.h" -#include "arrow/result.h" -#include "arrow/type_fwd.h" -#include "arrow/util/bit_util.h" -#include "arrow/util/compare.h" -#include "arrow/util/functional.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_builder.h" -#include "arrow/util/type_traits.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace internal { - -/// \brief Store a stack of bitsets efficiently. The top bitset may be -/// accessed and its bits may be modified, but it may not be resized. 
-class BitsetStack { - public: - using reference = typename std::vector::reference; - - /// \brief push a bitset onto the stack - /// \param size number of bits in the next bitset - /// \param value initial value for bits in the pushed bitset - void Push(int size, bool value) { - offsets_.push_back(bit_count()); - bits_.resize(bit_count() + size, value); - } - - /// \brief number of bits in the bitset at the top of the stack - int TopSize() const { - if (offsets_.size() == 0) return 0; - return bit_count() - offsets_.back(); - } - - /// \brief pop a bitset off the stack - void Pop() { - bits_.resize(offsets_.back()); - offsets_.pop_back(); - } - - /// \brief get the value of a bit in the top bitset - /// \param i index of the bit to access - bool operator[](int i) const { return bits_[offsets_.back() + i]; } - - /// \brief get a mutable reference to a bit in the top bitset - /// \param i index of the bit to access - reference operator[](int i) { return bits_[offsets_.back() + i]; } - - private: - int bit_count() const { return static_cast(bits_.size()); } - std::vector bits_; - std::vector offsets_; -}; - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/bitset_stack_internal.h b/cpp/src/arrow/util/bitset_stack_internal.h new file mode 100644 index 00000000000..67a7e50af72 --- /dev/null +++ b/cpp/src/arrow/util/bitset_stack_internal.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/compare.h" +#include "arrow/util/functional.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_util.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +/// \brief Store a stack of bitsets efficiently. The top bitset may be +/// accessed and its bits may be modified, but it may not be resized. 
+class BitsetStack { + public: + using reference = typename std::vector::reference; + + /// \brief push a bitset onto the stack + /// \param size number of bits in the next bitset + /// \param value initial value for bits in the pushed bitset + void Push(int size, bool value) { + offsets_.push_back(bit_count()); + bits_.resize(bit_count() + size, value); + } + + /// \brief number of bits in the bitset at the top of the stack + int TopSize() const { + if (offsets_.size() == 0) return 0; + return bit_count() - offsets_.back(); + } + + /// \brief pop a bitset off the stack + void Pop() { + bits_.resize(offsets_.back()); + offsets_.pop_back(); + } + + /// \brief get the value of a bit in the top bitset + /// \param i index of the bit to access + bool operator[](int i) const { return bits_[offsets_.back() + i]; } + + /// \brief get a mutable reference to a bit in the top bitset + /// \param i index of the bit to access + reference operator[](int i) { return bits_[offsets_.back() + i]; } + + private: + int bit_count() const { return static_cast(bits_.size()); } + std::vector bits_; + std::vector offsets_; +}; + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 56ddd376293..990f76875aa 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -15,30 +15,30 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/util/bpacking.h" +#include "arrow/util/bpacking_internal.h" -#include "arrow/util/bpacking64_default.h" -#include "arrow/util/bpacking_default.h" +#include "arrow/util/bpacking64_default_internal.h" +#include "arrow/util/bpacking_default_internal.h" #include "arrow/util/cpu_info.h" -#include "arrow/util/dispatch.h" +#include "arrow/util/dispatch_internal.h" #include "arrow/util/logging_internal.h" #if defined(ARROW_HAVE_RUNTIME_AVX2) -# include "arrow/util/bpacking_avx2.h" +# include "arrow/util/bpacking_avx2_internal.h" #endif #if defined(ARROW_HAVE_RUNTIME_AVX512) -# include "arrow/util/bpacking_avx512.h" +# include "arrow/util/bpacking_avx512_internal.h" #endif #if defined(ARROW_HAVE_NEON) -# include "arrow/util/bpacking_neon.h" +# include "arrow/util/bpacking_neon_internal.h" #endif namespace arrow { namespace internal { -namespace { +int unpack32_scalar(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) { + const uint32_t* in = reinterpret_cast(in_); -int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -149,11 +149,13 @@ int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_ return batch_size; } +namespace { + struct Unpack32DynamicFunction { - using FunctionType = decltype(&unpack32_default); + using FunctionType = decltype(&unpack32_scalar); static std::vector> implementations() { - return {{DispatchLevel::NONE, unpack32_default} + return {{DispatchLevel::NONE, unpack32_scalar} #if defined(ARROW_HAVE_RUNTIME_AVX2) , {DispatchLevel::AVX2, unpack32_avx2} @@ -168,7 +170,7 @@ struct Unpack32DynamicFunction { } // namespace -int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { +int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { #if defined(ARROW_HAVE_NEON) return unpack32_neon(in, out, batch_size, num_bits); #else @@ -177,9 +179,7 @@ int 
unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { #endif } -namespace { - -int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { +int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -386,11 +386,9 @@ int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_b return batch_size; } -} // namespace - int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { // TODO: unpack64_neon, unpack64_avx2 and unpack64_avx512 - return unpack64_default(in, out, batch_size, num_bits); + return unpack64_scalar(in, out, batch_size, num_bits); } } // namespace internal diff --git a/cpp/src/arrow/util/bpacking.h b/cpp/src/arrow/util/bpacking.h deleted file mode 100644 index dd85c1638c7..00000000000 --- a/cpp/src/arrow/util/bpacking.h +++ /dev/null @@ -1,34 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include "arrow/util/endian.h" -#include "arrow/util/visibility.h" - -#include - -namespace arrow { -namespace internal { - -ARROW_EXPORT -int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); -ARROW_EXPORT -int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits); - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking64_codegen.py b/cpp/src/arrow/util/bpacking64_codegen.py index f9b06b4d8fc..22135fcbb23 100644 --- a/cpp/src/arrow/util/bpacking64_codegen.py +++ b/cpp/src/arrow/util/bpacking64_codegen.py @@ -21,7 +21,7 @@ # https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py # Usage: -# python bpacking64_codegen.py > bpacking64_default.h +# python bpacking64_codegen.py > bpacking64_default_internal.h def howmany(bit): """ how many values are we going to pack? """ @@ -73,7 +73,7 @@ def howmanybytes(bit): print("inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) {") -print(" for(int k = 0; k < {0} ; k += 1) {{".format(howmany(0))) +print(f" for(int k = 0; k < {howmany(0)} ; k += 1) {{") print(" out[k] = 0;") print(" }") print(" return in;") @@ -81,47 +81,44 @@ def howmanybytes(bit): for bit in range(1, 65): print("") - print( - "inline const uint8_t* unpack{0}_64(const uint8_t* in, uint64_t* out) {{".format(bit)) + print(f"inline const uint8_t* unpack{bit}_64(const uint8_t* in, uint64_t* out) {{") if(bit < 64): - print(" const uint64_t mask = {0}ULL;".format((1 << bit)-1)) + print(f" const uint64_t mask = {((1 << bit)-1)}ULL;") maskstr = " & mask" if (bit == 64): maskstr = "" # no need for k in range(howmanywords(bit)-1): - print(" uint64_t w{0} = util::SafeLoadAs(in);".format(k)) - print(" w{0} = arrow::BitUtil::FromLittleEndian(w{0});".format(k)) - print(" in += 8;".format(k)) + print(f" uint64_t w{k} = util::SafeLoadAs(in);") + print(f" w{k} = arrow::BitUtil::FromLittleEndian(w{k});") + 
print(" in += 8;") k = howmanywords(bit) - 1 if (bit % 2 == 0): - print(" uint64_t w{0} = util::SafeLoadAs(in);".format(k)) - print(" w{0} = arrow::BitUtil::FromLittleEndian(w{0});".format(k)) - print(" in += 8;".format(k)) + print(f" uint64_t w{k} = util::SafeLoadAs(in);") + print(f" w{k} = arrow::BitUtil::FromLittleEndian(w{k});") + print(" in += 8;") else: - print(" uint64_t w{0} = util::SafeLoadAs(in);".format(k)) - print(" w{0} = arrow::BitUtil::FromLittleEndian(w{0});".format(k)) - print(" in += 4;".format(k)) + print(f" uint64_t w{k} = util::SafeLoadAs(in);") + print(f" w{k} = arrow::BitUtil::FromLittleEndian(w{k});") + print(" in += 4;") for j in range(howmany(bit)): firstword = j * bit // 64 secondword = (j * bit + bit - 1)//64 firstshift = (j*bit) % 64 - firstshiftstr = " >> {0}".format(firstshift) + firstshiftstr = f" >> {firstshift}" if(firstshift == 0): firstshiftstr = "" # no need if(firstword == secondword): if(firstshift + bit == 64): - print(" out[{0}] = w{1}{2};".format( - j, firstword, firstshiftstr, firstshift)) + print(f" out[{j}] = w{firstword}{firstshiftstr};") else: - print(" out[{0}] = (w{1}{2}){3};".format( - j, firstword, firstshiftstr, maskstr)) + print(f" out[{j}] = (w{firstword}{firstshiftstr}){maskstr};") else: secondshift = (64-firstshift) - print(" out[{0}] = ((w{1}{2}) | (w{3} << {4})){5};".format( - j, firstword, firstshiftstr, firstword+1, secondshift, maskstr)) + print(f" out[{j}] = ((w{firstword}{firstshiftstr}) | " + f"(w{firstword+1} << {secondshift})){maskstr};") print("") print(" return in;") print("}") diff --git a/cpp/src/arrow/util/bpacking64_default.h b/cpp/src/arrow/util/bpacking64_default.h deleted file mode 100644 index 4f45619b2a7..00000000000 --- a/cpp/src/arrow/util/bpacking64_default.h +++ /dev/null @@ -1,5642 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This file was generated by script which is modified from its original version in -// GitHub. Original source: -// https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py -// The original copyright notice follows. - -// This code is released under the -// Apache License Version 2.0 http://www.apache.org/licenses/. 
-// (c) Daniel Lemire 2013 - -#pragma once - -#include "arrow/util/bit_util.h" -#include "arrow/util/ubsan.h" - -namespace arrow { -namespace internal { - -inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) { - for (int k = 0; k < 32; k += 1) { - out[k] = 0; - } - return in; -} - -inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 1) & mask; - out[2] = (w0 >> 2) & mask; - out[3] = (w0 >> 3) & mask; - out[4] = (w0 >> 4) & mask; - out[5] = (w0 >> 5) & mask; - out[6] = (w0 >> 6) & mask; - out[7] = (w0 >> 7) & mask; - out[8] = (w0 >> 8) & mask; - out[9] = (w0 >> 9) & mask; - out[10] = (w0 >> 10) & mask; - out[11] = (w0 >> 11) & mask; - out[12] = (w0 >> 12) & mask; - out[13] = (w0 >> 13) & mask; - out[14] = (w0 >> 14) & mask; - out[15] = (w0 >> 15) & mask; - out[16] = (w0 >> 16) & mask; - out[17] = (w0 >> 17) & mask; - out[18] = (w0 >> 18) & mask; - out[19] = (w0 >> 19) & mask; - out[20] = (w0 >> 20) & mask; - out[21] = (w0 >> 21) & mask; - out[22] = (w0 >> 22) & mask; - out[23] = (w0 >> 23) & mask; - out[24] = (w0 >> 24) & mask; - out[25] = (w0 >> 25) & mask; - out[26] = (w0 >> 26) & mask; - out[27] = (w0 >> 27) & mask; - out[28] = (w0 >> 28) & mask; - out[29] = (w0 >> 29) & mask; - out[30] = (w0 >> 30) & mask; - out[31] = (w0 >> 31) & mask; - - return in; -} - -inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 3ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 2) & mask; - out[2] = (w0 >> 4) & mask; - out[3] = (w0 >> 6) & mask; - out[4] = (w0 >> 8) & mask; - out[5] = (w0 >> 10) & mask; - out[6] = (w0 >> 12) & mask; - out[7] = (w0 >> 14) & mask; - out[8] = (w0 >> 16) & mask; - out[9] = (w0 >> 18) & mask; - out[10] = (w0 >> 20) & mask; - 
out[11] = (w0 >> 22) & mask; - out[12] = (w0 >> 24) & mask; - out[13] = (w0 >> 26) & mask; - out[14] = (w0 >> 28) & mask; - out[15] = (w0 >> 30) & mask; - out[16] = (w0 >> 32) & mask; - out[17] = (w0 >> 34) & mask; - out[18] = (w0 >> 36) & mask; - out[19] = (w0 >> 38) & mask; - out[20] = (w0 >> 40) & mask; - out[21] = (w0 >> 42) & mask; - out[22] = (w0 >> 44) & mask; - out[23] = (w0 >> 46) & mask; - out[24] = (w0 >> 48) & mask; - out[25] = (w0 >> 50) & mask; - out[26] = (w0 >> 52) & mask; - out[27] = (w0 >> 54) & mask; - out[28] = (w0 >> 56) & mask; - out[29] = (w0 >> 58) & mask; - out[30] = (w0 >> 60) & mask; - out[31] = w0 >> 62; - - return in; -} - -inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 7ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 3) & mask; - out[2] = (w0 >> 6) & mask; - out[3] = (w0 >> 9) & mask; - out[4] = (w0 >> 12) & mask; - out[5] = (w0 >> 15) & mask; - out[6] = (w0 >> 18) & mask; - out[7] = (w0 >> 21) & mask; - out[8] = (w0 >> 24) & mask; - out[9] = (w0 >> 27) & mask; - out[10] = (w0 >> 30) & mask; - out[11] = (w0 >> 33) & mask; - out[12] = (w0 >> 36) & mask; - out[13] = (w0 >> 39) & mask; - out[14] = (w0 >> 42) & mask; - out[15] = (w0 >> 45) & mask; - out[16] = (w0 >> 48) & mask; - out[17] = (w0 >> 51) & mask; - out[18] = (w0 >> 54) & mask; - out[19] = (w0 >> 57) & mask; - out[20] = (w0 >> 60) & mask; - out[21] = ((w0 >> 63) | (w1 << 1)) & mask; - out[22] = (w1 >> 2) & mask; - out[23] = (w1 >> 5) & mask; - out[24] = (w1 >> 8) & mask; - out[25] = (w1 >> 11) & mask; - out[26] = (w1 >> 14) & mask; - out[27] = (w1 >> 17) & mask; - out[28] = (w1 >> 20) & mask; - out[29] = (w1 >> 23) & mask; - out[30] = (w1 >> 26) & mask; - out[31] = (w1 >> 29) & mask; - - return in; -} - -inline const uint8_t* unpack4_64(const 
uint8_t* in, uint64_t* out) { - const uint64_t mask = 15ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 4) & mask; - out[2] = (w0 >> 8) & mask; - out[3] = (w0 >> 12) & mask; - out[4] = (w0 >> 16) & mask; - out[5] = (w0 >> 20) & mask; - out[6] = (w0 >> 24) & mask; - out[7] = (w0 >> 28) & mask; - out[8] = (w0 >> 32) & mask; - out[9] = (w0 >> 36) & mask; - out[10] = (w0 >> 40) & mask; - out[11] = (w0 >> 44) & mask; - out[12] = (w0 >> 48) & mask; - out[13] = (w0 >> 52) & mask; - out[14] = (w0 >> 56) & mask; - out[15] = w0 >> 60; - out[16] = (w1)&mask; - out[17] = (w1 >> 4) & mask; - out[18] = (w1 >> 8) & mask; - out[19] = (w1 >> 12) & mask; - out[20] = (w1 >> 16) & mask; - out[21] = (w1 >> 20) & mask; - out[22] = (w1 >> 24) & mask; - out[23] = (w1 >> 28) & mask; - out[24] = (w1 >> 32) & mask; - out[25] = (w1 >> 36) & mask; - out[26] = (w1 >> 40) & mask; - out[27] = (w1 >> 44) & mask; - out[28] = (w1 >> 48) & mask; - out[29] = (w1 >> 52) & mask; - out[30] = (w1 >> 56) & mask; - out[31] = w1 >> 60; - - return in; -} - -inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 31ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 5) & mask; - out[2] = (w0 >> 10) & mask; - out[3] = (w0 >> 15) & mask; - out[4] = (w0 >> 20) & mask; - out[5] = (w0 >> 25) & mask; - out[6] = (w0 >> 30) & mask; - out[7] = (w0 >> 35) & mask; - out[8] = (w0 >> 40) & mask; - out[9] = (w0 >> 45) & mask; - out[10] = (w0 >> 50) & mask; - out[11] = (w0 >> 55) & mask; - out[12] = ((w0 >> 60) | (w1 << 4)) & mask; 
- out[13] = (w1 >> 1) & mask; - out[14] = (w1 >> 6) & mask; - out[15] = (w1 >> 11) & mask; - out[16] = (w1 >> 16) & mask; - out[17] = (w1 >> 21) & mask; - out[18] = (w1 >> 26) & mask; - out[19] = (w1 >> 31) & mask; - out[20] = (w1 >> 36) & mask; - out[21] = (w1 >> 41) & mask; - out[22] = (w1 >> 46) & mask; - out[23] = (w1 >> 51) & mask; - out[24] = (w1 >> 56) & mask; - out[25] = ((w1 >> 61) | (w2 << 3)) & mask; - out[26] = (w2 >> 2) & mask; - out[27] = (w2 >> 7) & mask; - out[28] = (w2 >> 12) & mask; - out[29] = (w2 >> 17) & mask; - out[30] = (w2 >> 22) & mask; - out[31] = (w2 >> 27) & mask; - - return in; -} - -inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 63ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 6) & mask; - out[2] = (w0 >> 12) & mask; - out[3] = (w0 >> 18) & mask; - out[4] = (w0 >> 24) & mask; - out[5] = (w0 >> 30) & mask; - out[6] = (w0 >> 36) & mask; - out[7] = (w0 >> 42) & mask; - out[8] = (w0 >> 48) & mask; - out[9] = (w0 >> 54) & mask; - out[10] = ((w0 >> 60) | (w1 << 4)) & mask; - out[11] = (w1 >> 2) & mask; - out[12] = (w1 >> 8) & mask; - out[13] = (w1 >> 14) & mask; - out[14] = (w1 >> 20) & mask; - out[15] = (w1 >> 26) & mask; - out[16] = (w1 >> 32) & mask; - out[17] = (w1 >> 38) & mask; - out[18] = (w1 >> 44) & mask; - out[19] = (w1 >> 50) & mask; - out[20] = (w1 >> 56) & mask; - out[21] = ((w1 >> 62) | (w2 << 2)) & mask; - out[22] = (w2 >> 4) & mask; - out[23] = (w2 >> 10) & mask; - out[24] = (w2 >> 16) & mask; - out[25] = (w2 >> 22) & mask; - out[26] = (w2 >> 28) & mask; - out[27] = (w2 >> 34) & mask; - out[28] = (w2 >> 40) & mask; - out[29] = (w2 >> 46) & mask; - out[30] = (w2 >> 52) & mask; - out[31] = w2 >> 58; - - 
return in; -} - -inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 127ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 7) & mask; - out[2] = (w0 >> 14) & mask; - out[3] = (w0 >> 21) & mask; - out[4] = (w0 >> 28) & mask; - out[5] = (w0 >> 35) & mask; - out[6] = (w0 >> 42) & mask; - out[7] = (w0 >> 49) & mask; - out[8] = (w0 >> 56) & mask; - out[9] = ((w0 >> 63) | (w1 << 1)) & mask; - out[10] = (w1 >> 6) & mask; - out[11] = (w1 >> 13) & mask; - out[12] = (w1 >> 20) & mask; - out[13] = (w1 >> 27) & mask; - out[14] = (w1 >> 34) & mask; - out[15] = (w1 >> 41) & mask; - out[16] = (w1 >> 48) & mask; - out[17] = (w1 >> 55) & mask; - out[18] = ((w1 >> 62) | (w2 << 2)) & mask; - out[19] = (w2 >> 5) & mask; - out[20] = (w2 >> 12) & mask; - out[21] = (w2 >> 19) & mask; - out[22] = (w2 >> 26) & mask; - out[23] = (w2 >> 33) & mask; - out[24] = (w2 >> 40) & mask; - out[25] = (w2 >> 47) & mask; - out[26] = (w2 >> 54) & mask; - out[27] = ((w2 >> 61) | (w3 << 3)) & mask; - out[28] = (w3 >> 4) & mask; - out[29] = (w3 >> 11) & mask; - out[30] = (w3 >> 18) & mask; - out[31] = (w3 >> 25) & mask; - - return in; -} - -inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 255ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = 
arrow::bit_util::FromLittleEndian(w3); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 8) & mask; - out[2] = (w0 >> 16) & mask; - out[3] = (w0 >> 24) & mask; - out[4] = (w0 >> 32) & mask; - out[5] = (w0 >> 40) & mask; - out[6] = (w0 >> 48) & mask; - out[7] = w0 >> 56; - out[8] = (w1)&mask; - out[9] = (w1 >> 8) & mask; - out[10] = (w1 >> 16) & mask; - out[11] = (w1 >> 24) & mask; - out[12] = (w1 >> 32) & mask; - out[13] = (w1 >> 40) & mask; - out[14] = (w1 >> 48) & mask; - out[15] = w1 >> 56; - out[16] = (w2)&mask; - out[17] = (w2 >> 8) & mask; - out[18] = (w2 >> 16) & mask; - out[19] = (w2 >> 24) & mask; - out[20] = (w2 >> 32) & mask; - out[21] = (w2 >> 40) & mask; - out[22] = (w2 >> 48) & mask; - out[23] = w2 >> 56; - out[24] = (w3)&mask; - out[25] = (w3 >> 8) & mask; - out[26] = (w3 >> 16) & mask; - out[27] = (w3 >> 24) & mask; - out[28] = (w3 >> 32) & mask; - out[29] = (w3 >> 40) & mask; - out[30] = (w3 >> 48) & mask; - out[31] = w3 >> 56; - - return in; -} - -inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 511ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 9) & mask; - out[2] = (w0 >> 18) & mask; - out[3] = (w0 >> 27) & mask; - out[4] = (w0 >> 36) & mask; - out[5] = (w0 >> 45) & mask; - out[6] = (w0 >> 54) & mask; - out[7] = ((w0 >> 63) | (w1 << 1)) & mask; - out[8] = (w1 >> 8) & mask; - out[9] = (w1 >> 17) & mask; - out[10] = (w1 >> 26) & mask; - out[11] = (w1 >> 35) & mask; - out[12] = (w1 >> 44) & mask; - out[13] = (w1 >> 53) & mask; - out[14] = 
((w1 >> 62) | (w2 << 2)) & mask; - out[15] = (w2 >> 7) & mask; - out[16] = (w2 >> 16) & mask; - out[17] = (w2 >> 25) & mask; - out[18] = (w2 >> 34) & mask; - out[19] = (w2 >> 43) & mask; - out[20] = (w2 >> 52) & mask; - out[21] = ((w2 >> 61) | (w3 << 3)) & mask; - out[22] = (w3 >> 6) & mask; - out[23] = (w3 >> 15) & mask; - out[24] = (w3 >> 24) & mask; - out[25] = (w3 >> 33) & mask; - out[26] = (w3 >> 42) & mask; - out[27] = (w3 >> 51) & mask; - out[28] = ((w3 >> 60) | (w4 << 4)) & mask; - out[29] = (w4 >> 5) & mask; - out[30] = (w4 >> 14) & mask; - out[31] = (w4 >> 23) & mask; - - return in; -} - -inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1023ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 10) & mask; - out[2] = (w0 >> 20) & mask; - out[3] = (w0 >> 30) & mask; - out[4] = (w0 >> 40) & mask; - out[5] = (w0 >> 50) & mask; - out[6] = ((w0 >> 60) | (w1 << 4)) & mask; - out[7] = (w1 >> 6) & mask; - out[8] = (w1 >> 16) & mask; - out[9] = (w1 >> 26) & mask; - out[10] = (w1 >> 36) & mask; - out[11] = (w1 >> 46) & mask; - out[12] = ((w1 >> 56) | (w2 << 8)) & mask; - out[13] = (w2 >> 2) & mask; - out[14] = (w2 >> 12) & mask; - out[15] = (w2 >> 22) & mask; - out[16] = (w2 >> 32) & mask; - out[17] = (w2 >> 42) & mask; - out[18] = (w2 >> 52) & mask; - out[19] = ((w2 >> 62) | (w3 << 2)) & mask; - out[20] = (w3 >> 8) & mask; - out[21] = (w3 >> 18) & mask; - out[22] = (w3 >> 28) & mask; - out[23] = (w3 >> 38) & mask; - out[24] = (w3 >> 48) & mask; - out[25] = ((w3 >> 
58) | (w4 << 6)) & mask; - out[26] = (w4 >> 4) & mask; - out[27] = (w4 >> 14) & mask; - out[28] = (w4 >> 24) & mask; - out[29] = (w4 >> 34) & mask; - out[30] = (w4 >> 44) & mask; - out[31] = w4 >> 54; - - return in; -} - -inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2047ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 11) & mask; - out[2] = (w0 >> 22) & mask; - out[3] = (w0 >> 33) & mask; - out[4] = (w0 >> 44) & mask; - out[5] = ((w0 >> 55) | (w1 << 9)) & mask; - out[6] = (w1 >> 2) & mask; - out[7] = (w1 >> 13) & mask; - out[8] = (w1 >> 24) & mask; - out[9] = (w1 >> 35) & mask; - out[10] = (w1 >> 46) & mask; - out[11] = ((w1 >> 57) | (w2 << 7)) & mask; - out[12] = (w2 >> 4) & mask; - out[13] = (w2 >> 15) & mask; - out[14] = (w2 >> 26) & mask; - out[15] = (w2 >> 37) & mask; - out[16] = (w2 >> 48) & mask; - out[17] = ((w2 >> 59) | (w3 << 5)) & mask; - out[18] = (w3 >> 6) & mask; - out[19] = (w3 >> 17) & mask; - out[20] = (w3 >> 28) & mask; - out[21] = (w3 >> 39) & mask; - out[22] = (w3 >> 50) & mask; - out[23] = ((w3 >> 61) | (w4 << 3)) & mask; - out[24] = (w4 >> 8) & mask; - out[25] = (w4 >> 19) & mask; - out[26] = (w4 >> 30) & mask; - out[27] = (w4 >> 41) & mask; - out[28] = (w4 >> 52) & mask; - out[29] = ((w4 >> 63) | (w5 << 1)) & mask; - out[30] = (w5 >> 10) & mask; - out[31] = (w5 >> 21) & mask; - - return in; -} - -inline const uint8_t* unpack12_64(const 
uint8_t* in, uint64_t* out) { - const uint64_t mask = 4095ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 12) & mask; - out[2] = (w0 >> 24) & mask; - out[3] = (w0 >> 36) & mask; - out[4] = (w0 >> 48) & mask; - out[5] = ((w0 >> 60) | (w1 << 4)) & mask; - out[6] = (w1 >> 8) & mask; - out[7] = (w1 >> 20) & mask; - out[8] = (w1 >> 32) & mask; - out[9] = (w1 >> 44) & mask; - out[10] = ((w1 >> 56) | (w2 << 8)) & mask; - out[11] = (w2 >> 4) & mask; - out[12] = (w2 >> 16) & mask; - out[13] = (w2 >> 28) & mask; - out[14] = (w2 >> 40) & mask; - out[15] = w2 >> 52; - out[16] = (w3)&mask; - out[17] = (w3 >> 12) & mask; - out[18] = (w3 >> 24) & mask; - out[19] = (w3 >> 36) & mask; - out[20] = (w3 >> 48) & mask; - out[21] = ((w3 >> 60) | (w4 << 4)) & mask; - out[22] = (w4 >> 8) & mask; - out[23] = (w4 >> 20) & mask; - out[24] = (w4 >> 32) & mask; - out[25] = (w4 >> 44) & mask; - out[26] = ((w4 >> 56) | (w5 << 8)) & mask; - out[27] = (w5 >> 4) & mask; - out[28] = (w5 >> 16) & mask; - out[29] = (w5 >> 28) & mask; - out[30] = (w5 >> 40) & mask; - out[31] = w5 >> 52; - - return in; -} - -inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 8191ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = 
arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 13) & mask; - out[2] = (w0 >> 26) & mask; - out[3] = (w0 >> 39) & mask; - out[4] = ((w0 >> 52) | (w1 << 12)) & mask; - out[5] = (w1 >> 1) & mask; - out[6] = (w1 >> 14) & mask; - out[7] = (w1 >> 27) & mask; - out[8] = (w1 >> 40) & mask; - out[9] = ((w1 >> 53) | (w2 << 11)) & mask; - out[10] = (w2 >> 2) & mask; - out[11] = (w2 >> 15) & mask; - out[12] = (w2 >> 28) & mask; - out[13] = (w2 >> 41) & mask; - out[14] = ((w2 >> 54) | (w3 << 10)) & mask; - out[15] = (w3 >> 3) & mask; - out[16] = (w3 >> 16) & mask; - out[17] = (w3 >> 29) & mask; - out[18] = (w3 >> 42) & mask; - out[19] = ((w3 >> 55) | (w4 << 9)) & mask; - out[20] = (w4 >> 4) & mask; - out[21] = (w4 >> 17) & mask; - out[22] = (w4 >> 30) & mask; - out[23] = (w4 >> 43) & mask; - out[24] = ((w4 >> 56) | (w5 << 8)) & mask; - out[25] = (w5 >> 5) & mask; - out[26] = (w5 >> 18) & mask; - out[27] = (w5 >> 31) & mask; - out[28] = (w5 >> 44) & mask; - out[29] = ((w5 >> 57) | (w6 << 7)) & mask; - out[30] = (w6 >> 6) & mask; - out[31] = (w6 >> 19) & mask; - - return in; -} - -inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 16383ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - 
uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 14) & mask; - out[2] = (w0 >> 28) & mask; - out[3] = (w0 >> 42) & mask; - out[4] = ((w0 >> 56) | (w1 << 8)) & mask; - out[5] = (w1 >> 6) & mask; - out[6] = (w1 >> 20) & mask; - out[7] = (w1 >> 34) & mask; - out[8] = (w1 >> 48) & mask; - out[9] = ((w1 >> 62) | (w2 << 2)) & mask; - out[10] = (w2 >> 12) & mask; - out[11] = (w2 >> 26) & mask; - out[12] = (w2 >> 40) & mask; - out[13] = ((w2 >> 54) | (w3 << 10)) & mask; - out[14] = (w3 >> 4) & mask; - out[15] = (w3 >> 18) & mask; - out[16] = (w3 >> 32) & mask; - out[17] = (w3 >> 46) & mask; - out[18] = ((w3 >> 60) | (w4 << 4)) & mask; - out[19] = (w4 >> 10) & mask; - out[20] = (w4 >> 24) & mask; - out[21] = (w4 >> 38) & mask; - out[22] = ((w4 >> 52) | (w5 << 12)) & mask; - out[23] = (w5 >> 2) & mask; - out[24] = (w5 >> 16) & mask; - out[25] = (w5 >> 30) & mask; - out[26] = (w5 >> 44) & mask; - out[27] = ((w5 >> 58) | (w6 << 6)) & mask; - out[28] = (w6 >> 8) & mask; - out[29] = (w6 >> 22) & mask; - out[30] = (w6 >> 36) & mask; - out[31] = w6 >> 50; - - return in; -} - -inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 32767ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = 
arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 15) & mask; - out[2] = (w0 >> 30) & mask; - out[3] = (w0 >> 45) & mask; - out[4] = ((w0 >> 60) | (w1 << 4)) & mask; - out[5] = (w1 >> 11) & mask; - out[6] = (w1 >> 26) & mask; - out[7] = (w1 >> 41) & mask; - out[8] = ((w1 >> 56) | (w2 << 8)) & mask; - out[9] = (w2 >> 7) & mask; - out[10] = (w2 >> 22) & mask; - out[11] = (w2 >> 37) & mask; - out[12] = ((w2 >> 52) | (w3 << 12)) & mask; - out[13] = (w3 >> 3) & mask; - out[14] = (w3 >> 18) & mask; - out[15] = (w3 >> 33) & mask; - out[16] = (w3 >> 48) & mask; - out[17] = ((w3 >> 63) | (w4 << 1)) & mask; - out[18] = (w4 >> 14) & mask; - out[19] = (w4 >> 29) & mask; - out[20] = (w4 >> 44) & mask; - out[21] = ((w4 >> 59) | (w5 << 5)) & mask; - out[22] = (w5 >> 10) & mask; - out[23] = (w5 >> 25) & mask; - out[24] = (w5 >> 40) & mask; - out[25] = ((w5 >> 55) | (w6 << 9)) & mask; - out[26] = (w6 >> 6) & mask; - out[27] = (w6 >> 21) & mask; - out[28] = (w6 >> 36) & mask; - out[29] = ((w6 >> 51) | (w7 << 13)) & mask; - out[30] = (w7 >> 2) & mask; - out[31] = (w7 >> 17) & mask; - - return in; -} - -inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 65535ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in 
+= 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 16) & mask; - out[2] = (w0 >> 32) & mask; - out[3] = w0 >> 48; - out[4] = (w1)&mask; - out[5] = (w1 >> 16) & mask; - out[6] = (w1 >> 32) & mask; - out[7] = w1 >> 48; - out[8] = (w2)&mask; - out[9] = (w2 >> 16) & mask; - out[10] = (w2 >> 32) & mask; - out[11] = w2 >> 48; - out[12] = (w3)&mask; - out[13] = (w3 >> 16) & mask; - out[14] = (w3 >> 32) & mask; - out[15] = w3 >> 48; - out[16] = (w4)&mask; - out[17] = (w4 >> 16) & mask; - out[18] = (w4 >> 32) & mask; - out[19] = w4 >> 48; - out[20] = (w5)&mask; - out[21] = (w5 >> 16) & mask; - out[22] = (w5 >> 32) & mask; - out[23] = w5 >> 48; - out[24] = (w6)&mask; - out[25] = (w6 >> 16) & mask; - out[26] = (w6 >> 32) & mask; - out[27] = w6 >> 48; - out[28] = (w7)&mask; - out[29] = (w7 >> 16) & mask; - out[30] = (w7 >> 32) & mask; - out[31] = w7 >> 48; - - return in; -} - -inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 131071ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = 
arrow::bit_util::FromLittleEndian(w8); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 17) & mask; - out[2] = (w0 >> 34) & mask; - out[3] = ((w0 >> 51) | (w1 << 13)) & mask; - out[4] = (w1 >> 4) & mask; - out[5] = (w1 >> 21) & mask; - out[6] = (w1 >> 38) & mask; - out[7] = ((w1 >> 55) | (w2 << 9)) & mask; - out[8] = (w2 >> 8) & mask; - out[9] = (w2 >> 25) & mask; - out[10] = (w2 >> 42) & mask; - out[11] = ((w2 >> 59) | (w3 << 5)) & mask; - out[12] = (w3 >> 12) & mask; - out[13] = (w3 >> 29) & mask; - out[14] = (w3 >> 46) & mask; - out[15] = ((w3 >> 63) | (w4 << 1)) & mask; - out[16] = (w4 >> 16) & mask; - out[17] = (w4 >> 33) & mask; - out[18] = ((w4 >> 50) | (w5 << 14)) & mask; - out[19] = (w5 >> 3) & mask; - out[20] = (w5 >> 20) & mask; - out[21] = (w5 >> 37) & mask; - out[22] = ((w5 >> 54) | (w6 << 10)) & mask; - out[23] = (w6 >> 7) & mask; - out[24] = (w6 >> 24) & mask; - out[25] = (w6 >> 41) & mask; - out[26] = ((w6 >> 58) | (w7 << 6)) & mask; - out[27] = (w7 >> 11) & mask; - out[28] = (w7 >> 28) & mask; - out[29] = (w7 >> 45) & mask; - out[30] = ((w7 >> 62) | (w8 << 2)) & mask; - out[31] = (w8 >> 15) & mask; - - return in; -} - -inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 262143ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = 
arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 18) & mask; - out[2] = (w0 >> 36) & mask; - out[3] = ((w0 >> 54) | (w1 << 10)) & mask; - out[4] = (w1 >> 8) & mask; - out[5] = (w1 >> 26) & mask; - out[6] = (w1 >> 44) & mask; - out[7] = ((w1 >> 62) | (w2 << 2)) & mask; - out[8] = (w2 >> 16) & mask; - out[9] = (w2 >> 34) & mask; - out[10] = ((w2 >> 52) | (w3 << 12)) & mask; - out[11] = (w3 >> 6) & mask; - out[12] = (w3 >> 24) & mask; - out[13] = (w3 >> 42) & mask; - out[14] = ((w3 >> 60) | (w4 << 4)) & mask; - out[15] = (w4 >> 14) & mask; - out[16] = (w4 >> 32) & mask; - out[17] = ((w4 >> 50) | (w5 << 14)) & mask; - out[18] = (w5 >> 4) & mask; - out[19] = (w5 >> 22) & mask; - out[20] = (w5 >> 40) & mask; - out[21] = ((w5 >> 58) | (w6 << 6)) & mask; - out[22] = (w6 >> 12) & mask; - out[23] = (w6 >> 30) & mask; - out[24] = ((w6 >> 48) | (w7 << 16)) & mask; - out[25] = (w7 >> 2) & mask; - out[26] = (w7 >> 20) & mask; - out[27] = (w7 >> 38) & mask; - out[28] = ((w7 >> 56) | (w8 << 8)) & mask; - out[29] = (w8 >> 10) & mask; - out[30] = (w8 >> 28) & mask; - out[31] = w8 >> 46; - - return in; -} - -inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 524287ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = 
arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 19) & mask; - out[2] = (w0 >> 38) & mask; - out[3] = ((w0 >> 57) | (w1 << 7)) & mask; - out[4] = (w1 >> 12) & mask; - out[5] = (w1 >> 31) & mask; - out[6] = ((w1 >> 50) | (w2 << 14)) & mask; - out[7] = (w2 >> 5) & mask; - out[8] = (w2 >> 24) & mask; - out[9] = (w2 >> 43) & mask; - out[10] = ((w2 >> 62) | (w3 << 2)) & mask; - out[11] = (w3 >> 17) & mask; - out[12] = (w3 >> 36) & mask; - out[13] = ((w3 >> 55) | (w4 << 9)) & mask; - out[14] = (w4 >> 10) & mask; - out[15] = (w4 >> 29) & mask; - out[16] = ((w4 >> 48) | (w5 << 16)) & mask; - out[17] = (w5 >> 3) & mask; - out[18] = (w5 >> 22) & mask; - out[19] = (w5 >> 41) & mask; - out[20] = ((w5 >> 60) | (w6 << 4)) & mask; - out[21] = (w6 >> 15) & mask; - out[22] = (w6 >> 34) & mask; - out[23] = ((w6 >> 53) | (w7 << 11)) & mask; - out[24] = (w7 >> 8) & mask; - out[25] = (w7 >> 27) & mask; - out[26] = ((w7 >> 46) | (w8 << 18)) & mask; - out[27] = (w8 >> 1) & mask; - out[28] = (w8 >> 20) & mask; - out[29] = (w8 >> 39) & mask; - out[30] = ((w8 >> 58) | (w9 << 6)) & mask; - out[31] = (w9 >> 13) & mask; - - return in; -} - -inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1048575ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = 
arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 20) & mask; - out[2] = (w0 >> 40) & mask; - out[3] = ((w0 >> 60) | (w1 << 4)) & mask; - out[4] = (w1 >> 16) & mask; - out[5] = (w1 >> 36) & mask; - out[6] = ((w1 >> 56) | (w2 << 8)) & mask; - out[7] = (w2 >> 12) & mask; - out[8] = (w2 >> 32) & mask; - out[9] = ((w2 >> 52) | (w3 << 12)) & mask; - out[10] = (w3 >> 8) & mask; - out[11] = (w3 >> 28) & mask; - out[12] = ((w3 >> 48) | (w4 << 16)) & mask; - out[13] = (w4 >> 4) & mask; - out[14] = (w4 >> 24) & mask; - out[15] = w4 >> 44; - out[16] = (w5)&mask; - out[17] = (w5 >> 20) & mask; - out[18] = (w5 >> 40) & mask; - out[19] = ((w5 >> 60) | (w6 << 4)) & mask; - out[20] = (w6 >> 16) & mask; - out[21] = (w6 >> 36) & mask; - out[22] = ((w6 >> 56) | (w7 << 8)) & mask; - out[23] = (w7 >> 12) & mask; - out[24] = (w7 >> 32) & mask; - out[25] = ((w7 >> 52) | (w8 << 12)) & mask; - out[26] = (w8 >> 8) & mask; - out[27] = (w8 >> 28) & mask; - out[28] = ((w8 >> 48) | (w9 << 16)) & mask; - out[29] = (w9 >> 4) & mask; - out[30] = (w9 >> 24) & mask; - out[31] = w9 >> 44; - - return in; -} - -inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2097151ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - 
uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 21) & mask; - out[2] = (w0 >> 42) & mask; - out[3] = ((w0 >> 63) | (w1 << 1)) & mask; - out[4] = (w1 >> 20) & mask; - out[5] = (w1 >> 41) & mask; - out[6] = ((w1 >> 62) | (w2 << 2)) & mask; - out[7] = (w2 >> 19) & mask; - out[8] = (w2 >> 40) & mask; - out[9] = ((w2 >> 61) | (w3 << 3)) & mask; - out[10] = (w3 >> 18) & mask; - out[11] = (w3 >> 39) & mask; - out[12] = ((w3 >> 60) | (w4 << 4)) & mask; - out[13] = (w4 >> 17) & mask; - out[14] = (w4 >> 38) & mask; - out[15] = ((w4 >> 59) | (w5 << 5)) & mask; - out[16] = (w5 >> 16) & mask; - out[17] = (w5 >> 37) & mask; - out[18] = ((w5 >> 58) | (w6 << 6)) & mask; - out[19] = (w6 >> 15) & mask; - out[20] = (w6 >> 36) & mask; - out[21] = ((w6 >> 57) | (w7 << 7)) & mask; - out[22] = (w7 >> 14) & mask; - out[23] = (w7 >> 35) & mask; - out[24] = ((w7 >> 56) | (w8 << 8)) & mask; - out[25] = (w8 >> 13) & mask; - out[26] = (w8 >> 34) & mask; - out[27] = ((w8 >> 55) | (w9 << 9)) & mask; - out[28] = (w9 >> 12) & mask; - out[29] = (w9 >> 33) & mask; - out[30] = ((w9 >> 54) | (w10 << 10)) & mask; - out[31] = (w10 >> 11) & mask; - - return in; -} - -inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 
4194303ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 22) & mask; - out[2] = ((w0 >> 44) | (w1 << 20)) & mask; - out[3] = (w1 >> 2) & mask; - out[4] = (w1 >> 24) & mask; - out[5] = ((w1 >> 46) | (w2 << 18)) & mask; - out[6] = (w2 >> 4) & mask; - out[7] = (w2 >> 26) & mask; - out[8] = ((w2 >> 48) | (w3 << 16)) & mask; - out[9] = (w3 >> 6) & mask; - out[10] = (w3 >> 28) & mask; - out[11] = ((w3 >> 50) | (w4 << 14)) & mask; - out[12] = (w4 >> 8) & mask; - out[13] = (w4 >> 30) & mask; - out[14] = ((w4 >> 52) | (w5 << 12)) & mask; - out[15] = (w5 >> 10) & mask; - out[16] = (w5 >> 32) & mask; - out[17] = ((w5 >> 54) | (w6 << 10)) & mask; - out[18] = (w6 >> 12) & mask; - out[19] = (w6 >> 34) & mask; - out[20] = ((w6 >> 56) | (w7 << 8)) & mask; - out[21] = (w7 >> 14) & mask; - out[22] = (w7 >> 36) & mask; - out[23] = ((w7 >> 58) | (w8 << 6)) & mask; - out[24] = (w8 >> 16) & mask; - out[25] = (w8 >> 38) & mask; - out[26] = ((w8 >> 60) | (w9 << 4)) 
& mask; - out[27] = (w9 >> 18) & mask; - out[28] = (w9 >> 40) & mask; - out[29] = ((w9 >> 62) | (w10 << 2)) & mask; - out[30] = (w10 >> 20) & mask; - out[31] = w10 >> 42; - - return in; -} - -inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 8388607ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 23) & mask; - out[2] = ((w0 >> 46) | (w1 << 18)) & mask; - out[3] = (w1 >> 5) & mask; - out[4] = (w1 >> 28) & mask; - out[5] = ((w1 >> 51) | (w2 << 13)) & mask; - out[6] = (w2 >> 10) & mask; - out[7] = (w2 >> 33) & mask; - out[8] = ((w2 >> 56) | (w3 << 8)) & mask; - out[9] = (w3 >> 15) & mask; - out[10] = (w3 >> 38) & mask; - out[11] = ((w3 >> 61) | (w4 << 3)) & mask; - out[12] = (w4 >> 20) & mask; - out[13] = ((w4 >> 43) | (w5 << 21)) & mask; - out[14] = (w5 >> 2) & mask; - out[15] = (w5 >> 25) & mask; - 
out[16] = ((w5 >> 48) | (w6 << 16)) & mask; - out[17] = (w6 >> 7) & mask; - out[18] = (w6 >> 30) & mask; - out[19] = ((w6 >> 53) | (w7 << 11)) & mask; - out[20] = (w7 >> 12) & mask; - out[21] = (w7 >> 35) & mask; - out[22] = ((w7 >> 58) | (w8 << 6)) & mask; - out[23] = (w8 >> 17) & mask; - out[24] = (w8 >> 40) & mask; - out[25] = ((w8 >> 63) | (w9 << 1)) & mask; - out[26] = (w9 >> 22) & mask; - out[27] = ((w9 >> 45) | (w10 << 19)) & mask; - out[28] = (w10 >> 4) & mask; - out[29] = (w10 >> 27) & mask; - out[30] = ((w10 >> 50) | (w11 << 14)) & mask; - out[31] = (w11 >> 9) & mask; - - return in; -} - -inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 16777215ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 24) & mask; - out[2] = ((w0 >> 48) | (w1 << 16)) & mask; - out[3] = (w1 >> 8) & mask; - out[4] = (w1 
>> 32) & mask; - out[5] = ((w1 >> 56) | (w2 << 8)) & mask; - out[6] = (w2 >> 16) & mask; - out[7] = w2 >> 40; - out[8] = (w3)&mask; - out[9] = (w3 >> 24) & mask; - out[10] = ((w3 >> 48) | (w4 << 16)) & mask; - out[11] = (w4 >> 8) & mask; - out[12] = (w4 >> 32) & mask; - out[13] = ((w4 >> 56) | (w5 << 8)) & mask; - out[14] = (w5 >> 16) & mask; - out[15] = w5 >> 40; - out[16] = (w6)&mask; - out[17] = (w6 >> 24) & mask; - out[18] = ((w6 >> 48) | (w7 << 16)) & mask; - out[19] = (w7 >> 8) & mask; - out[20] = (w7 >> 32) & mask; - out[21] = ((w7 >> 56) | (w8 << 8)) & mask; - out[22] = (w8 >> 16) & mask; - out[23] = w8 >> 40; - out[24] = (w9)&mask; - out[25] = (w9 >> 24) & mask; - out[26] = ((w9 >> 48) | (w10 << 16)) & mask; - out[27] = (w10 >> 8) & mask; - out[28] = (w10 >> 32) & mask; - out[29] = ((w10 >> 56) | (w11 << 8)) & mask; - out[30] = (w11 >> 16) & mask; - out[31] = w11 >> 40; - - return in; -} - -inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 33554431ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = 
util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 25) & mask; - out[2] = ((w0 >> 50) | (w1 << 14)) & mask; - out[3] = (w1 >> 11) & mask; - out[4] = (w1 >> 36) & mask; - out[5] = ((w1 >> 61) | (w2 << 3)) & mask; - out[6] = (w2 >> 22) & mask; - out[7] = ((w2 >> 47) | (w3 << 17)) & mask; - out[8] = (w3 >> 8) & mask; - out[9] = (w3 >> 33) & mask; - out[10] = ((w3 >> 58) | (w4 << 6)) & mask; - out[11] = (w4 >> 19) & mask; - out[12] = ((w4 >> 44) | (w5 << 20)) & mask; - out[13] = (w5 >> 5) & mask; - out[14] = (w5 >> 30) & mask; - out[15] = ((w5 >> 55) | (w6 << 9)) & mask; - out[16] = (w6 >> 16) & mask; - out[17] = ((w6 >> 41) | (w7 << 23)) & mask; - out[18] = (w7 >> 2) & mask; - out[19] = (w7 >> 27) & mask; - out[20] = ((w7 >> 52) | (w8 << 12)) & mask; - out[21] = (w8 >> 13) & mask; - out[22] = (w8 >> 38) & mask; - out[23] = ((w8 >> 63) | (w9 << 1)) & mask; - out[24] = (w9 >> 24) & mask; - out[25] = ((w9 >> 49) | (w10 << 15)) & mask; - out[26] = (w10 >> 10) & mask; - out[27] = (w10 >> 35) & mask; - out[28] = ((w10 >> 60) | (w11 << 4)) & mask; - out[29] = (w11 >> 21) & mask; - out[30] = ((w11 >> 46) | (w12 << 18)) & mask; - out[31] = (w12 >> 7) & mask; - - return in; -} - -inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 67108863ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = 
arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 26) & mask; - out[2] = ((w0 >> 52) | (w1 << 12)) & mask; - out[3] = (w1 >> 14) & mask; - out[4] = ((w1 >> 40) | (w2 << 24)) & mask; - out[5] = (w2 >> 2) & mask; - out[6] = (w2 >> 28) & mask; - out[7] = ((w2 >> 54) | (w3 << 10)) & mask; - out[8] = (w3 >> 16) & mask; - out[9] = ((w3 >> 42) | (w4 << 22)) & mask; - out[10] = (w4 >> 4) & mask; - out[11] = (w4 >> 30) & mask; - out[12] = ((w4 >> 56) | (w5 << 8)) & mask; - out[13] = (w5 >> 18) & mask; - out[14] = ((w5 >> 44) | (w6 << 20)) & mask; - out[15] = (w6 >> 6) & mask; - out[16] = (w6 >> 32) & mask; - out[17] = ((w6 >> 58) | (w7 << 6)) & mask; - out[18] = (w7 >> 20) & mask; - out[19] = ((w7 >> 46) | (w8 << 18)) & mask; - out[20] = (w8 >> 8) & mask; - out[21] = (w8 >> 34) & mask; - out[22] = ((w8 >> 60) | (w9 << 4)) & mask; - out[23] = (w9 >> 22) & mask; - out[24] = ((w9 >> 48) | (w10 << 16)) & mask; - out[25] = (w10 >> 10) & mask; - out[26] = (w10 >> 36) & mask; - out[27] = ((w10 >> 62) | (w11 << 2)) & mask; - out[28] = (w11 >> 24) & mask; - out[29] = ((w11 >> 50) | (w12 << 14)) & mask; - out[30] = (w12 >> 12) & mask; - out[31] = w12 >> 38; - - return in; -} - -inline const 
uint8_t* unpack27_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 134217727ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 27) & mask; - out[2] = ((w0 >> 54) | (w1 << 10)) & mask; - out[3] = (w1 >> 17) & mask; - out[4] = ((w1 >> 44) | (w2 << 20)) & mask; - out[5] = (w2 >> 7) & mask; - out[6] = (w2 >> 34) & mask; - out[7] = ((w2 >> 61) | (w3 << 3)) & mask; - out[8] = (w3 >> 24) & mask; - out[9] = ((w3 >> 51) | (w4 << 13)) & mask; - out[10] = (w4 >> 14) & mask; - out[11] = ((w4 >> 41) | (w5 << 23)) & mask; - out[12] = (w5 >> 4) & mask; - out[13] = (w5 >> 31) & mask; - out[14] = ((w5 >> 58) | (w6 << 6)) & mask; - out[15] = (w6 >> 21) & 
mask; - out[16] = ((w6 >> 48) | (w7 << 16)) & mask; - out[17] = (w7 >> 11) & mask; - out[18] = ((w7 >> 38) | (w8 << 26)) & mask; - out[19] = (w8 >> 1) & mask; - out[20] = (w8 >> 28) & mask; - out[21] = ((w8 >> 55) | (w9 << 9)) & mask; - out[22] = (w9 >> 18) & mask; - out[23] = ((w9 >> 45) | (w10 << 19)) & mask; - out[24] = (w10 >> 8) & mask; - out[25] = (w10 >> 35) & mask; - out[26] = ((w10 >> 62) | (w11 << 2)) & mask; - out[27] = (w11 >> 25) & mask; - out[28] = ((w11 >> 52) | (w12 << 12)) & mask; - out[29] = (w12 >> 15) & mask; - out[30] = ((w12 >> 42) | (w13 << 22)) & mask; - out[31] = (w13 >> 5) & mask; - - return in; -} - -inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 268435455ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t 
w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 28) & mask; - out[2] = ((w0 >> 56) | (w1 << 8)) & mask; - out[3] = (w1 >> 20) & mask; - out[4] = ((w1 >> 48) | (w2 << 16)) & mask; - out[5] = (w2 >> 12) & mask; - out[6] = ((w2 >> 40) | (w3 << 24)) & mask; - out[7] = (w3 >> 4) & mask; - out[8] = (w3 >> 32) & mask; - out[9] = ((w3 >> 60) | (w4 << 4)) & mask; - out[10] = (w4 >> 24) & mask; - out[11] = ((w4 >> 52) | (w5 << 12)) & mask; - out[12] = (w5 >> 16) & mask; - out[13] = ((w5 >> 44) | (w6 << 20)) & mask; - out[14] = (w6 >> 8) & mask; - out[15] = w6 >> 36; - out[16] = (w7)&mask; - out[17] = (w7 >> 28) & mask; - out[18] = ((w7 >> 56) | (w8 << 8)) & mask; - out[19] = (w8 >> 20) & mask; - out[20] = ((w8 >> 48) | (w9 << 16)) & mask; - out[21] = (w9 >> 12) & mask; - out[22] = ((w9 >> 40) | (w10 << 24)) & mask; - out[23] = (w10 >> 4) & mask; - out[24] = (w10 >> 32) & mask; - out[25] = ((w10 >> 60) | (w11 << 4)) & mask; - out[26] = (w11 >> 24) & mask; - out[27] = ((w11 >> 52) | (w12 << 12)) & mask; - out[28] = (w12 >> 16) & mask; - out[29] = ((w12 >> 44) | (w13 << 20)) & mask; - out[30] = (w13 >> 8) & mask; - out[31] = w13 >> 36; - - return in; -} - -inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 536870911ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = 
arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 29) & mask; - out[2] = ((w0 >> 58) | (w1 << 6)) & mask; - out[3] = (w1 >> 23) & mask; - out[4] = ((w1 >> 52) | (w2 << 12)) & mask; - out[5] = (w2 >> 17) & mask; - out[6] = ((w2 >> 46) | (w3 << 18)) & mask; - out[7] = (w3 >> 11) & mask; - out[8] = ((w3 >> 40) | (w4 << 24)) & mask; - out[9] = (w4 >> 5) & mask; - out[10] = (w4 >> 34) & mask; - out[11] = ((w4 >> 63) | (w5 << 1)) & mask; - out[12] = (w5 >> 28) & mask; - out[13] = ((w5 >> 57) | (w6 << 7)) & mask; - out[14] = (w6 >> 22) & mask; - out[15] = ((w6 >> 51) | (w7 << 13)) & mask; - out[16] = (w7 >> 16) & mask; - out[17] = ((w7 >> 45) | (w8 << 19)) & mask; - out[18] = (w8 >> 10) & mask; - out[19] = ((w8 >> 39) | (w9 << 25)) & mask; - out[20] = (w9 >> 4) & mask; - out[21] = (w9 >> 33) & mask; - out[22] = ((w9 >> 62) | (w10 << 2)) & mask; - out[23] = (w10 >> 27) & mask; - out[24] = ((w10 >> 56) | (w11 << 8)) & mask; - out[25] = (w11 >> 21) & mask; - out[26] = ((w11 >> 50) | (w12 << 14)) & mask; - out[27] = (w12 >> 15) & mask; - out[28] = ((w12 >> 44) | (w13 << 20)) & mask; - out[29] = (w13 >> 9) & mask; - out[30] = ((w13 >> 38) | (w14 << 26)) & mask; - out[31] = (w14 
>> 3) & mask; - - return in; -} - -inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1073741823ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 30) & mask; - out[2] = ((w0 >> 60) | (w1 << 4)) & mask; - out[3] = (w1 >> 26) & mask; - out[4] = ((w1 >> 56) | (w2 << 8)) & mask; - out[5] = (w2 >> 22) & mask; - out[6] = ((w2 >> 52) | (w3 << 12)) & mask; - out[7] = (w3 >> 18) & mask; - out[8] = ((w3 >> 48) | (w4 << 16)) & mask; - out[9] = (w4 >> 14) & mask; - out[10] = ((w4 >> 44) | (w5 << 20)) & mask; - out[11] = (w5 >> 
10) & mask; - out[12] = ((w5 >> 40) | (w6 << 24)) & mask; - out[13] = (w6 >> 6) & mask; - out[14] = ((w6 >> 36) | (w7 << 28)) & mask; - out[15] = (w7 >> 2) & mask; - out[16] = (w7 >> 32) & mask; - out[17] = ((w7 >> 62) | (w8 << 2)) & mask; - out[18] = (w8 >> 28) & mask; - out[19] = ((w8 >> 58) | (w9 << 6)) & mask; - out[20] = (w9 >> 24) & mask; - out[21] = ((w9 >> 54) | (w10 << 10)) & mask; - out[22] = (w10 >> 20) & mask; - out[23] = ((w10 >> 50) | (w11 << 14)) & mask; - out[24] = (w11 >> 16) & mask; - out[25] = ((w11 >> 46) | (w12 << 18)) & mask; - out[26] = (w12 >> 12) & mask; - out[27] = ((w12 >> 42) | (w13 << 22)) & mask; - out[28] = (w13 >> 8) & mask; - out[29] = ((w13 >> 38) | (w14 << 26)) & mask; - out[30] = (w14 >> 4) & mask; - out[31] = w14 >> 34; - - return in; -} - -inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2147483647ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = 
arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 31) & mask; - out[2] = ((w0 >> 62) | (w1 << 2)) & mask; - out[3] = (w1 >> 29) & mask; - out[4] = ((w1 >> 60) | (w2 << 4)) & mask; - out[5] = (w2 >> 27) & mask; - out[6] = ((w2 >> 58) | (w3 << 6)) & mask; - out[7] = (w3 >> 25) & mask; - out[8] = ((w3 >> 56) | (w4 << 8)) & mask; - out[9] = (w4 >> 23) & mask; - out[10] = ((w4 >> 54) | (w5 << 10)) & mask; - out[11] = (w5 >> 21) & mask; - out[12] = ((w5 >> 52) | (w6 << 12)) & mask; - out[13] = (w6 >> 19) & mask; - out[14] = ((w6 >> 50) | (w7 << 14)) & mask; - out[15] = (w7 >> 17) & mask; - out[16] = ((w7 >> 48) | (w8 << 16)) & mask; - out[17] = (w8 >> 15) & mask; - out[18] = ((w8 >> 46) | (w9 << 18)) & mask; - out[19] = (w9 >> 13) & mask; - out[20] = ((w9 >> 44) | (w10 << 20)) & mask; - out[21] = (w10 >> 11) & mask; - out[22] = ((w10 >> 42) | (w11 << 22)) & mask; - out[23] = (w11 >> 9) & mask; - out[24] = ((w11 >> 40) | (w12 << 24)) & mask; - out[25] = (w12 >> 7) & mask; - out[26] = ((w12 >> 38) | (w13 << 26)) & mask; - out[27] = (w13 >> 5) & mask; - out[28] = ((w13 >> 36) | (w14 << 28)) & mask; - out[29] = (w14 >> 3) & mask; - out[30] = ((w14 >> 34) | (w15 << 30)) & mask; - out[31] = (w15 >> 1) & mask; - - return in; -} - -inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4294967295ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = 
util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - out[0] = (w0)&mask; - out[1] = w0 >> 32; - out[2] = (w1)&mask; - out[3] = w1 >> 32; - out[4] = (w2)&mask; - out[5] = w2 >> 32; - out[6] = (w3)&mask; - out[7] = w3 >> 32; - out[8] = (w4)&mask; - out[9] = w4 >> 32; - out[10] = (w5)&mask; - out[11] = w5 >> 32; - out[12] = (w6)&mask; - out[13] = w6 >> 32; - out[14] = (w7)&mask; - out[15] = w7 >> 32; - out[16] = (w8)&mask; - out[17] = w8 >> 32; - out[18] = (w9)&mask; - out[19] = w9 >> 32; - out[20] = (w10)&mask; - out[21] = w10 >> 32; - out[22] = (w11)&mask; - out[23] = w11 >> 32; - out[24] = (w12)&mask; - out[25] = w12 >> 32; - out[26] = (w13)&mask; - out[27] = w13 >> 32; - out[28] = (w14)&mask; - out[29] = w14 
>> 32; - out[30] = (w15)&mask; - out[31] = w15 >> 32; - - return in; -} - -inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 8589934591ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 33) | (w1 << 31)) & mask; - out[2] = (w1 >> 2) & mask; - out[3] = ((w1 >> 35) | (w2 << 29)) & mask; - out[4] = (w2 >> 4) & mask; - out[5] 
= ((w2 >> 37) | (w3 << 27)) & mask; - out[6] = (w3 >> 6) & mask; - out[7] = ((w3 >> 39) | (w4 << 25)) & mask; - out[8] = (w4 >> 8) & mask; - out[9] = ((w4 >> 41) | (w5 << 23)) & mask; - out[10] = (w5 >> 10) & mask; - out[11] = ((w5 >> 43) | (w6 << 21)) & mask; - out[12] = (w6 >> 12) & mask; - out[13] = ((w6 >> 45) | (w7 << 19)) & mask; - out[14] = (w7 >> 14) & mask; - out[15] = ((w7 >> 47) | (w8 << 17)) & mask; - out[16] = (w8 >> 16) & mask; - out[17] = ((w8 >> 49) | (w9 << 15)) & mask; - out[18] = (w9 >> 18) & mask; - out[19] = ((w9 >> 51) | (w10 << 13)) & mask; - out[20] = (w10 >> 20) & mask; - out[21] = ((w10 >> 53) | (w11 << 11)) & mask; - out[22] = (w11 >> 22) & mask; - out[23] = ((w11 >> 55) | (w12 << 9)) & mask; - out[24] = (w12 >> 24) & mask; - out[25] = ((w12 >> 57) | (w13 << 7)) & mask; - out[26] = (w13 >> 26) & mask; - out[27] = ((w13 >> 59) | (w14 << 5)) & mask; - out[28] = (w14 >> 28) & mask; - out[29] = ((w14 >> 61) | (w15 << 3)) & mask; - out[30] = (w15 >> 30) & mask; - out[31] = ((w15 >> 63) | (w16 << 1)) & mask; - - return in; -} - -inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 17179869183ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = 
arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 34) | (w1 << 30)) & mask; - out[2] = (w1 >> 4) & mask; - out[3] = ((w1 >> 38) | (w2 << 26)) & mask; - out[4] = (w2 >> 8) & mask; - out[5] = ((w2 >> 42) | (w3 << 22)) & mask; - out[6] = (w3 >> 12) & mask; - out[7] = ((w3 >> 46) | (w4 << 18)) & mask; - out[8] = (w4 >> 16) & mask; - out[9] = ((w4 >> 50) | (w5 << 14)) & mask; - out[10] = (w5 >> 20) & mask; - out[11] = ((w5 >> 54) | (w6 << 10)) & mask; - out[12] = (w6 >> 24) & mask; - out[13] = ((w6 >> 58) | (w7 << 6)) & mask; - out[14] = (w7 >> 28) & mask; - out[15] = ((w7 >> 62) | (w8 << 2)) & mask; - out[16] = ((w8 >> 32) | (w9 << 32)) & mask; - out[17] = (w9 >> 2) & mask; - out[18] = ((w9 >> 36) | (w10 << 28)) & mask; - out[19] = (w10 >> 6) & mask; - out[20] = ((w10 >> 40) | (w11 << 24)) & mask; - out[21] = (w11 >> 10) & mask; - out[22] = ((w11 >> 44) | (w12 << 20)) & mask; - out[23] = (w12 >> 14) & mask; - out[24] = ((w12 >> 48) | (w13 << 16)) & mask; - out[25] = (w13 >> 18) & mask; - out[26] = ((w13 >> 52) | (w14 << 12)) & mask; - out[27] = (w14 >> 22) & mask; - out[28] = ((w14 >> 56) | (w15 << 8)) & mask; - out[29] = (w15 >> 26) & mask; - out[30] = ((w15 >> 
60) | (w16 << 4)) & mask; - out[31] = w16 >> 30; - - return in; -} - -inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 34359738367ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 35) | (w1 << 29)) & mask; - out[2] = (w1 >> 
6) & mask; - out[3] = ((w1 >> 41) | (w2 << 23)) & mask; - out[4] = (w2 >> 12) & mask; - out[5] = ((w2 >> 47) | (w3 << 17)) & mask; - out[6] = (w3 >> 18) & mask; - out[7] = ((w3 >> 53) | (w4 << 11)) & mask; - out[8] = (w4 >> 24) & mask; - out[9] = ((w4 >> 59) | (w5 << 5)) & mask; - out[10] = ((w5 >> 30) | (w6 << 34)) & mask; - out[11] = (w6 >> 1) & mask; - out[12] = ((w6 >> 36) | (w7 << 28)) & mask; - out[13] = (w7 >> 7) & mask; - out[14] = ((w7 >> 42) | (w8 << 22)) & mask; - out[15] = (w8 >> 13) & mask; - out[16] = ((w8 >> 48) | (w9 << 16)) & mask; - out[17] = (w9 >> 19) & mask; - out[18] = ((w9 >> 54) | (w10 << 10)) & mask; - out[19] = (w10 >> 25) & mask; - out[20] = ((w10 >> 60) | (w11 << 4)) & mask; - out[21] = ((w11 >> 31) | (w12 << 33)) & mask; - out[22] = (w12 >> 2) & mask; - out[23] = ((w12 >> 37) | (w13 << 27)) & mask; - out[24] = (w13 >> 8) & mask; - out[25] = ((w13 >> 43) | (w14 << 21)) & mask; - out[26] = (w14 >> 14) & mask; - out[27] = ((w14 >> 49) | (w15 << 15)) & mask; - out[28] = (w15 >> 20) & mask; - out[29] = ((w15 >> 55) | (w16 << 9)) & mask; - out[30] = (w16 >> 26) & mask; - out[31] = ((w16 >> 61) | (w17 << 3)) & mask; - - return in; -} - -inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 68719476735ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = 
arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 36) | (w1 << 28)) & mask; - out[2] = (w1 >> 8) & mask; - out[3] = ((w1 >> 44) | (w2 << 20)) & mask; - out[4] = (w2 >> 16) & mask; - out[5] = ((w2 >> 52) | (w3 << 12)) & mask; - out[6] = (w3 >> 24) & mask; - out[7] = ((w3 >> 60) | (w4 << 4)) & mask; - out[8] = ((w4 >> 32) | (w5 << 32)) & mask; - out[9] = (w5 >> 4) & mask; - out[10] = ((w5 >> 40) | (w6 << 24)) & mask; - out[11] = (w6 >> 12) & mask; - out[12] = ((w6 >> 48) | (w7 << 16)) & mask; - out[13] = (w7 >> 20) & mask; - out[14] = ((w7 >> 56) | (w8 << 8)) & mask; - out[15] = w8 >> 28; - out[16] = (w9)&mask; - out[17] = ((w9 >> 36) | (w10 << 28)) & mask; - out[18] = (w10 >> 8) & mask; - out[19] = ((w10 >> 44) | (w11 << 20)) & mask; - out[20] = (w11 >> 16) & mask; - out[21] = ((w11 >> 52) | (w12 << 12)) & mask; - out[22] = (w12 >> 24) & mask; - out[23] = ((w12 >> 60) | (w13 << 4)) & mask; - out[24] = ((w13 >> 32) | (w14 << 32)) & mask; - out[25] = (w14 >> 4) & mask; - 
out[26] = ((w14 >> 40) | (w15 << 24)) & mask; - out[27] = (w15 >> 12) & mask; - out[28] = ((w15 >> 48) | (w16 << 16)) & mask; - out[29] = (w16 >> 20) & mask; - out[30] = ((w16 >> 56) | (w17 << 8)) & mask; - out[31] = w17 >> 28; - - return in; -} - -inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 137438953471ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - 
uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 37) | (w1 << 27)) & mask; - out[2] = (w1 >> 10) & mask; - out[3] = ((w1 >> 47) | (w2 << 17)) & mask; - out[4] = (w2 >> 20) & mask; - out[5] = ((w2 >> 57) | (w3 << 7)) & mask; - out[6] = ((w3 >> 30) | (w4 << 34)) & mask; - out[7] = (w4 >> 3) & mask; - out[8] = ((w4 >> 40) | (w5 << 24)) & mask; - out[9] = (w5 >> 13) & mask; - out[10] = ((w5 >> 50) | (w6 << 14)) & mask; - out[11] = (w6 >> 23) & mask; - out[12] = ((w6 >> 60) | (w7 << 4)) & mask; - out[13] = ((w7 >> 33) | (w8 << 31)) & mask; - out[14] = (w8 >> 6) & mask; - out[15] = ((w8 >> 43) | (w9 << 21)) & mask; - out[16] = (w9 >> 16) & mask; - out[17] = ((w9 >> 53) | (w10 << 11)) & mask; - out[18] = (w10 >> 26) & mask; - out[19] = ((w10 >> 63) | (w11 << 1)) & mask; - out[20] = ((w11 >> 36) | (w12 << 28)) & mask; - out[21] = (w12 >> 9) & mask; - out[22] = ((w12 >> 46) | (w13 << 18)) & mask; - out[23] = (w13 >> 19) & mask; - out[24] = ((w13 >> 56) | (w14 << 8)) & mask; - out[25] = ((w14 >> 29) | (w15 << 35)) & mask; - out[26] = (w15 >> 2) & mask; - out[27] = ((w15 >> 39) | (w16 << 25)) & mask; - out[28] = (w16 >> 12) & mask; - out[29] = ((w16 >> 49) | (w17 << 15)) & mask; - out[30] = (w17 >> 22) & mask; - out[31] = ((w17 >> 59) | (w18 << 5)) & mask; - - return in; -} - -inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 274877906943ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = 
util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 38) | (w1 << 26)) & mask; - out[2] = (w1 >> 12) & mask; - out[3] = ((w1 >> 50) | (w2 << 14)) & mask; - out[4] = (w2 >> 24) & mask; - out[5] = ((w2 >> 62) | (w3 << 2)) & mask; - out[6] = ((w3 >> 36) | (w4 << 28)) & mask; - out[7] = (w4 >> 10) & mask; - out[8] = ((w4 >> 48) | (w5 << 16)) & mask; - out[9] = (w5 >> 22) & mask; - out[10] = ((w5 >> 60) | (w6 << 4)) & mask; - out[11] = ((w6 >> 34) | (w7 << 30)) & mask; - out[12] = (w7 >> 8) & mask; - out[13] = ((w7 >> 46) | (w8 << 18)) & mask; - out[14] = (w8 >> 20) & mask; 
- out[15] = ((w8 >> 58) | (w9 << 6)) & mask; - out[16] = ((w9 >> 32) | (w10 << 32)) & mask; - out[17] = (w10 >> 6) & mask; - out[18] = ((w10 >> 44) | (w11 << 20)) & mask; - out[19] = (w11 >> 18) & mask; - out[20] = ((w11 >> 56) | (w12 << 8)) & mask; - out[21] = ((w12 >> 30) | (w13 << 34)) & mask; - out[22] = (w13 >> 4) & mask; - out[23] = ((w13 >> 42) | (w14 << 22)) & mask; - out[24] = (w14 >> 16) & mask; - out[25] = ((w14 >> 54) | (w15 << 10)) & mask; - out[26] = ((w15 >> 28) | (w16 << 36)) & mask; - out[27] = (w16 >> 2) & mask; - out[28] = ((w16 >> 40) | (w17 << 24)) & mask; - out[29] = (w17 >> 14) & mask; - out[30] = ((w17 >> 52) | (w18 << 12)) & mask; - out[31] = w18 >> 26; - - return in; -} - -inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 549755813887ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = 
util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 39) | (w1 << 25)) & mask; - out[2] = (w1 >> 14) & mask; - out[3] = ((w1 >> 53) | (w2 << 11)) & mask; - out[4] = ((w2 >> 28) | (w3 << 36)) & mask; - out[5] = (w3 >> 3) & mask; - out[6] = ((w3 >> 42) | (w4 << 22)) & mask; - out[7] = (w4 >> 17) & mask; - out[8] = ((w4 >> 56) | (w5 << 8)) & mask; - out[9] = ((w5 >> 31) | (w6 << 33)) & mask; - out[10] = (w6 >> 6) & mask; - out[11] = ((w6 >> 45) | (w7 << 19)) & mask; - out[12] = (w7 >> 20) & mask; - out[13] = ((w7 >> 59) | (w8 << 5)) & mask; - out[14] = ((w8 >> 34) | (w9 << 30)) & mask; - out[15] = (w9 >> 9) & mask; - out[16] = ((w9 >> 48) | (w10 << 16)) & mask; - out[17] = (w10 >> 23) & mask; - out[18] = ((w10 >> 62) | (w11 << 2)) & mask; - out[19] = ((w11 >> 37) | (w12 << 27)) & mask; - out[20] = (w12 >> 12) & mask; - out[21] = ((w12 >> 51) | (w13 << 13)) & mask; - out[22] = ((w13 >> 26) | (w14 << 38)) & mask; - out[23] = (w14 >> 1) & mask; - out[24] = ((w14 >> 40) | (w15 << 24)) & mask; - out[25] = (w15 >> 15) & mask; - out[26] = ((w15 >> 54) | (w16 << 10)) & mask; - out[27] = ((w16 >> 29) | (w17 << 35)) & mask; - out[28] = (w17 >> 4) & mask; - out[29] = ((w17 >> 43) | (w18 << 21)) & mask; - out[30] = (w18 >> 18) & mask; - 
out[31] = ((w18 >> 57) | (w19 << 7)) & mask; - - return in; -} - -inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1099511627775ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); 
- in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 40) | (w1 << 24)) & mask; - out[2] = (w1 >> 16) & mask; - out[3] = ((w1 >> 56) | (w2 << 8)) & mask; - out[4] = ((w2 >> 32) | (w3 << 32)) & mask; - out[5] = (w3 >> 8) & mask; - out[6] = ((w3 >> 48) | (w4 << 16)) & mask; - out[7] = w4 >> 24; - out[8] = (w5)&mask; - out[9] = ((w5 >> 40) | (w6 << 24)) & mask; - out[10] = (w6 >> 16) & mask; - out[11] = ((w6 >> 56) | (w7 << 8)) & mask; - out[12] = ((w7 >> 32) | (w8 << 32)) & mask; - out[13] = (w8 >> 8) & mask; - out[14] = ((w8 >> 48) | (w9 << 16)) & mask; - out[15] = w9 >> 24; - out[16] = (w10)&mask; - out[17] = ((w10 >> 40) | (w11 << 24)) & mask; - out[18] = (w11 >> 16) & mask; - out[19] = ((w11 >> 56) | (w12 << 8)) & mask; - out[20] = ((w12 >> 32) | (w13 << 32)) & mask; - out[21] = (w13 >> 8) & mask; - out[22] = ((w13 >> 48) | (w14 << 16)) & mask; - out[23] = w14 >> 24; - out[24] = (w15)&mask; - out[25] = ((w15 >> 40) | (w16 << 24)) & mask; - out[26] = (w16 >> 16) & mask; - out[27] = ((w16 >> 56) | (w17 << 8)) & mask; - out[28] = ((w17 >> 32) | (w18 << 32)) & mask; - out[29] = (w18 >> 8) & mask; - out[30] = ((w18 >> 48) | (w19 << 16)) & mask; - out[31] = w19 >> 24; - - return in; -} - -inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2199023255551ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = 
util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 41) | (w1 << 23)) & mask; - out[2] = (w1 >> 18) & mask; - out[3] = ((w1 >> 59) | (w2 << 5)) & mask; - out[4] = ((w2 >> 36) | (w3 << 28)) & mask; - out[5] = (w3 >> 13) & mask; - out[6] = ((w3 >> 54) | (w4 << 10)) & mask; - out[7] = ((w4 >> 31) | (w5 << 33)) & mask; - out[8] = (w5 >> 8) & mask; - out[9] = ((w5 >> 49) | (w6 << 15)) & mask; - out[10] = ((w6 >> 26) | (w7 << 38)) & mask; - out[11] = (w7 >> 3) & mask; - out[12] = ((w7 >> 44) | (w8 << 20)) & mask; - out[13] = (w8 >> 21) & mask; - out[14] = ((w8 >> 62) 
| (w9 << 2)) & mask; - out[15] = ((w9 >> 39) | (w10 << 25)) & mask; - out[16] = (w10 >> 16) & mask; - out[17] = ((w10 >> 57) | (w11 << 7)) & mask; - out[18] = ((w11 >> 34) | (w12 << 30)) & mask; - out[19] = (w12 >> 11) & mask; - out[20] = ((w12 >> 52) | (w13 << 12)) & mask; - out[21] = ((w13 >> 29) | (w14 << 35)) & mask; - out[22] = (w14 >> 6) & mask; - out[23] = ((w14 >> 47) | (w15 << 17)) & mask; - out[24] = ((w15 >> 24) | (w16 << 40)) & mask; - out[25] = (w16 >> 1) & mask; - out[26] = ((w16 >> 42) | (w17 << 22)) & mask; - out[27] = (w17 >> 19) & mask; - out[28] = ((w17 >> 60) | (w18 << 4)) & mask; - out[29] = ((w18 >> 37) | (w19 << 27)) & mask; - out[30] = (w19 >> 14) & mask; - out[31] = ((w19 >> 55) | (w20 << 9)) & mask; - - return in; -} - -inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4398046511103ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = 
arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 42) | (w1 << 22)) & mask; - out[2] = (w1 >> 20) & mask; - out[3] = ((w1 >> 62) | (w2 << 2)) & mask; - out[4] = ((w2 >> 40) | (w3 << 24)) & mask; - out[5] = (w3 >> 18) & mask; - out[6] = ((w3 >> 60) | (w4 << 4)) & mask; - out[7] = ((w4 >> 38) | (w5 << 26)) & mask; - out[8] = (w5 >> 16) & mask; - out[9] = ((w5 >> 58) | (w6 << 6)) & mask; - out[10] = ((w6 >> 36) | (w7 << 28)) & mask; - out[11] = (w7 >> 14) & mask; - out[12] = ((w7 >> 56) | (w8 << 8)) & mask; - out[13] = ((w8 >> 34) | (w9 << 30)) & mask; - out[14] = (w9 >> 12) & mask; - out[15] = ((w9 >> 54) | (w10 << 10)) & mask; - out[16] = ((w10 >> 32) | (w11 << 32)) & mask; - out[17] = (w11 >> 10) & mask; - out[18] = ((w11 >> 52) | (w12 << 12)) & mask; - out[19] = ((w12 >> 30) | (w13 << 34)) & mask; - out[20] = (w13 >> 8) & mask; - out[21] = ((w13 >> 50) | (w14 << 14)) & mask; - out[22] = ((w14 >> 28) | (w15 << 36)) & mask; - out[23] = (w15 >> 6) & mask; - out[24] = ((w15 >> 48) | (w16 << 16)) & mask; - out[25] = ((w16 >> 26) | (w17 << 38)) & mask; - out[26] = 
(w17 >> 4) & mask; - out[27] = ((w17 >> 46) | (w18 << 18)) & mask; - out[28] = ((w18 >> 24) | (w19 << 40)) & mask; - out[29] = (w19 >> 2) & mask; - out[30] = ((w19 >> 44) | (w20 << 20)) & mask; - out[31] = w20 >> 22; - - return in; -} - -inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 8796093022207ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t 
w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 43) | (w1 << 21)) & mask; - out[2] = ((w1 >> 22) | (w2 << 42)) & mask; - out[3] = (w2 >> 1) & mask; - out[4] = ((w2 >> 44) | (w3 << 20)) & mask; - out[5] = ((w3 >> 23) | (w4 << 41)) & mask; - out[6] = (w4 >> 2) & mask; - out[7] = ((w4 >> 45) | (w5 << 19)) & mask; - out[8] = ((w5 >> 24) | (w6 << 40)) & mask; - out[9] = (w6 >> 3) & mask; - out[10] = ((w6 >> 46) | (w7 << 18)) & mask; - out[11] = ((w7 >> 25) | (w8 << 39)) & mask; - out[12] = (w8 >> 4) & mask; - out[13] = ((w8 >> 47) | (w9 << 17)) & mask; - out[14] = ((w9 >> 26) | (w10 << 38)) & mask; - out[15] = (w10 >> 5) & mask; - out[16] = ((w10 >> 48) | (w11 << 16)) & mask; - out[17] = ((w11 >> 27) | (w12 << 37)) & mask; - out[18] = (w12 >> 6) & mask; - out[19] = ((w12 >> 49) | (w13 << 15)) & mask; - out[20] = ((w13 >> 28) | (w14 << 36)) & mask; - out[21] = (w14 >> 7) & mask; - out[22] = ((w14 >> 50) | (w15 << 14)) & mask; - out[23] = ((w15 >> 29) | (w16 << 35)) & mask; - out[24] = (w16 >> 8) & mask; - out[25] = ((w16 >> 51) | (w17 << 13)) & mask; - out[26] = ((w17 >> 30) | (w18 << 34)) & mask; - out[27] = (w18 >> 9) & mask; - out[28] = ((w18 >> 52) | (w19 << 12)) & mask; - out[29] = ((w19 >> 31) | (w20 << 33)) & mask; - out[30] = (w20 >> 10) & mask; - out[31] = ((w20 >> 53) | (w21 << 11)) & mask; - - return in; -} - -inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 17592186044415ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = 
arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = 
util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 44) | (w1 << 20)) & mask; - out[2] = ((w1 >> 24) | (w2 << 40)) & mask; - out[3] = (w2 >> 4) & mask; - out[4] = ((w2 >> 48) | (w3 << 16)) & mask; - out[5] = ((w3 >> 28) | (w4 << 36)) & mask; - out[6] = (w4 >> 8) & mask; - out[7] = ((w4 >> 52) | (w5 << 12)) & mask; - out[8] = ((w5 >> 32) | (w6 << 32)) & mask; - out[9] = (w6 >> 12) & mask; - out[10] = ((w6 >> 56) | (w7 << 8)) & mask; - out[11] = ((w7 >> 36) | (w8 << 28)) & mask; - out[12] = (w8 >> 16) & mask; - out[13] = ((w8 >> 60) | (w9 << 4)) & mask; - out[14] = ((w9 >> 40) | (w10 << 24)) & mask; - out[15] = w10 >> 20; - out[16] = (w11)&mask; - out[17] = ((w11 >> 44) | (w12 << 20)) & mask; - out[18] = ((w12 >> 24) | (w13 << 40)) & mask; - out[19] = (w13 >> 4) & mask; - out[20] = ((w13 >> 48) | (w14 << 16)) & mask; - out[21] = ((w14 >> 28) | (w15 << 36)) & mask; - out[22] = (w15 >> 8) & mask; - out[23] = ((w15 >> 52) | (w16 << 12)) & mask; - out[24] = ((w16 >> 32) | (w17 << 32)) & mask; - out[25] = (w17 >> 12) & mask; - out[26] = ((w17 >> 56) | (w18 << 8)) & mask; - out[27] = ((w18 >> 36) | (w19 << 28)) & mask; - out[28] = (w19 >> 16) & mask; - out[29] = ((w19 >> 60) | (w20 << 4)) & mask; - out[30] = ((w20 >> 40) | (w21 << 24)) & mask; - out[31] = w21 >> 20; - - return in; -} - -inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 35184372088831ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - 
w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 45) | (w1 << 19)) & mask; - out[2] = ((w1 >> 26) | (w2 << 38)) & mask; - out[3] = (w2 >> 7) & mask; - out[4] = ((w2 >> 52) | (w3 << 12)) & mask; - out[5] = ((w3 >> 33) | (w4 << 31)) & mask; - out[6] = (w4 >> 14) & mask; - out[7] = ((w4 >> 59) | (w5 
<< 5)) & mask; - out[8] = ((w5 >> 40) | (w6 << 24)) & mask; - out[9] = ((w6 >> 21) | (w7 << 43)) & mask; - out[10] = (w7 >> 2) & mask; - out[11] = ((w7 >> 47) | (w8 << 17)) & mask; - out[12] = ((w8 >> 28) | (w9 << 36)) & mask; - out[13] = (w9 >> 9) & mask; - out[14] = ((w9 >> 54) | (w10 << 10)) & mask; - out[15] = ((w10 >> 35) | (w11 << 29)) & mask; - out[16] = (w11 >> 16) & mask; - out[17] = ((w11 >> 61) | (w12 << 3)) & mask; - out[18] = ((w12 >> 42) | (w13 << 22)) & mask; - out[19] = ((w13 >> 23) | (w14 << 41)) & mask; - out[20] = (w14 >> 4) & mask; - out[21] = ((w14 >> 49) | (w15 << 15)) & mask; - out[22] = ((w15 >> 30) | (w16 << 34)) & mask; - out[23] = (w16 >> 11) & mask; - out[24] = ((w16 >> 56) | (w17 << 8)) & mask; - out[25] = ((w17 >> 37) | (w18 << 27)) & mask; - out[26] = (w18 >> 18) & mask; - out[27] = ((w18 >> 63) | (w19 << 1)) & mask; - out[28] = ((w19 >> 44) | (w20 << 20)) & mask; - out[29] = ((w20 >> 25) | (w21 << 39)) & mask; - out[30] = (w21 >> 6) & mask; - out[31] = ((w21 >> 51) | (w22 << 13)) & mask; - - return in; -} - -inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 70368744177663ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = 
arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 46) | (w1 << 18)) & mask; - out[2] = ((w1 >> 28) | (w2 << 36)) & mask; - out[3] = (w2 >> 10) & mask; - out[4] = ((w2 >> 56) | (w3 << 8)) & mask; - out[5] = ((w3 >> 38) | (w4 << 26)) & mask; - out[6] = ((w4 >> 20) | (w5 << 44)) & mask; - out[7] = (w5 >> 2) & mask; - out[8] = ((w5 >> 48) | (w6 << 16)) & mask; - out[9] = ((w6 >> 30) | (w7 << 34)) & mask; - out[10] = (w7 >> 12) & mask; - out[11] = ((w7 >> 58) | (w8 << 6)) & mask; - out[12] = ((w8 >> 40) | (w9 << 24)) & mask; - out[13] = ((w9 >> 22) | (w10 << 42)) & mask; - out[14] = (w10 
>> 4) & mask; - out[15] = ((w10 >> 50) | (w11 << 14)) & mask; - out[16] = ((w11 >> 32) | (w12 << 32)) & mask; - out[17] = (w12 >> 14) & mask; - out[18] = ((w12 >> 60) | (w13 << 4)) & mask; - out[19] = ((w13 >> 42) | (w14 << 22)) & mask; - out[20] = ((w14 >> 24) | (w15 << 40)) & mask; - out[21] = (w15 >> 6) & mask; - out[22] = ((w15 >> 52) | (w16 << 12)) & mask; - out[23] = ((w16 >> 34) | (w17 << 30)) & mask; - out[24] = (w17 >> 16) & mask; - out[25] = ((w17 >> 62) | (w18 << 2)) & mask; - out[26] = ((w18 >> 44) | (w19 << 20)) & mask; - out[27] = ((w19 >> 26) | (w20 << 38)) & mask; - out[28] = (w20 >> 8) & mask; - out[29] = ((w20 >> 54) | (w21 << 10)) & mask; - out[30] = ((w21 >> 36) | (w22 << 28)) & mask; - out[31] = w22 >> 18; - - return in; -} - -inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 140737488355327ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = 
arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 47) | (w1 << 17)) & mask; - out[2] = ((w1 >> 30) | (w2 << 34)) & mask; - out[3] = (w2 >> 13) & mask; - out[4] = ((w2 >> 60) | (w3 << 4)) & mask; - out[5] = ((w3 >> 43) | (w4 << 21)) & mask; - out[6] = ((w4 >> 26) | (w5 << 38)) & mask; - out[7] = (w5 >> 9) & mask; - out[8] = ((w5 >> 56) | (w6 << 8)) & mask; - out[9] = ((w6 >> 39) | (w7 << 25)) & mask; - out[10] = ((w7 >> 22) | (w8 << 42)) & mask; - out[11] = (w8 >> 5) & mask; - out[12] = ((w8 >> 52) | (w9 << 12)) & mask; - out[13] = ((w9 >> 35) | (w10 << 29)) & mask; - out[14] = ((w10 >> 18) | (w11 << 46)) & mask; - out[15] = (w11 >> 1) & mask; - out[16] = ((w11 >> 48) | (w12 << 16)) & mask; - out[17] = ((w12 >> 31) | (w13 << 33)) & mask; - out[18] = (w13 >> 14) & mask; - 
out[19] = ((w13 >> 61) | (w14 << 3)) & mask; - out[20] = ((w14 >> 44) | (w15 << 20)) & mask; - out[21] = ((w15 >> 27) | (w16 << 37)) & mask; - out[22] = (w16 >> 10) & mask; - out[23] = ((w16 >> 57) | (w17 << 7)) & mask; - out[24] = ((w17 >> 40) | (w18 << 24)) & mask; - out[25] = ((w18 >> 23) | (w19 << 41)) & mask; - out[26] = (w19 >> 6) & mask; - out[27] = ((w19 >> 53) | (w20 << 11)) & mask; - out[28] = ((w20 >> 36) | (w21 << 28)) & mask; - out[29] = ((w21 >> 19) | (w22 << 45)) & mask; - out[30] = (w22 >> 2) & mask; - out[31] = ((w22 >> 49) | (w23 << 15)) & mask; - - return in; -} - -inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 281474976710655ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = 
arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 48) | (w1 << 16)) & mask; - out[2] = ((w1 >> 32) | (w2 << 32)) & mask; - out[3] = w2 >> 16; - out[4] = (w3)&mask; - out[5] = ((w3 >> 48) | (w4 << 16)) & mask; - out[6] = ((w4 >> 32) | (w5 << 32)) & mask; - out[7] = w5 >> 16; - out[8] = (w6)&mask; - out[9] = ((w6 >> 48) | (w7 << 16)) & mask; - out[10] = ((w7 >> 32) | (w8 << 32)) & mask; - out[11] = w8 >> 16; - out[12] = (w9)&mask; - out[13] = ((w9 >> 48) | (w10 << 16)) & mask; - out[14] = ((w10 >> 32) | (w11 << 32)) & mask; - out[15] = w11 >> 16; - out[16] = (w12)&mask; - out[17] = ((w12 >> 48) | (w13 << 16)) & mask; - out[18] = ((w13 >> 32) | (w14 << 32)) & mask; - out[19] = w14 >> 16; - out[20] = (w15)&mask; - out[21] = ((w15 >> 48) | (w16 << 16)) & mask; - out[22] = ((w16 >> 32) | (w17 << 32)) & mask; - out[23] = w17 >> 16; - out[24] = (w18)&mask; - out[25] = ((w18 >> 48) | (w19 << 16)) & mask; - out[26] = ((w19 >> 32) | (w20 << 32)) & mask; - out[27] = w20 >> 
16; - out[28] = (w21)&mask; - out[29] = ((w21 >> 48) | (w22 << 16)) & mask; - out[30] = ((w22 >> 32) | (w23 << 32)) & mask; - out[31] = w23 >> 16; - - return in; -} - -inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 562949953421311ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = 
arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 49) | (w1 << 15)) & mask; - out[2] = ((w1 >> 34) | (w2 << 30)) & mask; - out[3] = ((w2 >> 19) | (w3 << 45)) & mask; - out[4] = (w3 >> 4) & mask; - out[5] = ((w3 >> 53) | (w4 << 11)) & mask; - out[6] = ((w4 >> 38) | (w5 << 26)) & mask; - out[7] = ((w5 >> 23) | (w6 << 41)) & mask; - out[8] = (w6 >> 8) & mask; - out[9] = ((w6 >> 57) | (w7 << 7)) & mask; - out[10] = ((w7 >> 42) | (w8 << 22)) & mask; - out[11] = ((w8 >> 27) | (w9 << 37)) & mask; - out[12] = (w9 >> 12) & mask; - out[13] = ((w9 >> 61) | (w10 << 3)) & mask; - out[14] = ((w10 >> 46) | (w11 << 18)) & mask; - out[15] = ((w11 >> 31) | (w12 << 33)) & mask; - out[16] = ((w12 >> 16) | (w13 << 48)) & mask; - out[17] = (w13 >> 1) & mask; - out[18] = ((w13 >> 50) | (w14 << 14)) & mask; - out[19] = ((w14 >> 35) | (w15 << 29)) & mask; - out[20] = ((w15 >> 20) | (w16 << 44)) & mask; - out[21] = (w16 >> 5) & mask; - out[22] = ((w16 >> 54) | (w17 << 10)) & mask; - out[23] = ((w17 >> 39) | (w18 << 25)) & mask; - out[24] = ((w18 >> 24) | (w19 << 40)) & mask; - out[25] = (w19 >> 9) & mask; - out[26] = ((w19 >> 58) | (w20 << 6)) & mask; - out[27] = ((w20 >> 43) | (w21 << 21)) & mask; - out[28] = ((w21 >> 28) | (w22 << 36)) & mask; - out[29] = (w22 >> 13) & mask; 
- out[30] = ((w22 >> 62) | (w23 << 2)) & mask; - out[31] = ((w23 >> 47) | (w24 << 17)) & mask; - - return in; -} - -inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1125899906842623ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = 
util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 50) | (w1 << 14)) & mask; - out[2] = ((w1 >> 36) | (w2 << 28)) & mask; - out[3] = ((w2 >> 22) | (w3 << 42)) & mask; - out[4] = (w3 >> 8) & mask; - out[5] = ((w3 >> 58) | (w4 << 6)) & mask; - out[6] = ((w4 >> 44) | (w5 << 20)) & mask; - out[7] = ((w5 >> 30) | (w6 << 34)) & mask; - out[8] = ((w6 >> 16) | (w7 << 48)) & mask; - out[9] = (w7 >> 2) & mask; - out[10] = ((w7 >> 52) | (w8 << 12)) & mask; - out[11] = ((w8 >> 38) | (w9 << 26)) & mask; - out[12] = ((w9 >> 24) | (w10 << 40)) & mask; - out[13] = (w10 >> 10) & mask; - out[14] = ((w10 >> 60) | (w11 << 4)) & mask; - out[15] = ((w11 >> 46) | (w12 << 18)) & mask; - out[16] = ((w12 >> 32) | (w13 << 32)) & mask; - out[17] = ((w13 >> 18) | (w14 << 46)) & mask; - out[18] = (w14 >> 4) & mask; - out[19] = ((w14 >> 54) | (w15 << 10)) & mask; - out[20] = ((w15 >> 40) | (w16 << 24)) & mask; - out[21] = ((w16 >> 26) | (w17 << 38)) & mask; - out[22] = (w17 >> 12) & mask; - out[23] = ((w17 >> 62) | (w18 << 2)) & mask; - out[24] = ((w18 >> 48) | (w19 << 16)) & mask; - out[25] = ((w19 >> 34) | (w20 << 30)) & mask; - out[26] = ((w20 >> 20) | (w21 << 44)) & mask; - out[27] = (w21 >> 6) & mask; - out[28] = ((w21 >> 56) | (w22 << 8)) & mask; - out[29] = ((w22 >> 42) | (w23 << 22)) & mask; - out[30] = ((w23 >> 28) | (w24 << 36)) & mask; - 
out[31] = w24 >> 14; - - return in; -} - -inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2251799813685247ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t 
w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 51) | (w1 << 13)) & mask; - out[2] = ((w1 >> 38) | (w2 << 26)) & mask; - out[3] = ((w2 >> 25) | (w3 << 39)) & mask; - out[4] = (w3 >> 12) & mask; - out[5] = ((w3 >> 63) | (w4 << 1)) & mask; - out[6] = ((w4 >> 50) | (w5 << 14)) & mask; - out[7] = ((w5 >> 37) | (w6 << 27)) & mask; - out[8] = ((w6 >> 24) | (w7 << 40)) & mask; - out[9] = (w7 >> 11) & mask; - out[10] = ((w7 >> 62) | (w8 << 2)) & mask; - out[11] = ((w8 >> 49) | (w9 << 15)) & mask; - out[12] = ((w9 >> 36) | (w10 << 28)) & mask; - out[13] = ((w10 >> 23) | (w11 << 41)) & mask; - out[14] = (w11 >> 10) & mask; - out[15] = ((w11 >> 61) | (w12 << 3)) & mask; - out[16] = ((w12 >> 48) | (w13 << 16)) & mask; - out[17] = ((w13 >> 35) | (w14 << 29)) & mask; - out[18] = ((w14 >> 22) | (w15 << 42)) & mask; - out[19] = (w15 >> 9) & mask; - out[20] = ((w15 >> 60) | (w16 << 4)) & mask; - out[21] = ((w16 >> 47) | (w17 << 17)) & mask; - out[22] = ((w17 >> 34) | (w18 << 30)) & mask; - out[23] = ((w18 >> 21) | (w19 << 43)) & mask; - out[24] = (w19 >> 8) & mask; - out[25] = ((w19 >> 59) | (w20 << 5)) & mask; - out[26] = ((w20 >> 46) | (w21 << 18)) & mask; - out[27] = ((w21 >> 33) | (w22 << 31)) & mask; - out[28] = ((w22 >> 20) | (w23 << 44)) & mask; - out[29] = (w23 >> 7) & mask; - out[30] = ((w23 >> 58) | (w24 << 6)) & 
mask; - out[31] = ((w24 >> 45) | (w25 << 19)) & mask; - - return in; -} - -inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4503599627370495ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = 
arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 52) | (w1 << 12)) & mask; - out[2] = ((w1 >> 40) | (w2 << 24)) & mask; - out[3] = ((w2 >> 28) | (w3 << 36)) & mask; - out[4] = ((w3 >> 16) | (w4 << 48)) & mask; - out[5] = (w4 >> 4) & mask; - out[6] = ((w4 >> 56) | (w5 << 8)) & mask; - out[7] = ((w5 >> 44) | (w6 << 20)) & mask; - out[8] = ((w6 >> 32) | (w7 << 32)) & mask; - out[9] = ((w7 >> 20) | (w8 << 44)) & mask; - out[10] = (w8 >> 8) & mask; - out[11] = ((w8 >> 60) | (w9 << 4)) & mask; - out[12] = ((w9 >> 48) | (w10 << 16)) & mask; - out[13] = ((w10 >> 36) | (w11 << 28)) & mask; - out[14] = ((w11 >> 24) | (w12 << 40)) & mask; - out[15] = w12 >> 12; - out[16] = (w13)&mask; - out[17] = ((w13 >> 52) | (w14 << 12)) & mask; - out[18] = ((w14 >> 40) | (w15 << 24)) & mask; - out[19] = ((w15 >> 28) | (w16 << 36)) & mask; - out[20] = ((w16 >> 16) | (w17 << 48)) & mask; - out[21] = (w17 >> 4) & mask; - out[22] = ((w17 >> 56) | (w18 << 8)) & mask; - out[23] = ((w18 >> 44) | (w19 << 20)) & mask; - out[24] = ((w19 >> 32) | (w20 << 32)) & mask; - out[25] = ((w20 >> 20) | (w21 << 44)) & mask; - out[26] = (w21 >> 8) & mask; - out[27] = ((w21 >> 60) | (w22 << 4)) & mask; - out[28] = ((w22 >> 48) | (w23 << 16)) & mask; - out[29] = ((w23 >> 36) | (w24 << 28)) & mask; 
- out[30] = ((w24 >> 24) | (w25 << 40)) & mask; - out[31] = w25 >> 12; - - return in; -} - -inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 9007199254740991ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = 
arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 53) | (w1 << 11)) & mask; - out[2] = ((w1 >> 42) | (w2 << 22)) & mask; - out[3] = ((w2 >> 31) | (w3 << 33)) & mask; - out[4] = ((w3 >> 20) | (w4 << 44)) & mask; - out[5] = (w4 >> 9) & mask; - out[6] = ((w4 >> 62) | (w5 << 2)) & mask; - out[7] = ((w5 >> 51) | (w6 << 13)) & mask; - out[8] = ((w6 >> 40) | (w7 << 24)) & mask; - out[9] = ((w7 >> 29) | (w8 << 35)) & mask; - out[10] = ((w8 >> 18) | (w9 << 46)) & mask; - out[11] = (w9 >> 7) & mask; - out[12] = ((w9 >> 60) | (w10 << 4)) & mask; - out[13] = ((w10 >> 49) | (w11 << 15)) & mask; - out[14] = ((w11 >> 38) | (w12 << 26)) & mask; - out[15] = ((w12 >> 27) | (w13 << 37)) & mask; - out[16] = ((w13 >> 16) | (w14 << 48)) & mask; - out[17] = (w14 >> 5) & mask; - out[18] = ((w14 >> 58) | (w15 << 6)) & mask; - out[19] = ((w15 >> 47) | (w16 << 17)) & mask; - out[20] = ((w16 >> 36) | (w17 << 28)) & mask; - out[21] = ((w17 >> 25) | (w18 << 39)) & mask; - out[22] = ((w18 >> 14) | (w19 << 50)) & mask; - out[23] = (w19 >> 3) & mask; - out[24] = ((w19 >> 56) | (w20 << 8)) & mask; - out[25] = ((w20 >> 45) | (w21 << 19)) & mask; - out[26] = ((w21 >> 34) | (w22 << 30)) & 
mask; - out[27] = ((w22 >> 23) | (w23 << 41)) & mask; - out[28] = ((w23 >> 12) | (w24 << 52)) & mask; - out[29] = (w24 >> 1) & mask; - out[30] = ((w24 >> 54) | (w25 << 10)) & mask; - out[31] = ((w25 >> 43) | (w26 << 21)) & mask; - - return in; -} - -inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 18014398509481983ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 
8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 54) | (w1 << 10)) & mask; - out[2] = ((w1 >> 44) | (w2 << 20)) & mask; - out[3] = ((w2 >> 34) | (w3 << 30)) & mask; - out[4] = ((w3 >> 24) | (w4 << 40)) & mask; - out[5] = ((w4 >> 14) | (w5 << 50)) & mask; - out[6] = (w5 >> 4) & mask; - out[7] = ((w5 >> 58) | (w6 << 6)) & mask; - out[8] = ((w6 >> 48) | (w7 << 16)) & mask; - out[9] = ((w7 >> 38) | (w8 << 26)) & mask; - out[10] = ((w8 >> 28) | (w9 << 36)) & mask; - out[11] = ((w9 >> 18) | (w10 << 46)) & mask; - out[12] = (w10 >> 8) & mask; - out[13] = ((w10 >> 62) | (w11 << 2)) & mask; - out[14] = ((w11 >> 52) | (w12 << 12)) & mask; - out[15] = ((w12 >> 42) | (w13 << 22)) & mask; - out[16] = ((w13 >> 32) | (w14 << 32)) & mask; - out[17] = ((w14 >> 22) | (w15 << 42)) & mask; - out[18] = ((w15 >> 12) | (w16 << 52)) & mask; - out[19] = (w16 >> 2) & mask; - out[20] = ((w16 >> 56) | (w17 << 8)) & mask; - out[21] = ((w17 >> 46) | (w18 << 18)) & mask; - out[22] = ((w18 >> 36) | (w19 << 28)) & mask; - out[23] = ((w19 >> 
26) | (w20 << 38)) & mask; - out[24] = ((w20 >> 16) | (w21 << 48)) & mask; - out[25] = (w21 >> 6) & mask; - out[26] = ((w21 >> 60) | (w22 << 4)) & mask; - out[27] = ((w22 >> 50) | (w23 << 14)) & mask; - out[28] = ((w23 >> 40) | (w24 << 24)) & mask; - out[29] = ((w24 >> 30) | (w25 << 34)) & mask; - out[30] = ((w25 >> 20) | (w26 << 44)) & mask; - out[31] = w26 >> 10; - - return in; -} - -inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 36028797018963967ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = 
arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 55) | (w1 << 9)) & mask; - out[2] = ((w1 >> 46) | (w2 << 18)) & mask; - out[3] = ((w2 >> 37) | (w3 << 27)) & mask; - out[4] = ((w3 >> 28) | (w4 << 36)) & mask; - out[5] = ((w4 >> 19) | (w5 << 45)) & mask; - out[6] = ((w5 >> 10) | (w6 << 54)) & mask; - out[7] = (w6 >> 1) & mask; - out[8] = ((w6 >> 56) | (w7 << 8)) & mask; - out[9] = ((w7 >> 47) | (w8 << 17)) & mask; - out[10] = ((w8 >> 38) | (w9 << 26)) & mask; - out[11] = ((w9 >> 29) | (w10 << 35)) & mask; - out[12] = ((w10 >> 20) | (w11 << 44)) & mask; - out[13] = ((w11 >> 11) | (w12 << 53)) & mask; - out[14] = (w12 >> 2) & mask; - out[15] = ((w12 >> 57) | (w13 << 7)) & mask; - out[16] = ((w13 >> 48) | (w14 << 16)) & mask; - out[17] = ((w14 >> 39) | (w15 << 25)) & mask; 
- out[18] = ((w15 >> 30) | (w16 << 34)) & mask; - out[19] = ((w16 >> 21) | (w17 << 43)) & mask; - out[20] = ((w17 >> 12) | (w18 << 52)) & mask; - out[21] = (w18 >> 3) & mask; - out[22] = ((w18 >> 58) | (w19 << 6)) & mask; - out[23] = ((w19 >> 49) | (w20 << 15)) & mask; - out[24] = ((w20 >> 40) | (w21 << 24)) & mask; - out[25] = ((w21 >> 31) | (w22 << 33)) & mask; - out[26] = ((w22 >> 22) | (w23 << 42)) & mask; - out[27] = ((w23 >> 13) | (w24 << 51)) & mask; - out[28] = (w24 >> 4) & mask; - out[29] = ((w24 >> 59) | (w25 << 5)) & mask; - out[30] = ((w25 >> 50) | (w26 << 14)) & mask; - out[31] = ((w26 >> 41) | (w27 << 23)) & mask; - - return in; -} - -inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 72057594037927935ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = 
arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 56) | (w1 << 8)) & mask; - out[2] = ((w1 >> 48) | (w2 << 16)) & mask; - out[3] = ((w2 >> 40) | (w3 << 24)) & mask; - out[4] = ((w3 >> 32) | (w4 << 32)) & mask; - out[5] = ((w4 >> 24) | (w5 << 40)) & mask; - out[6] = ((w5 >> 16) | (w6 << 48)) & mask; - out[7] = w6 >> 8; - out[8] = (w7)&mask; - out[9] = ((w7 >> 56) | (w8 << 8)) & mask; - out[10] = ((w8 >> 48) | (w9 << 16)) & mask; - out[11] = ((w9 >> 40) | (w10 << 24)) & mask; - 
out[12] = ((w10 >> 32) | (w11 << 32)) & mask; - out[13] = ((w11 >> 24) | (w12 << 40)) & mask; - out[14] = ((w12 >> 16) | (w13 << 48)) & mask; - out[15] = w13 >> 8; - out[16] = (w14)&mask; - out[17] = ((w14 >> 56) | (w15 << 8)) & mask; - out[18] = ((w15 >> 48) | (w16 << 16)) & mask; - out[19] = ((w16 >> 40) | (w17 << 24)) & mask; - out[20] = ((w17 >> 32) | (w18 << 32)) & mask; - out[21] = ((w18 >> 24) | (w19 << 40)) & mask; - out[22] = ((w19 >> 16) | (w20 << 48)) & mask; - out[23] = w20 >> 8; - out[24] = (w21)&mask; - out[25] = ((w21 >> 56) | (w22 << 8)) & mask; - out[26] = ((w22 >> 48) | (w23 << 16)) & mask; - out[27] = ((w23 >> 40) | (w24 << 24)) & mask; - out[28] = ((w24 >> 32) | (w25 << 32)) & mask; - out[29] = ((w25 >> 24) | (w26 << 40)) & mask; - out[30] = ((w26 >> 16) | (w27 << 48)) & mask; - out[31] = w27 >> 8; - - return in; -} - -inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 144115188075855871ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = 
arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 57) | (w1 << 7)) & mask; - out[2] = ((w1 >> 50) | (w2 << 14)) & mask; - out[3] = ((w2 >> 43) | (w3 << 21)) & mask; - out[4] = ((w3 >> 36) | (w4 
<< 28)) & mask; - out[5] = ((w4 >> 29) | (w5 << 35)) & mask; - out[6] = ((w5 >> 22) | (w6 << 42)) & mask; - out[7] = ((w6 >> 15) | (w7 << 49)) & mask; - out[8] = ((w7 >> 8) | (w8 << 56)) & mask; - out[9] = (w8 >> 1) & mask; - out[10] = ((w8 >> 58) | (w9 << 6)) & mask; - out[11] = ((w9 >> 51) | (w10 << 13)) & mask; - out[12] = ((w10 >> 44) | (w11 << 20)) & mask; - out[13] = ((w11 >> 37) | (w12 << 27)) & mask; - out[14] = ((w12 >> 30) | (w13 << 34)) & mask; - out[15] = ((w13 >> 23) | (w14 << 41)) & mask; - out[16] = ((w14 >> 16) | (w15 << 48)) & mask; - out[17] = ((w15 >> 9) | (w16 << 55)) & mask; - out[18] = (w16 >> 2) & mask; - out[19] = ((w16 >> 59) | (w17 << 5)) & mask; - out[20] = ((w17 >> 52) | (w18 << 12)) & mask; - out[21] = ((w18 >> 45) | (w19 << 19)) & mask; - out[22] = ((w19 >> 38) | (w20 << 26)) & mask; - out[23] = ((w20 >> 31) | (w21 << 33)) & mask; - out[24] = ((w21 >> 24) | (w22 << 40)) & mask; - out[25] = ((w22 >> 17) | (w23 << 47)) & mask; - out[26] = ((w23 >> 10) | (w24 << 54)) & mask; - out[27] = (w24 >> 3) & mask; - out[28] = ((w24 >> 60) | (w25 << 4)) & mask; - out[29] = ((w25 >> 53) | (w26 << 11)) & mask; - out[30] = ((w26 >> 46) | (w27 << 18)) & mask; - out[31] = ((w27 >> 39) | (w28 << 25)) & mask; - - return in; -} - -inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 288230376151711743ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = 
arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 
8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 58) | (w1 << 6)) & mask; - out[2] = ((w1 >> 52) | (w2 << 12)) & mask; - out[3] = ((w2 >> 46) | (w3 << 18)) & mask; - out[4] = ((w3 >> 40) | (w4 << 24)) & mask; - out[5] = ((w4 >> 34) | (w5 << 30)) & mask; - out[6] = ((w5 >> 28) | (w6 << 36)) & mask; - out[7] = ((w6 >> 22) | (w7 << 42)) & mask; - out[8] = ((w7 >> 16) | (w8 << 48)) & mask; - out[9] = ((w8 >> 10) | (w9 << 54)) & mask; - out[10] = (w9 >> 4) & mask; - out[11] = ((w9 >> 62) | (w10 << 2)) & mask; - out[12] = ((w10 >> 56) | (w11 << 8)) & mask; - out[13] = ((w11 >> 50) | (w12 << 14)) & mask; - out[14] = ((w12 >> 44) | (w13 << 20)) & mask; - out[15] = ((w13 >> 38) | (w14 << 26)) & mask; - out[16] = ((w14 >> 32) | (w15 << 32)) & mask; - out[17] = ((w15 >> 26) | (w16 << 38)) & mask; - out[18] = ((w16 >> 20) | (w17 << 44)) & mask; - out[19] = ((w17 >> 14) | (w18 << 50)) & mask; - out[20] = ((w18 >> 8) | (w19 << 56)) & mask; - out[21] = (w19 >> 2) & mask; - out[22] = ((w19 >> 60) | (w20 << 4)) & mask; - out[23] = ((w20 >> 54) | (w21 << 10)) & mask; - out[24] = ((w21 >> 48) | (w22 << 16)) & mask; - out[25] = ((w22 >> 42) | (w23 << 22)) & mask; - out[26] = ((w23 >> 36) | (w24 << 28)) & mask; - out[27] = ((w24 >> 30) | (w25 << 34)) & mask; - out[28] = ((w25 >> 24) | (w26 << 40)) & mask; - out[29] = ((w26 >> 18) | (w27 << 46)) & mask; - out[30] = ((w27 >> 12) | (w28 << 52)) & mask; - out[31] = w28 >> 6; - - return in; -} - -inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 576460752303423487ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = 
arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t 
w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 59) | (w1 << 5)) & mask; - out[2] = ((w1 >> 54) | (w2 << 10)) & mask; - out[3] = ((w2 >> 49) | (w3 << 15)) & mask; - out[4] = ((w3 >> 44) | (w4 << 20)) & mask; - out[5] = ((w4 >> 39) | (w5 << 25)) & mask; - out[6] = ((w5 >> 34) | (w6 << 30)) & mask; - out[7] = ((w6 >> 29) | (w7 << 35)) & mask; - out[8] = ((w7 >> 24) | (w8 << 40)) & mask; - out[9] = ((w8 >> 19) | (w9 << 45)) & mask; - out[10] = ((w9 >> 14) | (w10 << 50)) & mask; - out[11] = ((w10 >> 9) | (w11 << 55)) & mask; - out[12] = (w11 >> 4) & mask; - out[13] = ((w11 >> 63) | (w12 << 1)) & mask; - out[14] = ((w12 >> 58) | (w13 << 6)) & mask; - out[15] = ((w13 >> 53) | (w14 << 11)) & mask; - out[16] = ((w14 >> 48) | (w15 << 16)) & mask; - out[17] = ((w15 >> 43) | (w16 << 21)) & mask; - out[18] = ((w16 >> 38) | (w17 << 26)) & mask; - out[19] = ((w17 >> 33) | (w18 << 31)) & mask; - out[20] = ((w18 >> 28) | (w19 << 36)) & mask; - out[21] = ((w19 >> 23) | (w20 << 41)) & mask; - out[22] = ((w20 >> 18) | (w21 << 46)) & mask; - out[23] = ((w21 >> 13) | (w22 << 51)) & mask; - out[24] = ((w22 >> 8) | (w23 << 56)) & mask; - out[25] = (w23 >> 3) & mask; - out[26] = ((w23 >> 62) | (w24 << 2)) & mask; - out[27] = ((w24 >> 57) | (w25 << 7)) & mask; - out[28] = ((w25 >> 52) | (w26 << 12)) & mask; - out[29] = ((w26 >> 47) | 
(w27 << 17)) & mask; - out[30] = ((w27 >> 42) | (w28 << 22)) & mask; - out[31] = ((w28 >> 37) | (w29 << 27)) & mask; - - return in; -} - -inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1152921504606846975ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - 
uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 60) | (w1 << 4)) & mask; - out[2] = ((w1 >> 56) | (w2 << 8)) & mask; - out[3] = ((w2 >> 52) | (w3 << 12)) & mask; - out[4] = ((w3 >> 48) | (w4 << 16)) & mask; - out[5] = ((w4 >> 44) | (w5 << 20)) & mask; - out[6] = ((w5 >> 40) | (w6 << 24)) & mask; - out[7] = ((w6 >> 36) | (w7 << 28)) & mask; - out[8] = ((w7 >> 32) | (w8 << 32)) & mask; - out[9] = ((w8 >> 28) | (w9 << 36)) & mask; - out[10] = ((w9 >> 24) | (w10 << 40)) & mask; - out[11] = ((w10 >> 20) | (w11 << 44)) & mask; - out[12] = ((w11 >> 16) | (w12 << 48)) & mask; - out[13] = ((w12 >> 12) | (w13 << 52)) & mask; - out[14] = ((w13 >> 8) | (w14 << 56)) & mask; - out[15] = w14 >> 4; - out[16] = (w15)&mask; - out[17] = ((w15 >> 60) | (w16 << 4)) & mask; - out[18] = ((w16 >> 56) | (w17 << 8)) & mask; - out[19] = ((w17 >> 
52) | (w18 << 12)) & mask; - out[20] = ((w18 >> 48) | (w19 << 16)) & mask; - out[21] = ((w19 >> 44) | (w20 << 20)) & mask; - out[22] = ((w20 >> 40) | (w21 << 24)) & mask; - out[23] = ((w21 >> 36) | (w22 << 28)) & mask; - out[24] = ((w22 >> 32) | (w23 << 32)) & mask; - out[25] = ((w23 >> 28) | (w24 << 36)) & mask; - out[26] = ((w24 >> 24) | (w25 << 40)) & mask; - out[27] = ((w25 >> 20) | (w26 << 44)) & mask; - out[28] = ((w26 >> 16) | (w27 << 48)) & mask; - out[29] = ((w27 >> 12) | (w28 << 52)) & mask; - out[30] = ((w28 >> 8) | (w29 << 56)) & mask; - out[31] = w29 >> 4; - - return in; -} - -inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2305843009213693951ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - 
w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - uint64_t w30 = util::SafeLoadAs(in); - w30 = arrow::bit_util::FromLittleEndian(w30); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 61) | (w1 << 3)) & mask; - out[2] = ((w1 >> 58) | (w2 << 6)) & mask; - out[3] = ((w2 >> 55) | (w3 << 9)) & mask; - out[4] = ((w3 >> 52) | (w4 << 12)) & mask; - out[5] = ((w4 >> 49) | (w5 << 15)) & mask; - out[6] = ((w5 >> 46) | (w6 << 
18)) & mask; - out[7] = ((w6 >> 43) | (w7 << 21)) & mask; - out[8] = ((w7 >> 40) | (w8 << 24)) & mask; - out[9] = ((w8 >> 37) | (w9 << 27)) & mask; - out[10] = ((w9 >> 34) | (w10 << 30)) & mask; - out[11] = ((w10 >> 31) | (w11 << 33)) & mask; - out[12] = ((w11 >> 28) | (w12 << 36)) & mask; - out[13] = ((w12 >> 25) | (w13 << 39)) & mask; - out[14] = ((w13 >> 22) | (w14 << 42)) & mask; - out[15] = ((w14 >> 19) | (w15 << 45)) & mask; - out[16] = ((w15 >> 16) | (w16 << 48)) & mask; - out[17] = ((w16 >> 13) | (w17 << 51)) & mask; - out[18] = ((w17 >> 10) | (w18 << 54)) & mask; - out[19] = ((w18 >> 7) | (w19 << 57)) & mask; - out[20] = ((w19 >> 4) | (w20 << 60)) & mask; - out[21] = (w20 >> 1) & mask; - out[22] = ((w20 >> 62) | (w21 << 2)) & mask; - out[23] = ((w21 >> 59) | (w22 << 5)) & mask; - out[24] = ((w22 >> 56) | (w23 << 8)) & mask; - out[25] = ((w23 >> 53) | (w24 << 11)) & mask; - out[26] = ((w24 >> 50) | (w25 << 14)) & mask; - out[27] = ((w25 >> 47) | (w26 << 17)) & mask; - out[28] = ((w26 >> 44) | (w27 << 20)) & mask; - out[29] = ((w27 >> 41) | (w28 << 23)) & mask; - out[30] = ((w28 >> 38) | (w29 << 26)) & mask; - out[31] = ((w29 >> 35) | (w30 << 29)) & mask; - - return in; -} - -inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4611686018427387903ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 
= util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = 
arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - uint64_t w30 = util::SafeLoadAs(in); - w30 = arrow::bit_util::FromLittleEndian(w30); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 62) | (w1 << 2)) & mask; - out[2] = ((w1 >> 60) | (w2 << 4)) & mask; - out[3] = ((w2 >> 58) | (w3 << 6)) & mask; - out[4] = ((w3 >> 56) | (w4 << 8)) & mask; - out[5] = ((w4 >> 54) | (w5 << 10)) & mask; - out[6] = ((w5 >> 52) | (w6 << 12)) & mask; - out[7] = ((w6 >> 50) | (w7 << 14)) & mask; - out[8] = ((w7 >> 48) | (w8 << 16)) & mask; - out[9] = ((w8 >> 46) | (w9 << 18)) & mask; - out[10] = ((w9 >> 44) | (w10 << 20)) & mask; - out[11] = ((w10 >> 42) | (w11 << 22)) & mask; - out[12] = ((w11 >> 40) | (w12 << 24)) & mask; - out[13] = ((w12 >> 38) | (w13 << 26)) & mask; - out[14] = ((w13 >> 36) | (w14 << 28)) & mask; - out[15] = ((w14 >> 34) | (w15 << 30)) & mask; - out[16] = ((w15 >> 32) | (w16 << 32)) & mask; - out[17] = ((w16 >> 30) | (w17 << 34)) & mask; - out[18] = ((w17 >> 28) | (w18 << 36)) & mask; - out[19] = ((w18 >> 26) | (w19 << 38)) & mask; - out[20] = ((w19 >> 24) | (w20 << 40)) & mask; - out[21] = ((w20 >> 22) | (w21 << 42)) & mask; - out[22] = ((w21 >> 20) | (w22 << 44)) & mask; - out[23] = ((w22 >> 18) | (w23 << 46)) & mask; - out[24] = ((w23 >> 16) | (w24 << 48)) & mask; - out[25] = ((w24 >> 14) | (w25 << 50)) & mask; - out[26] = ((w25 >> 12) | (w26 << 52)) & mask; - out[27] = ((w26 >> 10) | (w27 << 54)) & mask; - out[28] = ((w27 >> 8) | (w28 << 56)) & mask; - out[29] = ((w28 >> 6) | (w29 << 58)) & mask; - out[30] = ((w29 >> 4) | (w30 << 60)) & mask; - out[31] = w30 >> 2; - - return in; -} - -inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 9223372036854775807ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = 
arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = 
util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - uint64_t w30 = util::SafeLoadAs(in); - w30 = arrow::bit_util::FromLittleEndian(w30); - in += 8; - uint64_t w31 = util::SafeLoadAs(in); - w31 = arrow::bit_util::FromLittleEndian(w31); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 63) | (w1 << 1)) & mask; - out[2] = ((w1 >> 62) | (w2 << 2)) & mask; - out[3] = ((w2 >> 61) | (w3 << 3)) & mask; - out[4] = ((w3 >> 60) | (w4 << 4)) & mask; - out[5] = ((w4 >> 59) | (w5 << 5)) & mask; - out[6] = ((w5 >> 58) | (w6 << 6)) & mask; - out[7] = ((w6 >> 57) | (w7 << 7)) & mask; - out[8] = ((w7 >> 56) | (w8 << 8)) & mask; - out[9] = ((w8 >> 55) | (w9 << 9)) & mask; - out[10] = ((w9 >> 54) | (w10 << 10)) & mask; - out[11] = ((w10 >> 53) | (w11 << 11)) & mask; - out[12] = ((w11 >> 52) | (w12 << 12)) & mask; - out[13] = ((w12 >> 51) | (w13 << 13)) & mask; - out[14] = ((w13 >> 50) | (w14 << 14)) & mask; - out[15] = ((w14 >> 49) | (w15 << 15)) & mask; - out[16] = ((w15 >> 48) | (w16 << 16)) & mask; - out[17] = ((w16 >> 47) | (w17 << 17)) & mask; - out[18] = ((w17 >> 46) | (w18 << 18)) & mask; - out[19] = ((w18 >> 45) | (w19 << 19)) & mask; - out[20] = ((w19 >> 44) | (w20 << 20)) & 
mask; - out[21] = ((w20 >> 43) | (w21 << 21)) & mask; - out[22] = ((w21 >> 42) | (w22 << 22)) & mask; - out[23] = ((w22 >> 41) | (w23 << 23)) & mask; - out[24] = ((w23 >> 40) | (w24 << 24)) & mask; - out[25] = ((w24 >> 39) | (w25 << 25)) & mask; - out[26] = ((w25 >> 38) | (w26 << 26)) & mask; - out[27] = ((w26 >> 37) | (w27 << 27)) & mask; - out[28] = ((w27 >> 36) | (w28 << 28)) & mask; - out[29] = ((w28 >> 35) | (w29 << 29)) & mask; - out[30] = ((w29 >> 34) | (w30 << 30)) & mask; - out[31] = ((w30 >> 33) | (w31 << 31)) & mask; - - return in; -} - -inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out) { - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = 
util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - uint64_t w30 = util::SafeLoadAs(in); - w30 = arrow::bit_util::FromLittleEndian(w30); - in += 8; - uint64_t w31 = util::SafeLoadAs(in); - w31 = arrow::bit_util::FromLittleEndian(w31); - in += 8; - out[0] = w0; - out[1] = w1; - out[2] = w2; - out[3] = w3; - out[4] = w4; - out[5] = w5; - out[6] = w6; - out[7] = w7; - out[8] = w8; - out[9] = w9; - out[10] = w10; - out[11] = w11; - out[12] = w12; - out[13] = w13; - out[14] = w14; - out[15] = w15; 
- out[16] = w16; - out[17] = w17; - out[18] = w18; - out[19] = w19; - out[20] = w20; - out[21] = w21; - out[22] = w22; - out[23] = w23; - out[24] = w24; - out[25] = w25; - out[26] = w26; - out[27] = w27; - out[28] = w28; - out[29] = w29; - out[30] = w30; - out[31] = w31; - - return in; -} - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking64_default_internal.h b/cpp/src/arrow/util/bpacking64_default_internal.h new file mode 100644 index 00000000000..256cdda87e3 --- /dev/null +++ b/cpp/src/arrow/util/bpacking64_default_internal.h @@ -0,0 +1,5640 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file was generated by script which is modified from its original version in +// GitHub. Original source: +// https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py +// The original copyright notice follows. + +// This code is released under the +// Apache License Version 2.0 http://www.apache.org/licenses/. 
+// (c) Daniel Lemire 2013 + +#pragma once + +#include "arrow/util/endian.h" +#include "arrow/util/ubsan.h" + +namespace arrow::internal { + +inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) { + for (int k = 0; k < 32; k += 1) { + out[k] = 0; + } + return in; +} + +inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 1ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 1) & mask; + out[2] = (w0 >> 2) & mask; + out[3] = (w0 >> 3) & mask; + out[4] = (w0 >> 4) & mask; + out[5] = (w0 >> 5) & mask; + out[6] = (w0 >> 6) & mask; + out[7] = (w0 >> 7) & mask; + out[8] = (w0 >> 8) & mask; + out[9] = (w0 >> 9) & mask; + out[10] = (w0 >> 10) & mask; + out[11] = (w0 >> 11) & mask; + out[12] = (w0 >> 12) & mask; + out[13] = (w0 >> 13) & mask; + out[14] = (w0 >> 14) & mask; + out[15] = (w0 >> 15) & mask; + out[16] = (w0 >> 16) & mask; + out[17] = (w0 >> 17) & mask; + out[18] = (w0 >> 18) & mask; + out[19] = (w0 >> 19) & mask; + out[20] = (w0 >> 20) & mask; + out[21] = (w0 >> 21) & mask; + out[22] = (w0 >> 22) & mask; + out[23] = (w0 >> 23) & mask; + out[24] = (w0 >> 24) & mask; + out[25] = (w0 >> 25) & mask; + out[26] = (w0 >> 26) & mask; + out[27] = (w0 >> 27) & mask; + out[28] = (w0 >> 28) & mask; + out[29] = (w0 >> 29) & mask; + out[30] = (w0 >> 30) & mask; + out[31] = (w0 >> 31) & mask; + + return in; +} + +inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 3ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 2) & mask; + out[2] = (w0 >> 4) & mask; + out[3] = (w0 >> 6) & mask; + out[4] = (w0 >> 8) & mask; + out[5] = (w0 >> 10) & mask; + out[6] = (w0 >> 12) & mask; + out[7] = (w0 >> 14) & mask; + out[8] = (w0 >> 16) & mask; + out[9] = (w0 >> 18) & mask; + out[10] = (w0 >> 20) & mask; + out[11] = (w0 >> 22) 
& mask; + out[12] = (w0 >> 24) & mask; + out[13] = (w0 >> 26) & mask; + out[14] = (w0 >> 28) & mask; + out[15] = (w0 >> 30) & mask; + out[16] = (w0 >> 32) & mask; + out[17] = (w0 >> 34) & mask; + out[18] = (w0 >> 36) & mask; + out[19] = (w0 >> 38) & mask; + out[20] = (w0 >> 40) & mask; + out[21] = (w0 >> 42) & mask; + out[22] = (w0 >> 44) & mask; + out[23] = (w0 >> 46) & mask; + out[24] = (w0 >> 48) & mask; + out[25] = (w0 >> 50) & mask; + out[26] = (w0 >> 52) & mask; + out[27] = (w0 >> 54) & mask; + out[28] = (w0 >> 56) & mask; + out[29] = (w0 >> 58) & mask; + out[30] = (w0 >> 60) & mask; + out[31] = w0 >> 62; + + return in; +} + +inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 7ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 3) & mask; + out[2] = (w0 >> 6) & mask; + out[3] = (w0 >> 9) & mask; + out[4] = (w0 >> 12) & mask; + out[5] = (w0 >> 15) & mask; + out[6] = (w0 >> 18) & mask; + out[7] = (w0 >> 21) & mask; + out[8] = (w0 >> 24) & mask; + out[9] = (w0 >> 27) & mask; + out[10] = (w0 >> 30) & mask; + out[11] = (w0 >> 33) & mask; + out[12] = (w0 >> 36) & mask; + out[13] = (w0 >> 39) & mask; + out[14] = (w0 >> 42) & mask; + out[15] = (w0 >> 45) & mask; + out[16] = (w0 >> 48) & mask; + out[17] = (w0 >> 51) & mask; + out[18] = (w0 >> 54) & mask; + out[19] = (w0 >> 57) & mask; + out[20] = (w0 >> 60) & mask; + out[21] = ((w0 >> 63) | (w1 << 1)) & mask; + out[22] = (w1 >> 2) & mask; + out[23] = (w1 >> 5) & mask; + out[24] = (w1 >> 8) & mask; + out[25] = (w1 >> 11) & mask; + out[26] = (w1 >> 14) & mask; + out[27] = (w1 >> 17) & mask; + out[28] = (w1 >> 20) & mask; + out[29] = (w1 >> 23) & mask; + out[30] = (w1 >> 26) & mask; + out[31] = (w1 >> 29) & mask; + + return in; +} + +inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out) 
{ + const uint64_t mask = 15ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 4) & mask; + out[2] = (w0 >> 8) & mask; + out[3] = (w0 >> 12) & mask; + out[4] = (w0 >> 16) & mask; + out[5] = (w0 >> 20) & mask; + out[6] = (w0 >> 24) & mask; + out[7] = (w0 >> 28) & mask; + out[8] = (w0 >> 32) & mask; + out[9] = (w0 >> 36) & mask; + out[10] = (w0 >> 40) & mask; + out[11] = (w0 >> 44) & mask; + out[12] = (w0 >> 48) & mask; + out[13] = (w0 >> 52) & mask; + out[14] = (w0 >> 56) & mask; + out[15] = w0 >> 60; + out[16] = (w1)&mask; + out[17] = (w1 >> 4) & mask; + out[18] = (w1 >> 8) & mask; + out[19] = (w1 >> 12) & mask; + out[20] = (w1 >> 16) & mask; + out[21] = (w1 >> 20) & mask; + out[22] = (w1 >> 24) & mask; + out[23] = (w1 >> 28) & mask; + out[24] = (w1 >> 32) & mask; + out[25] = (w1 >> 36) & mask; + out[26] = (w1 >> 40) & mask; + out[27] = (w1 >> 44) & mask; + out[28] = (w1 >> 48) & mask; + out[29] = (w1 >> 52) & mask; + out[30] = (w1 >> 56) & mask; + out[31] = w1 >> 60; + + return in; +} + +inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 31ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 5) & mask; + out[2] = (w0 >> 10) & mask; + out[3] = (w0 >> 15) & mask; + out[4] = (w0 >> 20) & mask; + out[5] = (w0 >> 25) & mask; + out[6] = (w0 >> 30) & mask; + out[7] = (w0 >> 35) & mask; + out[8] = (w0 >> 40) & mask; + out[9] = (w0 >> 45) & mask; + out[10] = (w0 >> 50) & mask; + out[11] = (w0 >> 55) & mask; + out[12] = ((w0 >> 60) | (w1 << 4)) & mask; + out[13] = (w1 >> 1) & 
mask; + out[14] = (w1 >> 6) & mask; + out[15] = (w1 >> 11) & mask; + out[16] = (w1 >> 16) & mask; + out[17] = (w1 >> 21) & mask; + out[18] = (w1 >> 26) & mask; + out[19] = (w1 >> 31) & mask; + out[20] = (w1 >> 36) & mask; + out[21] = (w1 >> 41) & mask; + out[22] = (w1 >> 46) & mask; + out[23] = (w1 >> 51) & mask; + out[24] = (w1 >> 56) & mask; + out[25] = ((w1 >> 61) | (w2 << 3)) & mask; + out[26] = (w2 >> 2) & mask; + out[27] = (w2 >> 7) & mask; + out[28] = (w2 >> 12) & mask; + out[29] = (w2 >> 17) & mask; + out[30] = (w2 >> 22) & mask; + out[31] = (w2 >> 27) & mask; + + return in; +} + +inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 63ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 6) & mask; + out[2] = (w0 >> 12) & mask; + out[3] = (w0 >> 18) & mask; + out[4] = (w0 >> 24) & mask; + out[5] = (w0 >> 30) & mask; + out[6] = (w0 >> 36) & mask; + out[7] = (w0 >> 42) & mask; + out[8] = (w0 >> 48) & mask; + out[9] = (w0 >> 54) & mask; + out[10] = ((w0 >> 60) | (w1 << 4)) & mask; + out[11] = (w1 >> 2) & mask; + out[12] = (w1 >> 8) & mask; + out[13] = (w1 >> 14) & mask; + out[14] = (w1 >> 20) & mask; + out[15] = (w1 >> 26) & mask; + out[16] = (w1 >> 32) & mask; + out[17] = (w1 >> 38) & mask; + out[18] = (w1 >> 44) & mask; + out[19] = (w1 >> 50) & mask; + out[20] = (w1 >> 56) & mask; + out[21] = ((w1 >> 62) | (w2 << 2)) & mask; + out[22] = (w2 >> 4) & mask; + out[23] = (w2 >> 10) & mask; + out[24] = (w2 >> 16) & mask; + out[25] = (w2 >> 22) & mask; + out[26] = (w2 >> 28) & mask; + out[27] = (w2 >> 34) & mask; + out[28] = (w2 >> 40) & mask; + out[29] = (w2 >> 46) & mask; + out[30] = (w2 >> 52) & mask; + out[31] = w2 >> 58; + + return in; +} + +inline 
const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 127ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 7) & mask; + out[2] = (w0 >> 14) & mask; + out[3] = (w0 >> 21) & mask; + out[4] = (w0 >> 28) & mask; + out[5] = (w0 >> 35) & mask; + out[6] = (w0 >> 42) & mask; + out[7] = (w0 >> 49) & mask; + out[8] = (w0 >> 56) & mask; + out[9] = ((w0 >> 63) | (w1 << 1)) & mask; + out[10] = (w1 >> 6) & mask; + out[11] = (w1 >> 13) & mask; + out[12] = (w1 >> 20) & mask; + out[13] = (w1 >> 27) & mask; + out[14] = (w1 >> 34) & mask; + out[15] = (w1 >> 41) & mask; + out[16] = (w1 >> 48) & mask; + out[17] = (w1 >> 55) & mask; + out[18] = ((w1 >> 62) | (w2 << 2)) & mask; + out[19] = (w2 >> 5) & mask; + out[20] = (w2 >> 12) & mask; + out[21] = (w2 >> 19) & mask; + out[22] = (w2 >> 26) & mask; + out[23] = (w2 >> 33) & mask; + out[24] = (w2 >> 40) & mask; + out[25] = (w2 >> 47) & mask; + out[26] = (w2 >> 54) & mask; + out[27] = ((w2 >> 61) | (w3 << 3)) & mask; + out[28] = (w3 >> 4) & mask; + out[29] = (w3 >> 11) & mask; + out[30] = (w3 >> 18) & mask; + out[31] = (w3 >> 25) & mask; + + return in; +} + +inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 255ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + out[0] = 
(w0)&mask; + out[1] = (w0 >> 8) & mask; + out[2] = (w0 >> 16) & mask; + out[3] = (w0 >> 24) & mask; + out[4] = (w0 >> 32) & mask; + out[5] = (w0 >> 40) & mask; + out[6] = (w0 >> 48) & mask; + out[7] = w0 >> 56; + out[8] = (w1)&mask; + out[9] = (w1 >> 8) & mask; + out[10] = (w1 >> 16) & mask; + out[11] = (w1 >> 24) & mask; + out[12] = (w1 >> 32) & mask; + out[13] = (w1 >> 40) & mask; + out[14] = (w1 >> 48) & mask; + out[15] = w1 >> 56; + out[16] = (w2)&mask; + out[17] = (w2 >> 8) & mask; + out[18] = (w2 >> 16) & mask; + out[19] = (w2 >> 24) & mask; + out[20] = (w2 >> 32) & mask; + out[21] = (w2 >> 40) & mask; + out[22] = (w2 >> 48) & mask; + out[23] = w2 >> 56; + out[24] = (w3)&mask; + out[25] = (w3 >> 8) & mask; + out[26] = (w3 >> 16) & mask; + out[27] = (w3 >> 24) & mask; + out[28] = (w3 >> 32) & mask; + out[29] = (w3 >> 40) & mask; + out[30] = (w3 >> 48) & mask; + out[31] = w3 >> 56; + + return in; +} + +inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 511ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 9) & mask; + out[2] = (w0 >> 18) & mask; + out[3] = (w0 >> 27) & mask; + out[4] = (w0 >> 36) & mask; + out[5] = (w0 >> 45) & mask; + out[6] = (w0 >> 54) & mask; + out[7] = ((w0 >> 63) | (w1 << 1)) & mask; + out[8] = (w1 >> 8) & mask; + out[9] = (w1 >> 17) & mask; + out[10] = (w1 >> 26) & mask; + out[11] = (w1 >> 35) & mask; + out[12] = (w1 >> 44) & mask; + out[13] = (w1 >> 53) & mask; + out[14] = ((w1 >> 62) | (w2 << 2)) & mask; + out[15] = (w2 >> 7) & mask; 
+ out[16] = (w2 >> 16) & mask; + out[17] = (w2 >> 25) & mask; + out[18] = (w2 >> 34) & mask; + out[19] = (w2 >> 43) & mask; + out[20] = (w2 >> 52) & mask; + out[21] = ((w2 >> 61) | (w3 << 3)) & mask; + out[22] = (w3 >> 6) & mask; + out[23] = (w3 >> 15) & mask; + out[24] = (w3 >> 24) & mask; + out[25] = (w3 >> 33) & mask; + out[26] = (w3 >> 42) & mask; + out[27] = (w3 >> 51) & mask; + out[28] = ((w3 >> 60) | (w4 << 4)) & mask; + out[29] = (w4 >> 5) & mask; + out[30] = (w4 >> 14) & mask; + out[31] = (w4 >> 23) & mask; + + return in; +} + +inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 1023ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 10) & mask; + out[2] = (w0 >> 20) & mask; + out[3] = (w0 >> 30) & mask; + out[4] = (w0 >> 40) & mask; + out[5] = (w0 >> 50) & mask; + out[6] = ((w0 >> 60) | (w1 << 4)) & mask; + out[7] = (w1 >> 6) & mask; + out[8] = (w1 >> 16) & mask; + out[9] = (w1 >> 26) & mask; + out[10] = (w1 >> 36) & mask; + out[11] = (w1 >> 46) & mask; + out[12] = ((w1 >> 56) | (w2 << 8)) & mask; + out[13] = (w2 >> 2) & mask; + out[14] = (w2 >> 12) & mask; + out[15] = (w2 >> 22) & mask; + out[16] = (w2 >> 32) & mask; + out[17] = (w2 >> 42) & mask; + out[18] = (w2 >> 52) & mask; + out[19] = ((w2 >> 62) | (w3 << 2)) & mask; + out[20] = (w3 >> 8) & mask; + out[21] = (w3 >> 18) & mask; + out[22] = (w3 >> 28) & mask; + out[23] = (w3 >> 38) & mask; + out[24] = (w3 >> 48) & mask; + out[25] = ((w3 >> 58) | (w4 << 6)) & mask; + out[26] = (w4 >> 4) & mask; + 
out[27] = (w4 >> 14) & mask; + out[28] = (w4 >> 24) & mask; + out[29] = (w4 >> 34) & mask; + out[30] = (w4 >> 44) & mask; + out[31] = w4 >> 54; + + return in; +} + +inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 2047ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 11) & mask; + out[2] = (w0 >> 22) & mask; + out[3] = (w0 >> 33) & mask; + out[4] = (w0 >> 44) & mask; + out[5] = ((w0 >> 55) | (w1 << 9)) & mask; + out[6] = (w1 >> 2) & mask; + out[7] = (w1 >> 13) & mask; + out[8] = (w1 >> 24) & mask; + out[9] = (w1 >> 35) & mask; + out[10] = (w1 >> 46) & mask; + out[11] = ((w1 >> 57) | (w2 << 7)) & mask; + out[12] = (w2 >> 4) & mask; + out[13] = (w2 >> 15) & mask; + out[14] = (w2 >> 26) & mask; + out[15] = (w2 >> 37) & mask; + out[16] = (w2 >> 48) & mask; + out[17] = ((w2 >> 59) | (w3 << 5)) & mask; + out[18] = (w3 >> 6) & mask; + out[19] = (w3 >> 17) & mask; + out[20] = (w3 >> 28) & mask; + out[21] = (w3 >> 39) & mask; + out[22] = (w3 >> 50) & mask; + out[23] = ((w3 >> 61) | (w4 << 3)) & mask; + out[24] = (w4 >> 8) & mask; + out[25] = (w4 >> 19) & mask; + out[26] = (w4 >> 30) & mask; + out[27] = (w4 >> 41) & mask; + out[28] = (w4 >> 52) & mask; + out[29] = ((w4 >> 63) | (w5 << 1)) & mask; + out[30] = (w5 >> 10) & mask; + out[31] = (w5 >> 21) & mask; + + return in; +} + +inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 
4095ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 12) & mask; + out[2] = (w0 >> 24) & mask; + out[3] = (w0 >> 36) & mask; + out[4] = (w0 >> 48) & mask; + out[5] = ((w0 >> 60) | (w1 << 4)) & mask; + out[6] = (w1 >> 8) & mask; + out[7] = (w1 >> 20) & mask; + out[8] = (w1 >> 32) & mask; + out[9] = (w1 >> 44) & mask; + out[10] = ((w1 >> 56) | (w2 << 8)) & mask; + out[11] = (w2 >> 4) & mask; + out[12] = (w2 >> 16) & mask; + out[13] = (w2 >> 28) & mask; + out[14] = (w2 >> 40) & mask; + out[15] = w2 >> 52; + out[16] = (w3)&mask; + out[17] = (w3 >> 12) & mask; + out[18] = (w3 >> 24) & mask; + out[19] = (w3 >> 36) & mask; + out[20] = (w3 >> 48) & mask; + out[21] = ((w3 >> 60) | (w4 << 4)) & mask; + out[22] = (w4 >> 8) & mask; + out[23] = (w4 >> 20) & mask; + out[24] = (w4 >> 32) & mask; + out[25] = (w4 >> 44) & mask; + out[26] = ((w4 >> 56) | (w5 << 8)) & mask; + out[27] = (w5 >> 4) & mask; + out[28] = (w5 >> 16) & mask; + out[29] = (w5 >> 28) & mask; + out[30] = (w5 >> 40) & mask; + out[31] = w5 >> 52; + + return in; +} + +inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 8191ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t 
w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 13) & mask; + out[2] = (w0 >> 26) & mask; + out[3] = (w0 >> 39) & mask; + out[4] = ((w0 >> 52) | (w1 << 12)) & mask; + out[5] = (w1 >> 1) & mask; + out[6] = (w1 >> 14) & mask; + out[7] = (w1 >> 27) & mask; + out[8] = (w1 >> 40) & mask; + out[9] = ((w1 >> 53) | (w2 << 11)) & mask; + out[10] = (w2 >> 2) & mask; + out[11] = (w2 >> 15) & mask; + out[12] = (w2 >> 28) & mask; + out[13] = (w2 >> 41) & mask; + out[14] = ((w2 >> 54) | (w3 << 10)) & mask; + out[15] = (w3 >> 3) & mask; + out[16] = (w3 >> 16) & mask; + out[17] = (w3 >> 29) & mask; + out[18] = (w3 >> 42) & mask; + out[19] = ((w3 >> 55) | (w4 << 9)) & mask; + out[20] = (w4 >> 4) & mask; + out[21] = (w4 >> 17) & mask; + out[22] = (w4 >> 30) & mask; + out[23] = (w4 >> 43) & mask; + out[24] = ((w4 >> 56) | (w5 << 8)) & mask; + out[25] = (w5 >> 5) & mask; + out[26] = (w5 >> 18) & mask; + out[27] = (w5 >> 31) & mask; + out[28] = (w5 >> 44) & mask; + out[29] = ((w5 >> 57) | (w6 << 7)) & mask; + out[30] = (w6 >> 6) & mask; + out[31] = (w6 >> 19) & mask; + + return in; +} + +inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 16383ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = 
arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 14) & mask; + out[2] = (w0 >> 28) & mask; + out[3] = (w0 >> 42) & mask; + out[4] = ((w0 >> 56) | (w1 << 8)) & mask; + out[5] = (w1 >> 6) & mask; + out[6] = (w1 >> 20) & mask; + out[7] = (w1 >> 34) & mask; + out[8] = (w1 >> 48) & mask; + out[9] = ((w1 >> 62) | (w2 << 2)) & mask; + out[10] = (w2 >> 12) & mask; + out[11] = (w2 >> 26) & mask; + out[12] = (w2 >> 40) & mask; + out[13] = ((w2 >> 54) | (w3 << 10)) & mask; + out[14] = (w3 >> 4) & mask; + out[15] = (w3 >> 18) & mask; + out[16] = (w3 >> 32) & mask; + out[17] = (w3 >> 46) & mask; + out[18] = ((w3 >> 60) | (w4 << 4)) & mask; + out[19] = (w4 >> 10) & mask; + out[20] = (w4 >> 24) & mask; + out[21] = (w4 >> 38) & mask; + out[22] = ((w4 >> 52) | (w5 << 12)) & mask; + out[23] = (w5 >> 2) & mask; + out[24] = (w5 >> 16) & mask; + out[25] = (w5 >> 30) & mask; + out[26] = (w5 >> 44) & mask; + out[27] = ((w5 >> 58) | (w6 << 6)) & mask; + out[28] = (w6 >> 8) & mask; + out[29] = (w6 >> 22) & mask; + out[30] = (w6 >> 36) & mask; + out[31] = w6 >> 50; + + return in; +} + +inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 32767ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = 
util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 15) & mask; + out[2] = (w0 >> 30) & mask; + out[3] = (w0 >> 45) & mask; + out[4] = ((w0 >> 60) | (w1 << 4)) & mask; + out[5] = (w1 >> 11) & mask; + out[6] = (w1 >> 26) & mask; + out[7] = (w1 >> 41) & mask; + out[8] = ((w1 >> 56) | (w2 << 8)) & mask; + out[9] = (w2 >> 7) & mask; + out[10] = (w2 >> 22) & mask; + out[11] = (w2 >> 37) & mask; + out[12] = ((w2 >> 52) | (w3 << 12)) & mask; + out[13] = (w3 >> 3) & mask; + out[14] = (w3 >> 18) & mask; + out[15] = (w3 >> 33) & mask; + out[16] = (w3 >> 48) & mask; + out[17] = ((w3 >> 63) | (w4 << 1)) & mask; + out[18] = (w4 >> 14) & mask; + out[19] = (w4 >> 29) & mask; + out[20] = (w4 >> 44) & mask; + out[21] = ((w4 >> 59) | (w5 << 5)) & mask; + out[22] = (w5 >> 10) & mask; + out[23] = (w5 >> 25) & mask; + out[24] = (w5 >> 40) & mask; + out[25] = ((w5 >> 55) | (w6 << 9)) & mask; + out[26] = (w6 >> 6) & mask; + out[27] = (w6 >> 21) & mask; + out[28] = (w6 >> 36) & mask; + out[29] = ((w6 >> 51) | (w7 << 13)) & mask; + out[30] = (w7 >> 2) & mask; + out[31] = (w7 >> 17) & mask; + + return in; +} + +inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 65535ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = 
arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 16) & mask; + out[2] = (w0 >> 32) & mask; + out[3] = w0 >> 48; + out[4] = (w1)&mask; + out[5] = (w1 >> 16) & mask; + out[6] = (w1 >> 32) & mask; + out[7] = w1 >> 48; + out[8] = (w2)&mask; + out[9] = (w2 >> 16) & mask; + out[10] = (w2 >> 32) & mask; + out[11] = w2 >> 48; + out[12] = (w3)&mask; + out[13] = (w3 >> 16) & mask; + out[14] = (w3 >> 32) & mask; + out[15] = w3 >> 48; + out[16] = (w4)&mask; + out[17] = (w4 >> 16) & mask; + out[18] = (w4 >> 32) & mask; + out[19] = w4 >> 48; + out[20] = (w5)&mask; + out[21] = (w5 >> 16) & mask; + out[22] = (w5 >> 32) & mask; + out[23] = w5 >> 48; + out[24] = (w6)&mask; + out[25] = (w6 >> 16) & mask; + out[26] = (w6 >> 32) & mask; + out[27] = w6 >> 48; + out[28] = (w7)&mask; + out[29] = (w7 >> 16) & mask; + out[30] = (w7 >> 32) & mask; + out[31] = w7 >> 48; + + return in; +} + +inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 131071ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 4; + out[0] = (w0)&mask; + 
out[1] = (w0 >> 17) & mask; + out[2] = (w0 >> 34) & mask; + out[3] = ((w0 >> 51) | (w1 << 13)) & mask; + out[4] = (w1 >> 4) & mask; + out[5] = (w1 >> 21) & mask; + out[6] = (w1 >> 38) & mask; + out[7] = ((w1 >> 55) | (w2 << 9)) & mask; + out[8] = (w2 >> 8) & mask; + out[9] = (w2 >> 25) & mask; + out[10] = (w2 >> 42) & mask; + out[11] = ((w2 >> 59) | (w3 << 5)) & mask; + out[12] = (w3 >> 12) & mask; + out[13] = (w3 >> 29) & mask; + out[14] = (w3 >> 46) & mask; + out[15] = ((w3 >> 63) | (w4 << 1)) & mask; + out[16] = (w4 >> 16) & mask; + out[17] = (w4 >> 33) & mask; + out[18] = ((w4 >> 50) | (w5 << 14)) & mask; + out[19] = (w5 >> 3) & mask; + out[20] = (w5 >> 20) & mask; + out[21] = (w5 >> 37) & mask; + out[22] = ((w5 >> 54) | (w6 << 10)) & mask; + out[23] = (w6 >> 7) & mask; + out[24] = (w6 >> 24) & mask; + out[25] = (w6 >> 41) & mask; + out[26] = ((w6 >> 58) | (w7 << 6)) & mask; + out[27] = (w7 >> 11) & mask; + out[28] = (w7 >> 28) & mask; + out[29] = (w7 >> 45) & mask; + out[30] = ((w7 >> 62) | (w8 << 2)) & mask; + out[31] = (w8 >> 15) & mask; + + return in; +} + +inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 262143ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = 
arrow::bit_util::FromLittleEndian(w8); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 18) & mask; + out[2] = (w0 >> 36) & mask; + out[3] = ((w0 >> 54) | (w1 << 10)) & mask; + out[4] = (w1 >> 8) & mask; + out[5] = (w1 >> 26) & mask; + out[6] = (w1 >> 44) & mask; + out[7] = ((w1 >> 62) | (w2 << 2)) & mask; + out[8] = (w2 >> 16) & mask; + out[9] = (w2 >> 34) & mask; + out[10] = ((w2 >> 52) | (w3 << 12)) & mask; + out[11] = (w3 >> 6) & mask; + out[12] = (w3 >> 24) & mask; + out[13] = (w3 >> 42) & mask; + out[14] = ((w3 >> 60) | (w4 << 4)) & mask; + out[15] = (w4 >> 14) & mask; + out[16] = (w4 >> 32) & mask; + out[17] = ((w4 >> 50) | (w5 << 14)) & mask; + out[18] = (w5 >> 4) & mask; + out[19] = (w5 >> 22) & mask; + out[20] = (w5 >> 40) & mask; + out[21] = ((w5 >> 58) | (w6 << 6)) & mask; + out[22] = (w6 >> 12) & mask; + out[23] = (w6 >> 30) & mask; + out[24] = ((w6 >> 48) | (w7 << 16)) & mask; + out[25] = (w7 >> 2) & mask; + out[26] = (w7 >> 20) & mask; + out[27] = (w7 >> 38) & mask; + out[28] = ((w7 >> 56) | (w8 << 8)) & mask; + out[29] = (w8 >> 10) & mask; + out[30] = (w8 >> 28) & mask; + out[31] = w8 >> 46; + + return in; +} + +inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 524287ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = 
arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 19) & mask; + out[2] = (w0 >> 38) & mask; + out[3] = ((w0 >> 57) | (w1 << 7)) & mask; + out[4] = (w1 >> 12) & mask; + out[5] = (w1 >> 31) & mask; + out[6] = ((w1 >> 50) | (w2 << 14)) & mask; + out[7] = (w2 >> 5) & mask; + out[8] = (w2 >> 24) & mask; + out[9] = (w2 >> 43) & mask; + out[10] = ((w2 >> 62) | (w3 << 2)) & mask; + out[11] = (w3 >> 17) & mask; + out[12] = (w3 >> 36) & mask; + out[13] = ((w3 >> 55) | (w4 << 9)) & mask; + out[14] = (w4 >> 10) & mask; + out[15] = (w4 >> 29) & mask; + out[16] = ((w4 >> 48) | (w5 << 16)) & mask; + out[17] = (w5 >> 3) & mask; + out[18] = (w5 >> 22) & mask; + out[19] = (w5 >> 41) & mask; + out[20] = ((w5 >> 60) | (w6 << 4)) & mask; + out[21] = (w6 >> 15) & mask; + out[22] = (w6 >> 34) & mask; + out[23] = ((w6 >> 53) | (w7 << 11)) & mask; + out[24] = (w7 >> 8) & mask; + out[25] = (w7 >> 27) & mask; + out[26] = ((w7 >> 46) | (w8 << 18)) & mask; + out[27] = (w8 >> 1) & mask; + out[28] = (w8 >> 20) & mask; + out[29] = (w8 >> 39) & mask; + out[30] = ((w8 >> 58) | (w9 << 6)) & mask; + out[31] = (w9 >> 13) & mask; + + return in; +} + +inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 1048575ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = 
arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 20) & mask; + out[2] = (w0 >> 40) & mask; + out[3] = ((w0 >> 60) | (w1 << 4)) & mask; + out[4] = (w1 >> 16) & mask; + out[5] = (w1 >> 36) & mask; + out[6] = ((w1 >> 56) | (w2 << 8)) & mask; + out[7] = (w2 >> 12) & mask; + out[8] = (w2 >> 32) & mask; + out[9] = ((w2 >> 52) | (w3 << 12)) & mask; + out[10] = (w3 >> 8) & mask; + out[11] = (w3 >> 28) & mask; + out[12] = ((w3 >> 48) | (w4 << 16)) & mask; + out[13] = (w4 >> 4) & mask; + out[14] = (w4 >> 24) & mask; + out[15] = w4 >> 44; + out[16] = (w5)&mask; + out[17] = (w5 >> 20) & mask; + out[18] = (w5 >> 40) & mask; + out[19] = ((w5 >> 60) | (w6 << 4)) & mask; + out[20] = (w6 >> 16) & mask; + out[21] = (w6 >> 36) & mask; + out[22] = ((w6 >> 56) | (w7 << 8)) & mask; + out[23] = (w7 >> 12) & mask; + out[24] = (w7 >> 32) & mask; + out[25] = ((w7 >> 52) | (w8 << 12)) & mask; + out[26] = (w8 >> 8) & mask; + out[27] = (w8 >> 28) & mask; + out[28] = ((w8 >> 48) | (w9 << 16)) & mask; + out[29] = (w9 >> 4) & mask; + out[30] = (w9 >> 24) & mask; + out[31] = w9 >> 44; + + return in; +} + +inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 2097151ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + 
uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 21) & mask; + out[2] = (w0 >> 42) & mask; + out[3] = ((w0 >> 63) | (w1 << 1)) & mask; + out[4] = (w1 >> 20) & mask; + out[5] = (w1 >> 41) & mask; + out[6] = ((w1 >> 62) | (w2 << 2)) & mask; + out[7] = (w2 >> 19) & mask; + out[8] = (w2 >> 40) & mask; + out[9] = ((w2 >> 61) | (w3 << 3)) & mask; + out[10] = (w3 >> 18) & mask; + out[11] = (w3 >> 39) & mask; + out[12] = ((w3 >> 60) | (w4 << 4)) & mask; + out[13] = (w4 >> 17) & mask; + out[14] = (w4 >> 38) & mask; + out[15] = ((w4 >> 59) | (w5 << 5)) & mask; + out[16] = (w5 >> 16) & mask; + out[17] = (w5 >> 37) & mask; + out[18] = ((w5 >> 58) | (w6 << 6)) & mask; + out[19] = (w6 >> 15) & mask; + out[20] = (w6 >> 36) & mask; + out[21] = ((w6 >> 57) | (w7 << 7)) & mask; + out[22] = (w7 >> 14) & mask; + out[23] = (w7 >> 35) & mask; + out[24] = ((w7 >> 56) | (w8 << 8)) & mask; + out[25] = (w8 >> 13) & mask; + out[26] = (w8 >> 34) & mask; + out[27] = ((w8 >> 55) | (w9 << 9)) & mask; + out[28] = (w9 >> 12) & mask; + out[29] = (w9 >> 33) & mask; + out[30] = ((w9 >> 54) | (w10 << 10)) & mask; + out[31] = (w10 >> 11) & mask; + + return in; +} + +inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 4194303ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in 
+= 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 22) & mask; + out[2] = ((w0 >> 44) | (w1 << 20)) & mask; + out[3] = (w1 >> 2) & mask; + out[4] = (w1 >> 24) & mask; + out[5] = ((w1 >> 46) | (w2 << 18)) & mask; + out[6] = (w2 >> 4) & mask; + out[7] = (w2 >> 26) & mask; + out[8] = ((w2 >> 48) | (w3 << 16)) & mask; + out[9] = (w3 >> 6) & mask; + out[10] = (w3 >> 28) & mask; + out[11] = ((w3 >> 50) | (w4 << 14)) & mask; + out[12] = (w4 >> 8) & mask; + out[13] = (w4 >> 30) & mask; + out[14] = ((w4 >> 52) | (w5 << 12)) & mask; + out[15] = (w5 >> 10) & mask; + out[16] = (w5 >> 32) & mask; + out[17] = ((w5 >> 54) | (w6 << 10)) & mask; + out[18] = (w6 >> 12) & mask; + out[19] = (w6 >> 34) & mask; + out[20] = ((w6 >> 56) | (w7 << 8)) & mask; + out[21] = (w7 >> 14) & mask; + out[22] = (w7 >> 36) & mask; + out[23] = ((w7 >> 58) | (w8 << 6)) & mask; + out[24] = (w8 >> 16) & mask; + out[25] = (w8 >> 38) & mask; + out[26] = ((w8 >> 60) | (w9 << 4)) & mask; + out[27] = (w9 >> 18) & mask; + out[28] = (w9 >> 40) & mask; + out[29] = ((w9 >> 62) | (w10 
<< 2)) & mask; + out[30] = (w10 >> 20) & mask; + out[31] = w10 >> 42; + + return in; +} + +inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 8388607ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 23) & mask; + out[2] = ((w0 >> 46) | (w1 << 18)) & mask; + out[3] = (w1 >> 5) & mask; + out[4] = (w1 >> 28) & mask; + out[5] = ((w1 >> 51) | (w2 << 13)) & mask; + out[6] = (w2 >> 10) & mask; + out[7] = (w2 >> 33) & mask; + out[8] = ((w2 >> 56) | (w3 << 8)) & mask; + out[9] = (w3 >> 15) & mask; + out[10] = (w3 >> 38) & mask; + out[11] = ((w3 >> 61) | (w4 << 3)) & mask; + out[12] = (w4 >> 20) & mask; + out[13] = ((w4 >> 43) | (w5 << 21)) & mask; + out[14] = (w5 >> 2) & mask; + out[15] = (w5 >> 25) & mask; + out[16] = ((w5 >> 48) | (w6 << 16)) & mask; + out[17] = (w6 >> 7) & mask; + out[18] = (w6 >> 30) & mask; 
+ out[19] = ((w6 >> 53) | (w7 << 11)) & mask; + out[20] = (w7 >> 12) & mask; + out[21] = (w7 >> 35) & mask; + out[22] = ((w7 >> 58) | (w8 << 6)) & mask; + out[23] = (w8 >> 17) & mask; + out[24] = (w8 >> 40) & mask; + out[25] = ((w8 >> 63) | (w9 << 1)) & mask; + out[26] = (w9 >> 22) & mask; + out[27] = ((w9 >> 45) | (w10 << 19)) & mask; + out[28] = (w10 >> 4) & mask; + out[29] = (w10 >> 27) & mask; + out[30] = ((w10 >> 50) | (w11 << 14)) & mask; + out[31] = (w11 >> 9) & mask; + + return in; +} + +inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 16777215ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 24) & mask; + out[2] = ((w0 >> 48) | (w1 << 16)) & mask; + out[3] = (w1 >> 8) & mask; + out[4] = (w1 >> 32) & mask; + out[5] = ((w1 >> 56) | (w2 << 8)) & mask; + out[6] = (w2 >> 16) & mask; + out[7] = w2 
>> 40; + out[8] = (w3)&mask; + out[9] = (w3 >> 24) & mask; + out[10] = ((w3 >> 48) | (w4 << 16)) & mask; + out[11] = (w4 >> 8) & mask; + out[12] = (w4 >> 32) & mask; + out[13] = ((w4 >> 56) | (w5 << 8)) & mask; + out[14] = (w5 >> 16) & mask; + out[15] = w5 >> 40; + out[16] = (w6)&mask; + out[17] = (w6 >> 24) & mask; + out[18] = ((w6 >> 48) | (w7 << 16)) & mask; + out[19] = (w7 >> 8) & mask; + out[20] = (w7 >> 32) & mask; + out[21] = ((w7 >> 56) | (w8 << 8)) & mask; + out[22] = (w8 >> 16) & mask; + out[23] = w8 >> 40; + out[24] = (w9)&mask; + out[25] = (w9 >> 24) & mask; + out[26] = ((w9 >> 48) | (w10 << 16)) & mask; + out[27] = (w10 >> 8) & mask; + out[28] = (w10 >> 32) & mask; + out[29] = ((w10 >> 56) | (w11 << 8)) & mask; + out[30] = (w11 >> 16) & mask; + out[31] = w11 >> 40; + + return in; +} + +inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 33554431ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = 
util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 25) & mask; + out[2] = ((w0 >> 50) | (w1 << 14)) & mask; + out[3] = (w1 >> 11) & mask; + out[4] = (w1 >> 36) & mask; + out[5] = ((w1 >> 61) | (w2 << 3)) & mask; + out[6] = (w2 >> 22) & mask; + out[7] = ((w2 >> 47) | (w3 << 17)) & mask; + out[8] = (w3 >> 8) & mask; + out[9] = (w3 >> 33) & mask; + out[10] = ((w3 >> 58) | (w4 << 6)) & mask; + out[11] = (w4 >> 19) & mask; + out[12] = ((w4 >> 44) | (w5 << 20)) & mask; + out[13] = (w5 >> 5) & mask; + out[14] = (w5 >> 30) & mask; + out[15] = ((w5 >> 55) | (w6 << 9)) & mask; + out[16] = (w6 >> 16) & mask; + out[17] = ((w6 >> 41) | (w7 << 23)) & mask; + out[18] = (w7 >> 2) & mask; + out[19] = (w7 >> 27) & mask; + out[20] = ((w7 >> 52) | (w8 << 12)) & mask; + out[21] = (w8 >> 13) & mask; + out[22] = (w8 >> 38) & mask; + out[23] = ((w8 >> 63) | (w9 << 1)) & mask; + out[24] = (w9 >> 24) & mask; + out[25] = ((w9 >> 49) | (w10 << 15)) & mask; + out[26] = (w10 >> 10) & mask; + out[27] = (w10 >> 35) & mask; + out[28] = ((w10 >> 60) | (w11 << 4)) & mask; + out[29] = (w11 >> 21) & mask; + out[30] = ((w11 >> 46) | (w12 << 18)) & mask; + out[31] = (w12 >> 7) & mask; + + return in; +} + +inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 67108863ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = 
arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 26) & mask; + out[2] = ((w0 >> 52) | (w1 << 12)) & mask; + out[3] = (w1 >> 14) & mask; + out[4] = ((w1 >> 40) | (w2 << 24)) & mask; + out[5] = (w2 >> 2) & mask; + out[6] = (w2 >> 28) & mask; + out[7] = ((w2 >> 54) | (w3 << 10)) & mask; + out[8] = (w3 >> 16) & mask; + out[9] = ((w3 >> 42) | (w4 << 22)) & mask; + out[10] = (w4 >> 4) & mask; + out[11] = (w4 >> 30) & mask; + out[12] = ((w4 >> 56) | (w5 << 8)) & mask; + out[13] = (w5 >> 18) & mask; + out[14] = ((w5 >> 44) | (w6 << 20)) & mask; + out[15] = (w6 >> 6) & mask; + out[16] = (w6 >> 32) & mask; + out[17] = ((w6 >> 58) | (w7 << 6)) & mask; + out[18] = (w7 >> 20) & mask; + out[19] = ((w7 >> 46) | (w8 << 18)) & mask; + out[20] = (w8 >> 8) & mask; + out[21] = (w8 >> 34) & mask; + out[22] = ((w8 >> 60) | (w9 << 4)) & mask; + out[23] = (w9 >> 22) & mask; + out[24] = ((w9 >> 48) | (w10 << 16)) & mask; + out[25] = (w10 >> 10) & mask; + out[26] = (w10 >> 36) & mask; + out[27] = ((w10 >> 62) | (w11 << 2)) & mask; + out[28] = (w11 >> 24) & mask; + out[29] = ((w11 >> 50) | (w12 << 14)) & mask; + out[30] = (w12 >> 12) & mask; + out[31] = w12 >> 38; + + return in; +} + +inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 134217727ULL; 
+ uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 27) & mask; + out[2] = ((w0 >> 54) | (w1 << 10)) & mask; + out[3] = (w1 >> 17) & mask; + out[4] = ((w1 >> 44) | (w2 << 20)) & mask; + out[5] = (w2 >> 7) & mask; + out[6] = (w2 >> 34) & mask; + out[7] = ((w2 >> 61) | (w3 << 3)) & mask; + out[8] = (w3 >> 24) & mask; + out[9] = ((w3 >> 51) | (w4 << 13)) & mask; + out[10] = (w4 >> 14) & mask; + out[11] = ((w4 >> 41) | (w5 << 23)) & mask; + out[12] = (w5 >> 4) & mask; + out[13] = (w5 >> 31) & mask; + out[14] = ((w5 >> 58) | (w6 << 6)) & mask; + out[15] = (w6 >> 21) & mask; + out[16] = ((w6 >> 48) | (w7 << 16)) & mask; + out[17] = (w7 >> 11) & mask; + out[18] = ((w7 
>> 38) | (w8 << 26)) & mask; + out[19] = (w8 >> 1) & mask; + out[20] = (w8 >> 28) & mask; + out[21] = ((w8 >> 55) | (w9 << 9)) & mask; + out[22] = (w9 >> 18) & mask; + out[23] = ((w9 >> 45) | (w10 << 19)) & mask; + out[24] = (w10 >> 8) & mask; + out[25] = (w10 >> 35) & mask; + out[26] = ((w10 >> 62) | (w11 << 2)) & mask; + out[27] = (w11 >> 25) & mask; + out[28] = ((w11 >> 52) | (w12 << 12)) & mask; + out[29] = (w12 >> 15) & mask; + out[30] = ((w12 >> 42) | (w13 << 22)) & mask; + out[31] = (w13 >> 5) & mask; + + return in; +} + +inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 268435455ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + out[0] = 
(w0)&mask; + out[1] = (w0 >> 28) & mask; + out[2] = ((w0 >> 56) | (w1 << 8)) & mask; + out[3] = (w1 >> 20) & mask; + out[4] = ((w1 >> 48) | (w2 << 16)) & mask; + out[5] = (w2 >> 12) & mask; + out[6] = ((w2 >> 40) | (w3 << 24)) & mask; + out[7] = (w3 >> 4) & mask; + out[8] = (w3 >> 32) & mask; + out[9] = ((w3 >> 60) | (w4 << 4)) & mask; + out[10] = (w4 >> 24) & mask; + out[11] = ((w4 >> 52) | (w5 << 12)) & mask; + out[12] = (w5 >> 16) & mask; + out[13] = ((w5 >> 44) | (w6 << 20)) & mask; + out[14] = (w6 >> 8) & mask; + out[15] = w6 >> 36; + out[16] = (w7)&mask; + out[17] = (w7 >> 28) & mask; + out[18] = ((w7 >> 56) | (w8 << 8)) & mask; + out[19] = (w8 >> 20) & mask; + out[20] = ((w8 >> 48) | (w9 << 16)) & mask; + out[21] = (w9 >> 12) & mask; + out[22] = ((w9 >> 40) | (w10 << 24)) & mask; + out[23] = (w10 >> 4) & mask; + out[24] = (w10 >> 32) & mask; + out[25] = ((w10 >> 60) | (w11 << 4)) & mask; + out[26] = (w11 >> 24) & mask; + out[27] = ((w11 >> 52) | (w12 << 12)) & mask; + out[28] = (w12 >> 16) & mask; + out[29] = ((w12 >> 44) | (w13 << 20)) & mask; + out[30] = (w13 >> 8) & mask; + out[31] = w13 >> 36; + + return in; +} + +inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 536870911ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); 
+ in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 29) & mask; + out[2] = ((w0 >> 58) | (w1 << 6)) & mask; + out[3] = (w1 >> 23) & mask; + out[4] = ((w1 >> 52) | (w2 << 12)) & mask; + out[5] = (w2 >> 17) & mask; + out[6] = ((w2 >> 46) | (w3 << 18)) & mask; + out[7] = (w3 >> 11) & mask; + out[8] = ((w3 >> 40) | (w4 << 24)) & mask; + out[9] = (w4 >> 5) & mask; + out[10] = (w4 >> 34) & mask; + out[11] = ((w4 >> 63) | (w5 << 1)) & mask; + out[12] = (w5 >> 28) & mask; + out[13] = ((w5 >> 57) | (w6 << 7)) & mask; + out[14] = (w6 >> 22) & mask; + out[15] = ((w6 >> 51) | (w7 << 13)) & mask; + out[16] = (w7 >> 16) & mask; + out[17] = ((w7 >> 45) | (w8 << 19)) & mask; + out[18] = (w8 >> 10) & mask; + out[19] = ((w8 >> 39) | (w9 << 25)) & mask; + out[20] = (w9 >> 4) & mask; + out[21] = (w9 >> 33) & mask; + out[22] = ((w9 >> 62) | (w10 << 2)) & mask; + out[23] = (w10 >> 27) & mask; + out[24] = ((w10 >> 56) | (w11 << 8)) & mask; + out[25] = (w11 >> 21) & mask; + out[26] = ((w11 >> 50) | (w12 << 14)) & mask; + out[27] = (w12 >> 15) & mask; + out[28] = ((w12 >> 44) | (w13 << 20)) & mask; + out[29] = (w13 >> 9) & mask; + out[30] = ((w13 >> 38) | (w14 << 26)) & mask; + out[31] = (w14 >> 3) & mask; + + return in; +} + +inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 
1073741823ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + out[0] = (w0)&mask; + out[1] = (w0 >> 30) & mask; + out[2] = ((w0 >> 60) | (w1 << 4)) & mask; + out[3] = (w1 >> 26) & mask; + out[4] = ((w1 >> 56) | (w2 << 8)) & mask; + out[5] = (w2 >> 22) & mask; + out[6] = ((w2 >> 52) | (w3 << 12)) & mask; + out[7] = (w3 >> 18) & mask; + out[8] = ((w3 >> 48) | (w4 << 16)) & mask; + out[9] = (w4 >> 14) & mask; + out[10] = ((w4 >> 44) | (w5 << 20)) & mask; + out[11] = (w5 >> 10) & mask; + out[12] = ((w5 >> 40) | (w6 << 24)) & mask; + out[13] = (w6 >> 6) & mask; + out[14] = ((w6 >> 36) | (w7 << 28)) & 
mask; + out[15] = (w7 >> 2) & mask; + out[16] = (w7 >> 32) & mask; + out[17] = ((w7 >> 62) | (w8 << 2)) & mask; + out[18] = (w8 >> 28) & mask; + out[19] = ((w8 >> 58) | (w9 << 6)) & mask; + out[20] = (w9 >> 24) & mask; + out[21] = ((w9 >> 54) | (w10 << 10)) & mask; + out[22] = (w10 >> 20) & mask; + out[23] = ((w10 >> 50) | (w11 << 14)) & mask; + out[24] = (w11 >> 16) & mask; + out[25] = ((w11 >> 46) | (w12 << 18)) & mask; + out[26] = (w12 >> 12) & mask; + out[27] = ((w12 >> 42) | (w13 << 22)) & mask; + out[28] = (w13 >> 8) & mask; + out[29] = ((w13 >> 38) | (w14 << 26)) & mask; + out[30] = (w14 >> 4) & mask; + out[31] = w14 >> 34; + + return in; +} + +inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 2147483647ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = 
arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 4; + out[0] = (w0)&mask; + out[1] = (w0 >> 31) & mask; + out[2] = ((w0 >> 62) | (w1 << 2)) & mask; + out[3] = (w1 >> 29) & mask; + out[4] = ((w1 >> 60) | (w2 << 4)) & mask; + out[5] = (w2 >> 27) & mask; + out[6] = ((w2 >> 58) | (w3 << 6)) & mask; + out[7] = (w3 >> 25) & mask; + out[8] = ((w3 >> 56) | (w4 << 8)) & mask; + out[9] = (w4 >> 23) & mask; + out[10] = ((w4 >> 54) | (w5 << 10)) & mask; + out[11] = (w5 >> 21) & mask; + out[12] = ((w5 >> 52) | (w6 << 12)) & mask; + out[13] = (w6 >> 19) & mask; + out[14] = ((w6 >> 50) | (w7 << 14)) & mask; + out[15] = (w7 >> 17) & mask; + out[16] = ((w7 >> 48) | (w8 << 16)) & mask; + out[17] = (w8 >> 15) & mask; + out[18] = ((w8 >> 46) | (w9 << 18)) & mask; + out[19] = (w9 >> 13) & mask; + out[20] = ((w9 >> 44) | (w10 << 20)) & mask; + out[21] = (w10 >> 11) & mask; + out[22] = ((w10 >> 42) | (w11 << 22)) & mask; + out[23] = (w11 >> 9) & mask; + out[24] = ((w11 >> 40) | (w12 << 24)) & mask; + out[25] = (w12 >> 7) & mask; + out[26] = ((w12 >> 38) | (w13 << 26)) & mask; + out[27] = (w13 >> 5) & mask; + out[28] = ((w13 >> 36) | (w14 << 28)) & mask; + out[29] = (w14 >> 3) & mask; + out[30] = ((w14 >> 34) | (w15 << 30)) & mask; + out[31] = (w15 >> 1) & mask; + + return in; +} + +inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 4294967295ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = 
util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + out[0] = (w0)&mask; + out[1] = w0 >> 32; + out[2] = (w1)&mask; + out[3] = w1 >> 32; + out[4] = (w2)&mask; + out[5] = w2 >> 32; + out[6] = (w3)&mask; + out[7] = w3 >> 32; + out[8] = (w4)&mask; + out[9] = w4 >> 32; + out[10] = (w5)&mask; + out[11] = w5 >> 32; + out[12] = (w6)&mask; + out[13] = w6 >> 32; + out[14] = (w7)&mask; + out[15] = w7 >> 32; + out[16] = (w8)&mask; + out[17] = w8 >> 32; + out[18] = (w9)&mask; + out[19] = w9 >> 32; + out[20] = (w10)&mask; + out[21] = w10 >> 32; + out[22] = (w11)&mask; + out[23] = w11 >> 32; + out[24] = (w12)&mask; + out[25] = w12 >> 32; + out[26] = (w13)&mask; + out[27] = w13 >> 32; + out[28] = (w14)&mask; + out[29] = w14 >> 32; + out[30] = (w15)&mask; + out[31] = w15 >> 32; + + return in; +} + +inline const 
uint8_t* unpack33_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 8589934591ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 33) | (w1 << 31)) & mask; + out[2] = (w1 >> 2) & mask; + out[3] = ((w1 >> 35) | (w2 << 29)) & mask; + out[4] = (w2 >> 4) & mask; + out[5] = ((w2 >> 37) | (w3 << 27)) & mask; + out[6] = (w3 >> 6) & mask; + out[7] = ((w3 >> 39) 
| (w4 << 25)) & mask; + out[8] = (w4 >> 8) & mask; + out[9] = ((w4 >> 41) | (w5 << 23)) & mask; + out[10] = (w5 >> 10) & mask; + out[11] = ((w5 >> 43) | (w6 << 21)) & mask; + out[12] = (w6 >> 12) & mask; + out[13] = ((w6 >> 45) | (w7 << 19)) & mask; + out[14] = (w7 >> 14) & mask; + out[15] = ((w7 >> 47) | (w8 << 17)) & mask; + out[16] = (w8 >> 16) & mask; + out[17] = ((w8 >> 49) | (w9 << 15)) & mask; + out[18] = (w9 >> 18) & mask; + out[19] = ((w9 >> 51) | (w10 << 13)) & mask; + out[20] = (w10 >> 20) & mask; + out[21] = ((w10 >> 53) | (w11 << 11)) & mask; + out[22] = (w11 >> 22) & mask; + out[23] = ((w11 >> 55) | (w12 << 9)) & mask; + out[24] = (w12 >> 24) & mask; + out[25] = ((w12 >> 57) | (w13 << 7)) & mask; + out[26] = (w13 >> 26) & mask; + out[27] = ((w13 >> 59) | (w14 << 5)) & mask; + out[28] = (w14 >> 28) & mask; + out[29] = ((w14 >> 61) | (w15 << 3)) & mask; + out[30] = (w15 >> 30) & mask; + out[31] = ((w15 >> 63) | (w16 << 1)) & mask; + + return in; +} + +inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 17179869183ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = 
arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 34) | (w1 << 30)) & mask; + out[2] = (w1 >> 4) & mask; + out[3] = ((w1 >> 38) | (w2 << 26)) & mask; + out[4] = (w2 >> 8) & mask; + out[5] = ((w2 >> 42) | (w3 << 22)) & mask; + out[6] = (w3 >> 12) & mask; + out[7] = ((w3 >> 46) | (w4 << 18)) & mask; + out[8] = (w4 >> 16) & mask; + out[9] = ((w4 >> 50) | (w5 << 14)) & mask; + out[10] = (w5 >> 20) & mask; + out[11] = ((w5 >> 54) | (w6 << 10)) & mask; + out[12] = (w6 >> 24) & mask; + out[13] = ((w6 >> 58) | (w7 << 6)) & mask; + out[14] = (w7 >> 28) & mask; + out[15] = ((w7 >> 62) | (w8 << 2)) & mask; + out[16] = ((w8 >> 32) | (w9 << 32)) & mask; + out[17] = (w9 >> 2) & mask; + out[18] = ((w9 >> 36) | (w10 << 28)) & mask; + out[19] = (w10 >> 6) & mask; + out[20] = ((w10 >> 40) | (w11 << 24)) & mask; + out[21] = (w11 >> 10) & mask; + out[22] = ((w11 >> 44) | (w12 << 20)) & mask; + out[23] = (w12 >> 14) & mask; + out[24] = ((w12 >> 48) | (w13 << 16)) & mask; + out[25] = (w13 >> 18) & mask; + out[26] = ((w13 >> 52) | (w14 << 12)) & mask; + out[27] = (w14 >> 22) & mask; + out[28] = ((w14 >> 56) | (w15 << 8)) & mask; + out[29] = (w15 >> 26) & mask; + out[30] = ((w15 >> 60) | (w16 << 4)) & mask; + out[31] = w16 >> 30; + + return in; +} + +inline const uint8_t* 
unpack35_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 34359738367ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 35) | (w1 << 29)) & mask; + out[2] = (w1 >> 6) & mask; + out[3] = ((w1 >> 41) | (w2 << 23)) & mask; + out[4] = (w2 >> 12) & mask; + 
out[5] = ((w2 >> 47) | (w3 << 17)) & mask; + out[6] = (w3 >> 18) & mask; + out[7] = ((w3 >> 53) | (w4 << 11)) & mask; + out[8] = (w4 >> 24) & mask; + out[9] = ((w4 >> 59) | (w5 << 5)) & mask; + out[10] = ((w5 >> 30) | (w6 << 34)) & mask; + out[11] = (w6 >> 1) & mask; + out[12] = ((w6 >> 36) | (w7 << 28)) & mask; + out[13] = (w7 >> 7) & mask; + out[14] = ((w7 >> 42) | (w8 << 22)) & mask; + out[15] = (w8 >> 13) & mask; + out[16] = ((w8 >> 48) | (w9 << 16)) & mask; + out[17] = (w9 >> 19) & mask; + out[18] = ((w9 >> 54) | (w10 << 10)) & mask; + out[19] = (w10 >> 25) & mask; + out[20] = ((w10 >> 60) | (w11 << 4)) & mask; + out[21] = ((w11 >> 31) | (w12 << 33)) & mask; + out[22] = (w12 >> 2) & mask; + out[23] = ((w12 >> 37) | (w13 << 27)) & mask; + out[24] = (w13 >> 8) & mask; + out[25] = ((w13 >> 43) | (w14 << 21)) & mask; + out[26] = (w14 >> 14) & mask; + out[27] = ((w14 >> 49) | (w15 << 15)) & mask; + out[28] = (w15 >> 20) & mask; + out[29] = ((w15 >> 55) | (w16 << 9)) & mask; + out[30] = (w16 >> 26) & mask; + out[31] = ((w16 >> 61) | (w17 << 3)) & mask; + + return in; +} + +inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 68719476735ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + 
w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 36) | (w1 << 28)) & mask; + out[2] = (w1 >> 8) & mask; + out[3] = ((w1 >> 44) | (w2 << 20)) & mask; + out[4] = (w2 >> 16) & mask; + out[5] = ((w2 >> 52) | (w3 << 12)) & mask; + out[6] = (w3 >> 24) & mask; + out[7] = ((w3 >> 60) | (w4 << 4)) & mask; + out[8] = ((w4 >> 32) | (w5 << 32)) & mask; + out[9] = (w5 >> 4) & mask; + out[10] = ((w5 >> 40) | (w6 << 24)) & mask; + out[11] = (w6 >> 12) & mask; + out[12] = ((w6 >> 48) | (w7 << 16)) & mask; + out[13] = (w7 >> 20) & mask; + out[14] = ((w7 >> 56) | (w8 << 8)) & mask; + out[15] = w8 >> 28; + out[16] = (w9)&mask; + out[17] = ((w9 >> 36) | (w10 << 28)) & mask; + out[18] = (w10 >> 8) & mask; + out[19] = ((w10 >> 44) | (w11 << 20)) & mask; + out[20] = (w11 >> 16) & mask; + out[21] = ((w11 >> 52) | (w12 << 12)) & mask; + out[22] = (w12 >> 24) & mask; + out[23] = ((w12 >> 60) | (w13 << 4)) & mask; + out[24] = ((w13 >> 32) | (w14 << 32)) & mask; + out[25] = (w14 >> 4) & mask; + out[26] = ((w14 >> 40) | (w15 << 24)) & mask; + out[27] = (w15 >> 12) & mask; + out[28] = 
((w15 >> 48) | (w16 << 16)) & mask; + out[29] = (w16 >> 20) & mask; + out[30] = ((w16 >> 56) | (w17 << 8)) & mask; + out[31] = w17 >> 28; + + return in; +} + +inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 137438953471ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in 
+= 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 37) | (w1 << 27)) & mask; + out[2] = (w1 >> 10) & mask; + out[3] = ((w1 >> 47) | (w2 << 17)) & mask; + out[4] = (w2 >> 20) & mask; + out[5] = ((w2 >> 57) | (w3 << 7)) & mask; + out[6] = ((w3 >> 30) | (w4 << 34)) & mask; + out[7] = (w4 >> 3) & mask; + out[8] = ((w4 >> 40) | (w5 << 24)) & mask; + out[9] = (w5 >> 13) & mask; + out[10] = ((w5 >> 50) | (w6 << 14)) & mask; + out[11] = (w6 >> 23) & mask; + out[12] = ((w6 >> 60) | (w7 << 4)) & mask; + out[13] = ((w7 >> 33) | (w8 << 31)) & mask; + out[14] = (w8 >> 6) & mask; + out[15] = ((w8 >> 43) | (w9 << 21)) & mask; + out[16] = (w9 >> 16) & mask; + out[17] = ((w9 >> 53) | (w10 << 11)) & mask; + out[18] = (w10 >> 26) & mask; + out[19] = ((w10 >> 63) | (w11 << 1)) & mask; + out[20] = ((w11 >> 36) | (w12 << 28)) & mask; + out[21] = (w12 >> 9) & mask; + out[22] = ((w12 >> 46) | (w13 << 18)) & mask; + out[23] = (w13 >> 19) & mask; + out[24] = ((w13 >> 56) | (w14 << 8)) & mask; + out[25] = ((w14 >> 29) | (w15 << 35)) & mask; + out[26] = (w15 >> 2) & mask; + out[27] = ((w15 >> 39) | (w16 << 25)) & mask; + out[28] = (w16 >> 12) & mask; + out[29] = ((w16 >> 49) | (w17 << 15)) & mask; + out[30] = (w17 >> 22) & mask; + out[31] = ((w17 >> 59) | (w18 << 5)) & mask; + + return in; +} + +inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 274877906943ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = 
util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 38) | (w1 << 26)) & mask; + out[2] = (w1 >> 12) & mask; + out[3] = ((w1 >> 50) | (w2 << 14)) & mask; + out[4] = (w2 >> 24) & mask; + out[5] = ((w2 >> 62) | (w3 << 2)) & mask; + out[6] = ((w3 >> 36) | (w4 << 28)) & mask; + out[7] = (w4 >> 10) & mask; + out[8] = ((w4 >> 48) | (w5 << 16)) & mask; + out[9] = (w5 >> 22) & mask; + out[10] = ((w5 >> 60) | (w6 << 4)) & mask; + out[11] = ((w6 >> 34) | (w7 << 30)) & mask; + out[12] = (w7 >> 8) & mask; + out[13] = ((w7 >> 46) | (w8 << 18)) & mask; + out[14] = (w8 >> 20) & mask; + out[15] = ((w8 >> 58) | (w9 << 6)) & mask; + out[16] = ((w9 >> 32) | (w10 << 32)) & mask; + 
out[17] = (w10 >> 6) & mask; + out[18] = ((w10 >> 44) | (w11 << 20)) & mask; + out[19] = (w11 >> 18) & mask; + out[20] = ((w11 >> 56) | (w12 << 8)) & mask; + out[21] = ((w12 >> 30) | (w13 << 34)) & mask; + out[22] = (w13 >> 4) & mask; + out[23] = ((w13 >> 42) | (w14 << 22)) & mask; + out[24] = (w14 >> 16) & mask; + out[25] = ((w14 >> 54) | (w15 << 10)) & mask; + out[26] = ((w15 >> 28) | (w16 << 36)) & mask; + out[27] = (w16 >> 2) & mask; + out[28] = ((w16 >> 40) | (w17 << 24)) & mask; + out[29] = (w17 >> 14) & mask; + out[30] = ((w17 >> 52) | (w18 << 12)) & mask; + out[31] = w18 >> 26; + + return in; +} + +inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 549755813887ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = 
util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 39) | (w1 << 25)) & mask; + out[2] = (w1 >> 14) & mask; + out[3] = ((w1 >> 53) | (w2 << 11)) & mask; + out[4] = ((w2 >> 28) | (w3 << 36)) & mask; + out[5] = (w3 >> 3) & mask; + out[6] = ((w3 >> 42) | (w4 << 22)) & mask; + out[7] = (w4 >> 17) & mask; + out[8] = ((w4 >> 56) | (w5 << 8)) & mask; + out[9] = ((w5 >> 31) | (w6 << 33)) & mask; + out[10] = (w6 >> 6) & mask; + out[11] = ((w6 >> 45) | (w7 << 19)) & mask; + out[12] = (w7 >> 20) & mask; + out[13] = ((w7 >> 59) | (w8 << 5)) & mask; + out[14] = ((w8 >> 34) | (w9 << 30)) & mask; + out[15] = (w9 >> 9) & mask; + out[16] = ((w9 >> 48) | (w10 << 16)) & mask; + out[17] = (w10 >> 23) & mask; + out[18] = ((w10 >> 62) | (w11 << 2)) & mask; + out[19] = ((w11 >> 37) | (w12 << 27)) & mask; + out[20] = (w12 >> 12) & mask; + out[21] = ((w12 >> 51) | (w13 << 13)) & mask; + out[22] = ((w13 >> 26) | (w14 << 38)) & mask; + out[23] = (w14 >> 1) & mask; + out[24] = ((w14 >> 40) | (w15 << 24)) & mask; + out[25] = (w15 >> 15) & mask; + out[26] = ((w15 >> 54) | (w16 << 10)) & mask; + out[27] = ((w16 >> 29) | (w17 << 35)) & mask; + out[28] = (w17 >> 4) & mask; + out[29] = ((w17 >> 43) | (w18 << 21)) & mask; + out[30] = (w18 >> 18) & mask; + out[31] = ((w18 >> 57) | (w19 << 7)) & mask; + + return in; +} + +inline const uint8_t* 
unpack40_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 1099511627775ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = 
arrow::bit_util::FromLittleEndian(w19); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 40) | (w1 << 24)) & mask; + out[2] = (w1 >> 16) & mask; + out[3] = ((w1 >> 56) | (w2 << 8)) & mask; + out[4] = ((w2 >> 32) | (w3 << 32)) & mask; + out[5] = (w3 >> 8) & mask; + out[6] = ((w3 >> 48) | (w4 << 16)) & mask; + out[7] = w4 >> 24; + out[8] = (w5)&mask; + out[9] = ((w5 >> 40) | (w6 << 24)) & mask; + out[10] = (w6 >> 16) & mask; + out[11] = ((w6 >> 56) | (w7 << 8)) & mask; + out[12] = ((w7 >> 32) | (w8 << 32)) & mask; + out[13] = (w8 >> 8) & mask; + out[14] = ((w8 >> 48) | (w9 << 16)) & mask; + out[15] = w9 >> 24; + out[16] = (w10)&mask; + out[17] = ((w10 >> 40) | (w11 << 24)) & mask; + out[18] = (w11 >> 16) & mask; + out[19] = ((w11 >> 56) | (w12 << 8)) & mask; + out[20] = ((w12 >> 32) | (w13 << 32)) & mask; + out[21] = (w13 >> 8) & mask; + out[22] = ((w13 >> 48) | (w14 << 16)) & mask; + out[23] = w14 >> 24; + out[24] = (w15)&mask; + out[25] = ((w15 >> 40) | (w16 << 24)) & mask; + out[26] = (w16 >> 16) & mask; + out[27] = ((w16 >> 56) | (w17 << 8)) & mask; + out[28] = ((w17 >> 32) | (w18 << 32)) & mask; + out[29] = (w18 >> 8) & mask; + out[30] = ((w18 >> 48) | (w19 << 16)) & mask; + out[31] = w19 >> 24; + + return in; +} + +inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 2199023255551ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = 
arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 41) | (w1 << 23)) & mask; + out[2] = (w1 >> 18) & mask; + out[3] = ((w1 >> 59) | (w2 << 5)) & mask; + out[4] = ((w2 >> 36) | (w3 << 28)) & mask; + out[5] = (w3 >> 13) & mask; + out[6] = ((w3 >> 54) | (w4 << 10)) & mask; + out[7] = ((w4 >> 31) | (w5 << 33)) & mask; + out[8] = (w5 >> 8) & mask; + out[9] = ((w5 >> 49) | (w6 << 15)) & mask; + out[10] = ((w6 >> 26) | (w7 << 38)) & mask; + out[11] = (w7 >> 3) & mask; + out[12] = ((w7 >> 44) | (w8 << 20)) & mask; + out[13] = (w8 >> 21) & mask; + out[14] = ((w8 >> 62) | (w9 << 2)) & mask; + 
out[15] = ((w9 >> 39) | (w10 << 25)) & mask; + out[16] = (w10 >> 16) & mask; + out[17] = ((w10 >> 57) | (w11 << 7)) & mask; + out[18] = ((w11 >> 34) | (w12 << 30)) & mask; + out[19] = (w12 >> 11) & mask; + out[20] = ((w12 >> 52) | (w13 << 12)) & mask; + out[21] = ((w13 >> 29) | (w14 << 35)) & mask; + out[22] = (w14 >> 6) & mask; + out[23] = ((w14 >> 47) | (w15 << 17)) & mask; + out[24] = ((w15 >> 24) | (w16 << 40)) & mask; + out[25] = (w16 >> 1) & mask; + out[26] = ((w16 >> 42) | (w17 << 22)) & mask; + out[27] = (w17 >> 19) & mask; + out[28] = ((w17 >> 60) | (w18 << 4)) & mask; + out[29] = ((w18 >> 37) | (w19 << 27)) & mask; + out[30] = (w19 >> 14) & mask; + out[31] = ((w19 >> 55) | (w20 << 9)) & mask; + + return in; +} + +inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 4398046511103ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + 
uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 42) | (w1 << 22)) & mask; + out[2] = (w1 >> 20) & mask; + out[3] = ((w1 >> 62) | (w2 << 2)) & mask; + out[4] = ((w2 >> 40) | (w3 << 24)) & mask; + out[5] = (w3 >> 18) & mask; + out[6] = ((w3 >> 60) | (w4 << 4)) & mask; + out[7] = ((w4 >> 38) | (w5 << 26)) & mask; + out[8] = (w5 >> 16) & mask; + out[9] = ((w5 >> 58) | (w6 << 6)) & mask; + out[10] = ((w6 >> 36) | (w7 << 28)) & mask; + out[11] = (w7 >> 14) & mask; + out[12] = ((w7 >> 56) | (w8 << 8)) & mask; + out[13] = ((w8 >> 34) | (w9 << 30)) & mask; + out[14] = (w9 >> 12) & mask; + out[15] = ((w9 >> 54) | (w10 << 10)) & mask; + out[16] = ((w10 >> 32) | (w11 << 32)) & mask; + out[17] = (w11 >> 10) & mask; + out[18] = ((w11 >> 52) | (w12 << 12)) & mask; + out[19] = ((w12 >> 30) | (w13 << 34)) & mask; + out[20] = (w13 >> 8) & mask; + out[21] = ((w13 >> 50) | (w14 << 14)) & mask; + out[22] = ((w14 >> 28) | (w15 << 36)) & mask; + out[23] = (w15 >> 6) & mask; + out[24] = ((w15 >> 48) | (w16 << 16)) & mask; + out[25] = ((w16 >> 26) | (w17 << 38)) & mask; + out[26] = (w17 >> 4) & mask; + out[27] = ((w17 >> 46) | (w18 
<< 18)) & mask; + out[28] = ((w18 >> 24) | (w19 << 40)) & mask; + out[29] = (w19 >> 2) & mask; + out[30] = ((w19 >> 44) | (w20 << 20)) & mask; + out[31] = w20 >> 22; + + return in; +} + +inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 8796093022207ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = 
arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 43) | (w1 << 21)) & mask; + out[2] = ((w1 >> 22) | (w2 << 42)) & mask; + out[3] = (w2 >> 1) & mask; + out[4] = ((w2 >> 44) | (w3 << 20)) & mask; + out[5] = ((w3 >> 23) | (w4 << 41)) & mask; + out[6] = (w4 >> 2) & mask; + out[7] = ((w4 >> 45) | (w5 << 19)) & mask; + out[8] = ((w5 >> 24) | (w6 << 40)) & mask; + out[9] = (w6 >> 3) & mask; + out[10] = ((w6 >> 46) | (w7 << 18)) & mask; + out[11] = ((w7 >> 25) | (w8 << 39)) & mask; + out[12] = (w8 >> 4) & mask; + out[13] = ((w8 >> 47) | (w9 << 17)) & mask; + out[14] = ((w9 >> 26) | (w10 << 38)) & mask; + out[15] = (w10 >> 5) & mask; + out[16] = ((w10 >> 48) | (w11 << 16)) & mask; + out[17] = ((w11 >> 27) | (w12 << 37)) & mask; + out[18] = (w12 >> 6) & mask; + out[19] = ((w12 >> 49) | (w13 << 15)) & mask; + out[20] = ((w13 >> 28) | (w14 << 36)) & mask; + out[21] = (w14 >> 7) & mask; + out[22] = ((w14 >> 50) | (w15 << 14)) & mask; + out[23] = ((w15 >> 29) | (w16 << 35)) & mask; + out[24] = (w16 >> 8) & mask; + out[25] = ((w16 >> 51) | (w17 << 13)) & mask; + out[26] = ((w17 >> 30) | (w18 << 34)) & mask; + out[27] = (w18 >> 9) & mask; + out[28] = ((w18 >> 52) | (w19 << 12)) & mask; + out[29] = ((w19 >> 31) | (w20 << 33)) & mask; + out[30] = (w20 >> 10) & mask; + out[31] = ((w20 >> 53) | (w21 << 11)) & mask; + + return in; +} + +inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 17592186044415ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = 
util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = 
arrow::bit_util::FromLittleEndian(w21); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 44) | (w1 << 20)) & mask; + out[2] = ((w1 >> 24) | (w2 << 40)) & mask; + out[3] = (w2 >> 4) & mask; + out[4] = ((w2 >> 48) | (w3 << 16)) & mask; + out[5] = ((w3 >> 28) | (w4 << 36)) & mask; + out[6] = (w4 >> 8) & mask; + out[7] = ((w4 >> 52) | (w5 << 12)) & mask; + out[8] = ((w5 >> 32) | (w6 << 32)) & mask; + out[9] = (w6 >> 12) & mask; + out[10] = ((w6 >> 56) | (w7 << 8)) & mask; + out[11] = ((w7 >> 36) | (w8 << 28)) & mask; + out[12] = (w8 >> 16) & mask; + out[13] = ((w8 >> 60) | (w9 << 4)) & mask; + out[14] = ((w9 >> 40) | (w10 << 24)) & mask; + out[15] = w10 >> 20; + out[16] = (w11)&mask; + out[17] = ((w11 >> 44) | (w12 << 20)) & mask; + out[18] = ((w12 >> 24) | (w13 << 40)) & mask; + out[19] = (w13 >> 4) & mask; + out[20] = ((w13 >> 48) | (w14 << 16)) & mask; + out[21] = ((w14 >> 28) | (w15 << 36)) & mask; + out[22] = (w15 >> 8) & mask; + out[23] = ((w15 >> 52) | (w16 << 12)) & mask; + out[24] = ((w16 >> 32) | (w17 << 32)) & mask; + out[25] = (w17 >> 12) & mask; + out[26] = ((w17 >> 56) | (w18 << 8)) & mask; + out[27] = ((w18 >> 36) | (w19 << 28)) & mask; + out[28] = (w19 >> 16) & mask; + out[29] = ((w19 >> 60) | (w20 << 4)) & mask; + out[30] = ((w20 >> 40) | (w21 << 24)) & mask; + out[31] = w21 >> 20; + + return in; +} + +inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 35184372088831ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = 
arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 45) | (w1 << 19)) & mask; + out[2] = ((w1 >> 26) | (w2 << 38)) & mask; + out[3] = (w2 >> 7) & mask; + out[4] = ((w2 >> 52) | (w3 << 12)) & mask; + out[5] = ((w3 >> 33) | (w4 << 31)) & mask; + out[6] = (w4 >> 14) & mask; + out[7] = ((w4 >> 59) | (w5 << 
5)) & mask; + out[8] = ((w5 >> 40) | (w6 << 24)) & mask; + out[9] = ((w6 >> 21) | (w7 << 43)) & mask; + out[10] = (w7 >> 2) & mask; + out[11] = ((w7 >> 47) | (w8 << 17)) & mask; + out[12] = ((w8 >> 28) | (w9 << 36)) & mask; + out[13] = (w9 >> 9) & mask; + out[14] = ((w9 >> 54) | (w10 << 10)) & mask; + out[15] = ((w10 >> 35) | (w11 << 29)) & mask; + out[16] = (w11 >> 16) & mask; + out[17] = ((w11 >> 61) | (w12 << 3)) & mask; + out[18] = ((w12 >> 42) | (w13 << 22)) & mask; + out[19] = ((w13 >> 23) | (w14 << 41)) & mask; + out[20] = (w14 >> 4) & mask; + out[21] = ((w14 >> 49) | (w15 << 15)) & mask; + out[22] = ((w15 >> 30) | (w16 << 34)) & mask; + out[23] = (w16 >> 11) & mask; + out[24] = ((w16 >> 56) | (w17 << 8)) & mask; + out[25] = ((w17 >> 37) | (w18 << 27)) & mask; + out[26] = (w18 >> 18) & mask; + out[27] = ((w18 >> 63) | (w19 << 1)) & mask; + out[28] = ((w19 >> 44) | (w20 << 20)) & mask; + out[29] = ((w20 >> 25) | (w21 << 39)) & mask; + out[30] = (w21 >> 6) & mask; + out[31] = ((w21 >> 51) | (w22 << 13)) & mask; + + return in; +} + +inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 70368744177663ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = 
arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 46) | (w1 << 18)) & mask; + out[2] = ((w1 >> 28) | (w2 << 36)) & mask; + out[3] = (w2 >> 10) & mask; + out[4] = ((w2 >> 56) | (w3 << 8)) & mask; + out[5] = ((w3 >> 38) | (w4 << 26)) & mask; + out[6] = ((w4 >> 20) | (w5 << 44)) & mask; + out[7] = (w5 >> 2) & mask; + out[8] = ((w5 >> 48) | (w6 << 16)) & mask; + out[9] = ((w6 >> 30) | (w7 << 34)) & mask; + out[10] = (w7 >> 12) & mask; + out[11] = ((w7 >> 58) | (w8 << 6)) & mask; + out[12] = ((w8 >> 40) | (w9 << 24)) & mask; + out[13] = ((w9 >> 22) | (w10 << 42)) & mask; + out[14] = (w10 
>> 4) & mask; + out[15] = ((w10 >> 50) | (w11 << 14)) & mask; + out[16] = ((w11 >> 32) | (w12 << 32)) & mask; + out[17] = (w12 >> 14) & mask; + out[18] = ((w12 >> 60) | (w13 << 4)) & mask; + out[19] = ((w13 >> 42) | (w14 << 22)) & mask; + out[20] = ((w14 >> 24) | (w15 << 40)) & mask; + out[21] = (w15 >> 6) & mask; + out[22] = ((w15 >> 52) | (w16 << 12)) & mask; + out[23] = ((w16 >> 34) | (w17 << 30)) & mask; + out[24] = (w17 >> 16) & mask; + out[25] = ((w17 >> 62) | (w18 << 2)) & mask; + out[26] = ((w18 >> 44) | (w19 << 20)) & mask; + out[27] = ((w19 >> 26) | (w20 << 38)) & mask; + out[28] = (w20 >> 8) & mask; + out[29] = ((w20 >> 54) | (w21 << 10)) & mask; + out[30] = ((w21 >> 36) | (w22 << 28)) & mask; + out[31] = w22 >> 18; + + return in; +} + +inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 140737488355327ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = 
arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 47) | (w1 << 17)) & mask; + out[2] = ((w1 >> 30) | (w2 << 34)) & mask; + out[3] = (w2 >> 13) & mask; + out[4] = ((w2 >> 60) | (w3 << 4)) & mask; + out[5] = ((w3 >> 43) | (w4 << 21)) & mask; + out[6] = ((w4 >> 26) | (w5 << 38)) & mask; + out[7] = (w5 >> 9) & mask; + out[8] = ((w5 >> 56) | (w6 << 8)) & mask; + out[9] = ((w6 >> 39) | (w7 << 25)) & mask; + out[10] = ((w7 >> 22) | (w8 << 42)) & mask; + out[11] = (w8 >> 5) & mask; + out[12] = ((w8 >> 52) | (w9 << 12)) & mask; + out[13] = ((w9 >> 35) | (w10 << 29)) & mask; + out[14] = ((w10 >> 18) | (w11 << 46)) & mask; + out[15] = (w11 >> 1) & mask; + out[16] = ((w11 >> 48) | (w12 << 16)) & mask; + out[17] = ((w12 >> 31) | (w13 << 33)) & mask; + out[18] = (w13 >> 14) & mask; + 
out[19] = ((w13 >> 61) | (w14 << 3)) & mask; + out[20] = ((w14 >> 44) | (w15 << 20)) & mask; + out[21] = ((w15 >> 27) | (w16 << 37)) & mask; + out[22] = (w16 >> 10) & mask; + out[23] = ((w16 >> 57) | (w17 << 7)) & mask; + out[24] = ((w17 >> 40) | (w18 << 24)) & mask; + out[25] = ((w18 >> 23) | (w19 << 41)) & mask; + out[26] = (w19 >> 6) & mask; + out[27] = ((w19 >> 53) | (w20 << 11)) & mask; + out[28] = ((w20 >> 36) | (w21 << 28)) & mask; + out[29] = ((w21 >> 19) | (w22 << 45)) & mask; + out[30] = (w22 >> 2) & mask; + out[31] = ((w22 >> 49) | (w23 << 15)) & mask; + + return in; +} + +inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 281474976710655ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = 
arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 48) | (w1 << 16)) & mask; + out[2] = ((w1 >> 32) | (w2 << 32)) & mask; + out[3] = w2 >> 16; + out[4] = (w3)&mask; + out[5] = ((w3 >> 48) | (w4 << 16)) & mask; + out[6] = ((w4 >> 32) | (w5 << 32)) & mask; + out[7] = w5 >> 16; + out[8] = (w6)&mask; + out[9] = ((w6 >> 48) | (w7 << 16)) & mask; + out[10] = ((w7 >> 32) | (w8 << 32)) & mask; + out[11] = w8 >> 16; + out[12] = (w9)&mask; + out[13] = ((w9 >> 48) | (w10 << 16)) & mask; + out[14] = ((w10 >> 32) | (w11 << 32)) & mask; + out[15] = w11 >> 16; + out[16] = (w12)&mask; + out[17] = ((w12 >> 48) | (w13 << 16)) & mask; + out[18] = ((w13 >> 32) | (w14 << 32)) & mask; + out[19] = w14 >> 16; + out[20] = (w15)&mask; + out[21] = ((w15 >> 48) | (w16 << 16)) & mask; + out[22] = ((w16 >> 32) | (w17 << 32)) & mask; + out[23] = w17 >> 16; + out[24] = (w18)&mask; + out[25] = ((w18 >> 48) | (w19 << 16)) & mask; + out[26] = ((w19 >> 32) | (w20 << 32)) & mask; + out[27] = w20 >> 
16; + out[28] = (w21)&mask; + out[29] = ((w21 >> 48) | (w22 << 16)) & mask; + out[30] = ((w22 >> 32) | (w23 << 32)) & mask; + out[31] = w23 >> 16; + + return in; +} + +inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 562949953421311ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = 
arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 49) | (w1 << 15)) & mask; + out[2] = ((w1 >> 34) | (w2 << 30)) & mask; + out[3] = ((w2 >> 19) | (w3 << 45)) & mask; + out[4] = (w3 >> 4) & mask; + out[5] = ((w3 >> 53) | (w4 << 11)) & mask; + out[6] = ((w4 >> 38) | (w5 << 26)) & mask; + out[7] = ((w5 >> 23) | (w6 << 41)) & mask; + out[8] = (w6 >> 8) & mask; + out[9] = ((w6 >> 57) | (w7 << 7)) & mask; + out[10] = ((w7 >> 42) | (w8 << 22)) & mask; + out[11] = ((w8 >> 27) | (w9 << 37)) & mask; + out[12] = (w9 >> 12) & mask; + out[13] = ((w9 >> 61) | (w10 << 3)) & mask; + out[14] = ((w10 >> 46) | (w11 << 18)) & mask; + out[15] = ((w11 >> 31) | (w12 << 33)) & mask; + out[16] = ((w12 >> 16) | (w13 << 48)) & mask; + out[17] = (w13 >> 1) & mask; + out[18] = ((w13 >> 50) | (w14 << 14)) & mask; + out[19] = ((w14 >> 35) | (w15 << 29)) & mask; + out[20] = ((w15 >> 20) | (w16 << 44)) & mask; + out[21] = (w16 >> 5) & mask; + out[22] = ((w16 >> 54) | (w17 << 10)) & mask; + out[23] = ((w17 >> 39) | (w18 << 25)) & mask; + out[24] = ((w18 >> 24) | (w19 << 40)) & mask; + out[25] = (w19 >> 9) & mask; + out[26] = ((w19 >> 58) | (w20 << 6)) & mask; + out[27] = ((w20 >> 43) | (w21 << 21)) & mask; + out[28] = ((w21 >> 28) | (w22 << 36)) & mask; + out[29] = (w22 >> 13) & mask; 
+ out[30] = ((w22 >> 62) | (w23 << 2)) & mask; + out[31] = ((w23 >> 47) | (w24 << 17)) & mask; + + return in; +} + +inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 1125899906842623ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = 
util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 50) | (w1 << 14)) & mask; + out[2] = ((w1 >> 36) | (w2 << 28)) & mask; + out[3] = ((w2 >> 22) | (w3 << 42)) & mask; + out[4] = (w3 >> 8) & mask; + out[5] = ((w3 >> 58) | (w4 << 6)) & mask; + out[6] = ((w4 >> 44) | (w5 << 20)) & mask; + out[7] = ((w5 >> 30) | (w6 << 34)) & mask; + out[8] = ((w6 >> 16) | (w7 << 48)) & mask; + out[9] = (w7 >> 2) & mask; + out[10] = ((w7 >> 52) | (w8 << 12)) & mask; + out[11] = ((w8 >> 38) | (w9 << 26)) & mask; + out[12] = ((w9 >> 24) | (w10 << 40)) & mask; + out[13] = (w10 >> 10) & mask; + out[14] = ((w10 >> 60) | (w11 << 4)) & mask; + out[15] = ((w11 >> 46) | (w12 << 18)) & mask; + out[16] = ((w12 >> 32) | (w13 << 32)) & mask; + out[17] = ((w13 >> 18) | (w14 << 46)) & mask; + out[18] = (w14 >> 4) & mask; + out[19] = ((w14 >> 54) | (w15 << 10)) & mask; + out[20] = ((w15 >> 40) | (w16 << 24)) & mask; + out[21] = ((w16 >> 26) | (w17 << 38)) & mask; + out[22] = (w17 >> 12) & mask; + out[23] = ((w17 >> 62) | (w18 << 2)) & mask; + out[24] = ((w18 >> 48) | (w19 << 16)) & mask; + out[25] = ((w19 >> 34) | (w20 << 30)) & mask; + out[26] = ((w20 >> 20) | (w21 << 44)) & mask; + out[27] = (w21 >> 6) & mask; + out[28] = ((w21 >> 56) | (w22 << 8)) & mask; + out[29] = ((w22 >> 42) | (w23 << 22)) & mask; + out[30] = ((w23 >> 28) | (w24 << 36)) & mask; + 
out[31] = w24 >> 14; + + return in; +} + +inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 2251799813685247ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t 
w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 51) | (w1 << 13)) & mask; + out[2] = ((w1 >> 38) | (w2 << 26)) & mask; + out[3] = ((w2 >> 25) | (w3 << 39)) & mask; + out[4] = (w3 >> 12) & mask; + out[5] = ((w3 >> 63) | (w4 << 1)) & mask; + out[6] = ((w4 >> 50) | (w5 << 14)) & mask; + out[7] = ((w5 >> 37) | (w6 << 27)) & mask; + out[8] = ((w6 >> 24) | (w7 << 40)) & mask; + out[9] = (w7 >> 11) & mask; + out[10] = ((w7 >> 62) | (w8 << 2)) & mask; + out[11] = ((w8 >> 49) | (w9 << 15)) & mask; + out[12] = ((w9 >> 36) | (w10 << 28)) & mask; + out[13] = ((w10 >> 23) | (w11 << 41)) & mask; + out[14] = (w11 >> 10) & mask; + out[15] = ((w11 >> 61) | (w12 << 3)) & mask; + out[16] = ((w12 >> 48) | (w13 << 16)) & mask; + out[17] = ((w13 >> 35) | (w14 << 29)) & mask; + out[18] = ((w14 >> 22) | (w15 << 42)) & mask; + out[19] = (w15 >> 9) & mask; + out[20] = ((w15 >> 60) | (w16 << 4)) & mask; + out[21] = ((w16 >> 47) | (w17 << 17)) & mask; + out[22] = ((w17 >> 34) | (w18 << 30)) & mask; + out[23] = ((w18 >> 21) | (w19 << 43)) & mask; + out[24] = (w19 >> 8) & mask; + out[25] = ((w19 >> 59) | (w20 << 5)) & mask; + out[26] = ((w20 >> 46) | (w21 << 18)) & mask; + out[27] = ((w21 >> 33) | (w22 << 31)) & mask; + out[28] = ((w22 >> 20) | (w23 << 44)) & mask; + out[29] = (w23 >> 7) & mask; + out[30] = ((w23 >> 58) | (w24 << 6)) & 
mask; + out[31] = ((w24 >> 45) | (w25 << 19)) & mask; + + return in; +} + +inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 4503599627370495ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = 
arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 52) | (w1 << 12)) & mask; + out[2] = ((w1 >> 40) | (w2 << 24)) & mask; + out[3] = ((w2 >> 28) | (w3 << 36)) & mask; + out[4] = ((w3 >> 16) | (w4 << 48)) & mask; + out[5] = (w4 >> 4) & mask; + out[6] = ((w4 >> 56) | (w5 << 8)) & mask; + out[7] = ((w5 >> 44) | (w6 << 20)) & mask; + out[8] = ((w6 >> 32) | (w7 << 32)) & mask; + out[9] = ((w7 >> 20) | (w8 << 44)) & mask; + out[10] = (w8 >> 8) & mask; + out[11] = ((w8 >> 60) | (w9 << 4)) & mask; + out[12] = ((w9 >> 48) | (w10 << 16)) & mask; + out[13] = ((w10 >> 36) | (w11 << 28)) & mask; + out[14] = ((w11 >> 24) | (w12 << 40)) & mask; + out[15] = w12 >> 12; + out[16] = (w13)&mask; + out[17] = ((w13 >> 52) | (w14 << 12)) & mask; + out[18] = ((w14 >> 40) | (w15 << 24)) & mask; + out[19] = ((w15 >> 28) | (w16 << 36)) & mask; + out[20] = ((w16 >> 16) | (w17 << 48)) & mask; + out[21] = (w17 >> 4) & mask; + out[22] = ((w17 >> 56) | (w18 << 8)) & mask; + out[23] = ((w18 >> 44) | (w19 << 20)) & mask; + out[24] = ((w19 >> 32) | (w20 << 32)) & mask; + out[25] = ((w20 >> 20) | (w21 << 44)) & mask; + out[26] = (w21 >> 8) & mask; + out[27] = ((w21 >> 60) | (w22 << 4)) & mask; + out[28] = ((w22 >> 48) | (w23 << 16)) & mask; + out[29] = ((w23 >> 36) | (w24 << 28)) & mask; 
+ out[30] = ((w24 >> 24) | (w25 << 40)) & mask; + out[31] = w25 >> 12; + + return in; +} + +inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 9007199254740991ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = 
arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 53) | (w1 << 11)) & mask; + out[2] = ((w1 >> 42) | (w2 << 22)) & mask; + out[3] = ((w2 >> 31) | (w3 << 33)) & mask; + out[4] = ((w3 >> 20) | (w4 << 44)) & mask; + out[5] = (w4 >> 9) & mask; + out[6] = ((w4 >> 62) | (w5 << 2)) & mask; + out[7] = ((w5 >> 51) | (w6 << 13)) & mask; + out[8] = ((w6 >> 40) | (w7 << 24)) & mask; + out[9] = ((w7 >> 29) | (w8 << 35)) & mask; + out[10] = ((w8 >> 18) | (w9 << 46)) & mask; + out[11] = (w9 >> 7) & mask; + out[12] = ((w9 >> 60) | (w10 << 4)) & mask; + out[13] = ((w10 >> 49) | (w11 << 15)) & mask; + out[14] = ((w11 >> 38) | (w12 << 26)) & mask; + out[15] = ((w12 >> 27) | (w13 << 37)) & mask; + out[16] = ((w13 >> 16) | (w14 << 48)) & mask; + out[17] = (w14 >> 5) & mask; + out[18] = ((w14 >> 58) | (w15 << 6)) & mask; + out[19] = ((w15 >> 47) | (w16 << 17)) & mask; + out[20] = ((w16 >> 36) | (w17 << 28)) & mask; + out[21] = ((w17 >> 25) | (w18 << 39)) & mask; + out[22] = ((w18 >> 14) | (w19 << 50)) & mask; + out[23] = (w19 >> 3) & mask; + out[24] = ((w19 >> 56) | (w20 << 8)) & mask; + out[25] = ((w20 >> 45) | (w21 << 19)) & mask; + out[26] = ((w21 >> 34) | (w22 << 30)) & 
mask; + out[27] = ((w22 >> 23) | (w23 << 41)) & mask; + out[28] = ((w23 >> 12) | (w24 << 52)) & mask; + out[29] = (w24 >> 1) & mask; + out[30] = ((w24 >> 54) | (w25 << 10)) & mask; + out[31] = ((w25 >> 43) | (w26 << 21)) & mask; + + return in; +} + +inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 18014398509481983ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 
8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 54) | (w1 << 10)) & mask; + out[2] = ((w1 >> 44) | (w2 << 20)) & mask; + out[3] = ((w2 >> 34) | (w3 << 30)) & mask; + out[4] = ((w3 >> 24) | (w4 << 40)) & mask; + out[5] = ((w4 >> 14) | (w5 << 50)) & mask; + out[6] = (w5 >> 4) & mask; + out[7] = ((w5 >> 58) | (w6 << 6)) & mask; + out[8] = ((w6 >> 48) | (w7 << 16)) & mask; + out[9] = ((w7 >> 38) | (w8 << 26)) & mask; + out[10] = ((w8 >> 28) | (w9 << 36)) & mask; + out[11] = ((w9 >> 18) | (w10 << 46)) & mask; + out[12] = (w10 >> 8) & mask; + out[13] = ((w10 >> 62) | (w11 << 2)) & mask; + out[14] = ((w11 >> 52) | (w12 << 12)) & mask; + out[15] = ((w12 >> 42) | (w13 << 22)) & mask; + out[16] = ((w13 >> 32) | (w14 << 32)) & mask; + out[17] = ((w14 >> 22) | (w15 << 42)) & mask; + out[18] = ((w15 >> 12) | (w16 << 52)) & mask; + out[19] = (w16 >> 2) & mask; + out[20] = ((w16 >> 56) | (w17 << 8)) & mask; + out[21] = ((w17 >> 46) | (w18 << 18)) & mask; + out[22] = ((w18 >> 36) | (w19 << 28)) & mask; + out[23] = ((w19 >> 
26) | (w20 << 38)) & mask; + out[24] = ((w20 >> 16) | (w21 << 48)) & mask; + out[25] = (w21 >> 6) & mask; + out[26] = ((w21 >> 60) | (w22 << 4)) & mask; + out[27] = ((w22 >> 50) | (w23 << 14)) & mask; + out[28] = ((w23 >> 40) | (w24 << 24)) & mask; + out[29] = ((w24 >> 30) | (w25 << 34)) & mask; + out[30] = ((w25 >> 20) | (w26 << 44)) & mask; + out[31] = w26 >> 10; + + return in; +} + +inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 36028797018963967ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = 
arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 55) | (w1 << 9)) & mask; + out[2] = ((w1 >> 46) | (w2 << 18)) & mask; + out[3] = ((w2 >> 37) | (w3 << 27)) & mask; + out[4] = ((w3 >> 28) | (w4 << 36)) & mask; + out[5] = ((w4 >> 19) | (w5 << 45)) & mask; + out[6] = ((w5 >> 10) | (w6 << 54)) & mask; + out[7] = (w6 >> 1) & mask; + out[8] = ((w6 >> 56) | (w7 << 8)) & mask; + out[9] = ((w7 >> 47) | (w8 << 17)) & mask; + out[10] = ((w8 >> 38) | (w9 << 26)) & mask; + out[11] = ((w9 >> 29) | (w10 << 35)) & mask; + out[12] = ((w10 >> 20) | (w11 << 44)) & mask; + out[13] = ((w11 >> 11) | (w12 << 53)) & mask; + out[14] = (w12 >> 2) & mask; + out[15] = ((w12 >> 57) | (w13 << 7)) & mask; + out[16] = ((w13 >> 48) | (w14 << 16)) & mask; + out[17] = ((w14 >> 39) | (w15 << 25)) & mask; 
+ out[18] = ((w15 >> 30) | (w16 << 34)) & mask; + out[19] = ((w16 >> 21) | (w17 << 43)) & mask; + out[20] = ((w17 >> 12) | (w18 << 52)) & mask; + out[21] = (w18 >> 3) & mask; + out[22] = ((w18 >> 58) | (w19 << 6)) & mask; + out[23] = ((w19 >> 49) | (w20 << 15)) & mask; + out[24] = ((w20 >> 40) | (w21 << 24)) & mask; + out[25] = ((w21 >> 31) | (w22 << 33)) & mask; + out[26] = ((w22 >> 22) | (w23 << 42)) & mask; + out[27] = ((w23 >> 13) | (w24 << 51)) & mask; + out[28] = (w24 >> 4) & mask; + out[29] = ((w24 >> 59) | (w25 << 5)) & mask; + out[30] = ((w25 >> 50) | (w26 << 14)) & mask; + out[31] = ((w26 >> 41) | (w27 << 23)) & mask; + + return in; +} + +inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 72057594037927935ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = 
arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 56) | (w1 << 8)) & mask; + out[2] = ((w1 >> 48) | (w2 << 16)) & mask; + out[3] = ((w2 >> 40) | (w3 << 24)) & mask; + out[4] = ((w3 >> 32) | (w4 << 32)) & mask; + out[5] = ((w4 >> 24) | (w5 << 40)) & mask; + out[6] = ((w5 >> 16) | (w6 << 48)) & mask; + out[7] = w6 >> 8; + out[8] = (w7)&mask; + out[9] = ((w7 >> 56) | (w8 << 8)) & mask; + out[10] = ((w8 >> 48) | (w9 << 16)) & mask; + out[11] = ((w9 >> 40) | (w10 << 24)) & mask; + 
out[12] = ((w10 >> 32) | (w11 << 32)) & mask; + out[13] = ((w11 >> 24) | (w12 << 40)) & mask; + out[14] = ((w12 >> 16) | (w13 << 48)) & mask; + out[15] = w13 >> 8; + out[16] = (w14)&mask; + out[17] = ((w14 >> 56) | (w15 << 8)) & mask; + out[18] = ((w15 >> 48) | (w16 << 16)) & mask; + out[19] = ((w16 >> 40) | (w17 << 24)) & mask; + out[20] = ((w17 >> 32) | (w18 << 32)) & mask; + out[21] = ((w18 >> 24) | (w19 << 40)) & mask; + out[22] = ((w19 >> 16) | (w20 << 48)) & mask; + out[23] = w20 >> 8; + out[24] = (w21)&mask; + out[25] = ((w21 >> 56) | (w22 << 8)) & mask; + out[26] = ((w22 >> 48) | (w23 << 16)) & mask; + out[27] = ((w23 >> 40) | (w24 << 24)) & mask; + out[28] = ((w24 >> 32) | (w25 << 32)) & mask; + out[29] = ((w25 >> 24) | (w26 << 40)) & mask; + out[30] = ((w26 >> 16) | (w27 << 48)) & mask; + out[31] = w27 >> 8; + + return in; +} + +inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 144115188075855871ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = 
arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 8; + uint64_t w28 = util::SafeLoadAs(in); + w28 = arrow::bit_util::FromLittleEndian(w28); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 57) | (w1 << 7)) & mask; + out[2] = ((w1 >> 50) | (w2 << 14)) & mask; + out[3] = ((w2 >> 43) | (w3 << 21)) & mask; + out[4] = ((w3 >> 36) | (w4 
<< 28)) & mask; + out[5] = ((w4 >> 29) | (w5 << 35)) & mask; + out[6] = ((w5 >> 22) | (w6 << 42)) & mask; + out[7] = ((w6 >> 15) | (w7 << 49)) & mask; + out[8] = ((w7 >> 8) | (w8 << 56)) & mask; + out[9] = (w8 >> 1) & mask; + out[10] = ((w8 >> 58) | (w9 << 6)) & mask; + out[11] = ((w9 >> 51) | (w10 << 13)) & mask; + out[12] = ((w10 >> 44) | (w11 << 20)) & mask; + out[13] = ((w11 >> 37) | (w12 << 27)) & mask; + out[14] = ((w12 >> 30) | (w13 << 34)) & mask; + out[15] = ((w13 >> 23) | (w14 << 41)) & mask; + out[16] = ((w14 >> 16) | (w15 << 48)) & mask; + out[17] = ((w15 >> 9) | (w16 << 55)) & mask; + out[18] = (w16 >> 2) & mask; + out[19] = ((w16 >> 59) | (w17 << 5)) & mask; + out[20] = ((w17 >> 52) | (w18 << 12)) & mask; + out[21] = ((w18 >> 45) | (w19 << 19)) & mask; + out[22] = ((w19 >> 38) | (w20 << 26)) & mask; + out[23] = ((w20 >> 31) | (w21 << 33)) & mask; + out[24] = ((w21 >> 24) | (w22 << 40)) & mask; + out[25] = ((w22 >> 17) | (w23 << 47)) & mask; + out[26] = ((w23 >> 10) | (w24 << 54)) & mask; + out[27] = (w24 >> 3) & mask; + out[28] = ((w24 >> 60) | (w25 << 4)) & mask; + out[29] = ((w25 >> 53) | (w26 << 11)) & mask; + out[30] = ((w26 >> 46) | (w27 << 18)) & mask; + out[31] = ((w27 >> 39) | (w28 << 25)) & mask; + + return in; +} + +inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 288230376151711743ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = 
arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 
8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 8; + uint64_t w28 = util::SafeLoadAs(in); + w28 = arrow::bit_util::FromLittleEndian(w28); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 58) | (w1 << 6)) & mask; + out[2] = ((w1 >> 52) | (w2 << 12)) & mask; + out[3] = ((w2 >> 46) | (w3 << 18)) & mask; + out[4] = ((w3 >> 40) | (w4 << 24)) & mask; + out[5] = ((w4 >> 34) | (w5 << 30)) & mask; + out[6] = ((w5 >> 28) | (w6 << 36)) & mask; + out[7] = ((w6 >> 22) | (w7 << 42)) & mask; + out[8] = ((w7 >> 16) | (w8 << 48)) & mask; + out[9] = ((w8 >> 10) | (w9 << 54)) & mask; + out[10] = (w9 >> 4) & mask; + out[11] = ((w9 >> 62) | (w10 << 2)) & mask; + out[12] = ((w10 >> 56) | (w11 << 8)) & mask; + out[13] = ((w11 >> 50) | (w12 << 14)) & mask; + out[14] = ((w12 >> 44) | (w13 << 20)) & mask; + out[15] = ((w13 >> 38) | (w14 << 26)) & mask; + out[16] = ((w14 >> 32) | (w15 << 32)) & mask; + out[17] = ((w15 >> 26) | (w16 << 38)) & mask; + out[18] = ((w16 >> 20) | (w17 << 44)) & mask; + out[19] = ((w17 >> 14) | (w18 << 50)) & mask; + out[20] = ((w18 >> 8) | (w19 << 56)) & mask; + out[21] = (w19 >> 2) & mask; + out[22] = ((w19 >> 60) | (w20 << 4)) & mask; + out[23] = ((w20 >> 54) | (w21 << 10)) & mask; + out[24] = ((w21 >> 48) | (w22 << 16)) & mask; + out[25] = ((w22 >> 42) | (w23 << 22)) & mask; + out[26] = ((w23 >> 36) | (w24 << 28)) & mask; + out[27] = ((w24 >> 30) | (w25 << 34)) & mask; + out[28] = ((w25 >> 24) | (w26 << 40)) & mask; + out[29] = ((w26 >> 18) | (w27 << 46)) & mask; + out[30] = ((w27 >> 12) | (w28 << 52)) & mask; + out[31] = w28 >> 6; + + return in; +} + +inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 576460752303423487ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = 
arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t 
w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 8; + uint64_t w28 = util::SafeLoadAs(in); + w28 = arrow::bit_util::FromLittleEndian(w28); + in += 8; + uint64_t w29 = util::SafeLoadAs(in); + w29 = arrow::bit_util::FromLittleEndian(w29); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 59) | (w1 << 5)) & mask; + out[2] = ((w1 >> 54) | (w2 << 10)) & mask; + out[3] = ((w2 >> 49) | (w3 << 15)) & mask; + out[4] = ((w3 >> 44) | (w4 << 20)) & mask; + out[5] = ((w4 >> 39) | (w5 << 25)) & mask; + out[6] = ((w5 >> 34) | (w6 << 30)) & mask; + out[7] = ((w6 >> 29) | (w7 << 35)) & mask; + out[8] = ((w7 >> 24) | (w8 << 40)) & mask; + out[9] = ((w8 >> 19) | (w9 << 45)) & mask; + out[10] = ((w9 >> 14) | (w10 << 50)) & mask; + out[11] = ((w10 >> 9) | (w11 << 55)) & mask; + out[12] = (w11 >> 4) & mask; + out[13] = ((w11 >> 63) | (w12 << 1)) & mask; + out[14] = ((w12 >> 58) | (w13 << 6)) & mask; + out[15] = ((w13 >> 53) | (w14 << 11)) & mask; + out[16] = ((w14 >> 48) | (w15 << 16)) & mask; + out[17] = ((w15 >> 43) | (w16 << 21)) & mask; + out[18] = ((w16 >> 38) | (w17 << 26)) & mask; + out[19] = ((w17 >> 33) | (w18 << 31)) & mask; + out[20] = ((w18 >> 28) | (w19 << 36)) & mask; + out[21] = ((w19 >> 23) | (w20 << 41)) & mask; + out[22] = ((w20 >> 18) | (w21 << 46)) & mask; + out[23] = ((w21 >> 13) | (w22 << 51)) & mask; + out[24] = ((w22 >> 8) | (w23 << 56)) & mask; + out[25] = (w23 >> 3) & mask; + out[26] = ((w23 >> 62) | (w24 << 2)) & mask; + out[27] = ((w24 >> 57) | (w25 << 7)) & mask; + out[28] = ((w25 >> 52) | (w26 << 12)) & mask; + out[29] = ((w26 >> 47) | 
(w27 << 17)) & mask; + out[30] = ((w27 >> 42) | (w28 << 22)) & mask; + out[31] = ((w28 >> 37) | (w29 << 27)) & mask; + + return in; +} + +inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 1152921504606846975ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + 
uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 8; + uint64_t w28 = util::SafeLoadAs(in); + w28 = arrow::bit_util::FromLittleEndian(w28); + in += 8; + uint64_t w29 = util::SafeLoadAs(in); + w29 = arrow::bit_util::FromLittleEndian(w29); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 60) | (w1 << 4)) & mask; + out[2] = ((w1 >> 56) | (w2 << 8)) & mask; + out[3] = ((w2 >> 52) | (w3 << 12)) & mask; + out[4] = ((w3 >> 48) | (w4 << 16)) & mask; + out[5] = ((w4 >> 44) | (w5 << 20)) & mask; + out[6] = ((w5 >> 40) | (w6 << 24)) & mask; + out[7] = ((w6 >> 36) | (w7 << 28)) & mask; + out[8] = ((w7 >> 32) | (w8 << 32)) & mask; + out[9] = ((w8 >> 28) | (w9 << 36)) & mask; + out[10] = ((w9 >> 24) | (w10 << 40)) & mask; + out[11] = ((w10 >> 20) | (w11 << 44)) & mask; + out[12] = ((w11 >> 16) | (w12 << 48)) & mask; + out[13] = ((w12 >> 12) | (w13 << 52)) & mask; + out[14] = ((w13 >> 8) | (w14 << 56)) & mask; + out[15] = w14 >> 4; + out[16] = (w15)&mask; + out[17] = ((w15 >> 60) | (w16 << 4)) & mask; + out[18] = ((w16 >> 56) | (w17 << 8)) & mask; + out[19] = ((w17 >> 
52) | (w18 << 12)) & mask; + out[20] = ((w18 >> 48) | (w19 << 16)) & mask; + out[21] = ((w19 >> 44) | (w20 << 20)) & mask; + out[22] = ((w20 >> 40) | (w21 << 24)) & mask; + out[23] = ((w21 >> 36) | (w22 << 28)) & mask; + out[24] = ((w22 >> 32) | (w23 << 32)) & mask; + out[25] = ((w23 >> 28) | (w24 << 36)) & mask; + out[26] = ((w24 >> 24) | (w25 << 40)) & mask; + out[27] = ((w25 >> 20) | (w26 << 44)) & mask; + out[28] = ((w26 >> 16) | (w27 << 48)) & mask; + out[29] = ((w27 >> 12) | (w28 << 52)) & mask; + out[30] = ((w28 >> 8) | (w29 << 56)) & mask; + out[31] = w29 >> 4; + + return in; +} + +inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 2305843009213693951ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + 
w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 8; + uint64_t w28 = util::SafeLoadAs(in); + w28 = arrow::bit_util::FromLittleEndian(w28); + in += 8; + uint64_t w29 = util::SafeLoadAs(in); + w29 = arrow::bit_util::FromLittleEndian(w29); + in += 8; + uint64_t w30 = util::SafeLoadAs(in); + w30 = arrow::bit_util::FromLittleEndian(w30); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 61) | (w1 << 3)) & mask; + out[2] = ((w1 >> 58) | (w2 << 6)) & mask; + out[3] = ((w2 >> 55) | (w3 << 9)) & mask; + out[4] = ((w3 >> 52) | (w4 << 12)) & mask; + out[5] = ((w4 >> 49) | (w5 << 15)) & mask; + out[6] = ((w5 >> 46) | (w6 << 
18)) & mask; + out[7] = ((w6 >> 43) | (w7 << 21)) & mask; + out[8] = ((w7 >> 40) | (w8 << 24)) & mask; + out[9] = ((w8 >> 37) | (w9 << 27)) & mask; + out[10] = ((w9 >> 34) | (w10 << 30)) & mask; + out[11] = ((w10 >> 31) | (w11 << 33)) & mask; + out[12] = ((w11 >> 28) | (w12 << 36)) & mask; + out[13] = ((w12 >> 25) | (w13 << 39)) & mask; + out[14] = ((w13 >> 22) | (w14 << 42)) & mask; + out[15] = ((w14 >> 19) | (w15 << 45)) & mask; + out[16] = ((w15 >> 16) | (w16 << 48)) & mask; + out[17] = ((w16 >> 13) | (w17 << 51)) & mask; + out[18] = ((w17 >> 10) | (w18 << 54)) & mask; + out[19] = ((w18 >> 7) | (w19 << 57)) & mask; + out[20] = ((w19 >> 4) | (w20 << 60)) & mask; + out[21] = (w20 >> 1) & mask; + out[22] = ((w20 >> 62) | (w21 << 2)) & mask; + out[23] = ((w21 >> 59) | (w22 << 5)) & mask; + out[24] = ((w22 >> 56) | (w23 << 8)) & mask; + out[25] = ((w23 >> 53) | (w24 << 11)) & mask; + out[26] = ((w24 >> 50) | (w25 << 14)) & mask; + out[27] = ((w25 >> 47) | (w26 << 17)) & mask; + out[28] = ((w26 >> 44) | (w27 << 20)) & mask; + out[29] = ((w27 >> 41) | (w28 << 23)) & mask; + out[30] = ((w28 >> 38) | (w29 << 26)) & mask; + out[31] = ((w29 >> 35) | (w30 << 29)) & mask; + + return in; +} + +inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 4611686018427387903ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 
= util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = 
arrow::bit_util::FromLittleEndian(w27); + in += 8; + uint64_t w28 = util::SafeLoadAs(in); + w28 = arrow::bit_util::FromLittleEndian(w28); + in += 8; + uint64_t w29 = util::SafeLoadAs(in); + w29 = arrow::bit_util::FromLittleEndian(w29); + in += 8; + uint64_t w30 = util::SafeLoadAs(in); + w30 = arrow::bit_util::FromLittleEndian(w30); + in += 8; + out[0] = (w0)&mask; + out[1] = ((w0 >> 62) | (w1 << 2)) & mask; + out[2] = ((w1 >> 60) | (w2 << 4)) & mask; + out[3] = ((w2 >> 58) | (w3 << 6)) & mask; + out[4] = ((w3 >> 56) | (w4 << 8)) & mask; + out[5] = ((w4 >> 54) | (w5 << 10)) & mask; + out[6] = ((w5 >> 52) | (w6 << 12)) & mask; + out[7] = ((w6 >> 50) | (w7 << 14)) & mask; + out[8] = ((w7 >> 48) | (w8 << 16)) & mask; + out[9] = ((w8 >> 46) | (w9 << 18)) & mask; + out[10] = ((w9 >> 44) | (w10 << 20)) & mask; + out[11] = ((w10 >> 42) | (w11 << 22)) & mask; + out[12] = ((w11 >> 40) | (w12 << 24)) & mask; + out[13] = ((w12 >> 38) | (w13 << 26)) & mask; + out[14] = ((w13 >> 36) | (w14 << 28)) & mask; + out[15] = ((w14 >> 34) | (w15 << 30)) & mask; + out[16] = ((w15 >> 32) | (w16 << 32)) & mask; + out[17] = ((w16 >> 30) | (w17 << 34)) & mask; + out[18] = ((w17 >> 28) | (w18 << 36)) & mask; + out[19] = ((w18 >> 26) | (w19 << 38)) & mask; + out[20] = ((w19 >> 24) | (w20 << 40)) & mask; + out[21] = ((w20 >> 22) | (w21 << 42)) & mask; + out[22] = ((w21 >> 20) | (w22 << 44)) & mask; + out[23] = ((w22 >> 18) | (w23 << 46)) & mask; + out[24] = ((w23 >> 16) | (w24 << 48)) & mask; + out[25] = ((w24 >> 14) | (w25 << 50)) & mask; + out[26] = ((w25 >> 12) | (w26 << 52)) & mask; + out[27] = ((w26 >> 10) | (w27 << 54)) & mask; + out[28] = ((w27 >> 8) | (w28 << 56)) & mask; + out[29] = ((w28 >> 6) | (w29 << 58)) & mask; + out[30] = ((w29 >> 4) | (w30 << 60)) & mask; + out[31] = w30 >> 2; + + return in; +} + +inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out) { + const uint64_t mask = 9223372036854775807ULL; + uint64_t w0 = util::SafeLoadAs(in); + w0 = 
arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = 
util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 8; + uint64_t w28 = util::SafeLoadAs(in); + w28 = arrow::bit_util::FromLittleEndian(w28); + in += 8; + uint64_t w29 = util::SafeLoadAs(in); + w29 = arrow::bit_util::FromLittleEndian(w29); + in += 8; + uint64_t w30 = util::SafeLoadAs(in); + w30 = arrow::bit_util::FromLittleEndian(w30); + in += 8; + uint64_t w31 = util::SafeLoadAs(in); + w31 = arrow::bit_util::FromLittleEndian(w31); + in += 4; + out[0] = (w0)&mask; + out[1] = ((w0 >> 63) | (w1 << 1)) & mask; + out[2] = ((w1 >> 62) | (w2 << 2)) & mask; + out[3] = ((w2 >> 61) | (w3 << 3)) & mask; + out[4] = ((w3 >> 60) | (w4 << 4)) & mask; + out[5] = ((w4 >> 59) | (w5 << 5)) & mask; + out[6] = ((w5 >> 58) | (w6 << 6)) & mask; + out[7] = ((w6 >> 57) | (w7 << 7)) & mask; + out[8] = ((w7 >> 56) | (w8 << 8)) & mask; + out[9] = ((w8 >> 55) | (w9 << 9)) & mask; + out[10] = ((w9 >> 54) | (w10 << 10)) & mask; + out[11] = ((w10 >> 53) | (w11 << 11)) & mask; + out[12] = ((w11 >> 52) | (w12 << 12)) & mask; + out[13] = ((w12 >> 51) | (w13 << 13)) & mask; + out[14] = ((w13 >> 50) | (w14 << 14)) & mask; + out[15] = ((w14 >> 49) | (w15 << 15)) & mask; + out[16] = ((w15 >> 48) | (w16 << 16)) & mask; + out[17] = ((w16 >> 47) | (w17 << 17)) & mask; + out[18] = ((w17 >> 46) | (w18 << 18)) & mask; + out[19] = ((w18 >> 45) | (w19 << 19)) & mask; + out[20] = ((w19 >> 44) | (w20 << 20)) & 
mask; + out[21] = ((w20 >> 43) | (w21 << 21)) & mask; + out[22] = ((w21 >> 42) | (w22 << 22)) & mask; + out[23] = ((w22 >> 41) | (w23 << 23)) & mask; + out[24] = ((w23 >> 40) | (w24 << 24)) & mask; + out[25] = ((w24 >> 39) | (w25 << 25)) & mask; + out[26] = ((w25 >> 38) | (w26 << 26)) & mask; + out[27] = ((w26 >> 37) | (w27 << 27)) & mask; + out[28] = ((w27 >> 36) | (w28 << 28)) & mask; + out[29] = ((w28 >> 35) | (w29 << 29)) & mask; + out[30] = ((w29 >> 34) | (w30 << 30)) & mask; + out[31] = ((w30 >> 33) | (w31 << 31)) & mask; + + return in; +} + +inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out) { + uint64_t w0 = util::SafeLoadAs(in); + w0 = arrow::bit_util::FromLittleEndian(w0); + in += 8; + uint64_t w1 = util::SafeLoadAs(in); + w1 = arrow::bit_util::FromLittleEndian(w1); + in += 8; + uint64_t w2 = util::SafeLoadAs(in); + w2 = arrow::bit_util::FromLittleEndian(w2); + in += 8; + uint64_t w3 = util::SafeLoadAs(in); + w3 = arrow::bit_util::FromLittleEndian(w3); + in += 8; + uint64_t w4 = util::SafeLoadAs(in); + w4 = arrow::bit_util::FromLittleEndian(w4); + in += 8; + uint64_t w5 = util::SafeLoadAs(in); + w5 = arrow::bit_util::FromLittleEndian(w5); + in += 8; + uint64_t w6 = util::SafeLoadAs(in); + w6 = arrow::bit_util::FromLittleEndian(w6); + in += 8; + uint64_t w7 = util::SafeLoadAs(in); + w7 = arrow::bit_util::FromLittleEndian(w7); + in += 8; + uint64_t w8 = util::SafeLoadAs(in); + w8 = arrow::bit_util::FromLittleEndian(w8); + in += 8; + uint64_t w9 = util::SafeLoadAs(in); + w9 = arrow::bit_util::FromLittleEndian(w9); + in += 8; + uint64_t w10 = util::SafeLoadAs(in); + w10 = arrow::bit_util::FromLittleEndian(w10); + in += 8; + uint64_t w11 = util::SafeLoadAs(in); + w11 = arrow::bit_util::FromLittleEndian(w11); + in += 8; + uint64_t w12 = util::SafeLoadAs(in); + w12 = arrow::bit_util::FromLittleEndian(w12); + in += 8; + uint64_t w13 = util::SafeLoadAs(in); + w13 = arrow::bit_util::FromLittleEndian(w13); + in += 8; + uint64_t w14 = 
util::SafeLoadAs(in); + w14 = arrow::bit_util::FromLittleEndian(w14); + in += 8; + uint64_t w15 = util::SafeLoadAs(in); + w15 = arrow::bit_util::FromLittleEndian(w15); + in += 8; + uint64_t w16 = util::SafeLoadAs(in); + w16 = arrow::bit_util::FromLittleEndian(w16); + in += 8; + uint64_t w17 = util::SafeLoadAs(in); + w17 = arrow::bit_util::FromLittleEndian(w17); + in += 8; + uint64_t w18 = util::SafeLoadAs(in); + w18 = arrow::bit_util::FromLittleEndian(w18); + in += 8; + uint64_t w19 = util::SafeLoadAs(in); + w19 = arrow::bit_util::FromLittleEndian(w19); + in += 8; + uint64_t w20 = util::SafeLoadAs(in); + w20 = arrow::bit_util::FromLittleEndian(w20); + in += 8; + uint64_t w21 = util::SafeLoadAs(in); + w21 = arrow::bit_util::FromLittleEndian(w21); + in += 8; + uint64_t w22 = util::SafeLoadAs(in); + w22 = arrow::bit_util::FromLittleEndian(w22); + in += 8; + uint64_t w23 = util::SafeLoadAs(in); + w23 = arrow::bit_util::FromLittleEndian(w23); + in += 8; + uint64_t w24 = util::SafeLoadAs(in); + w24 = arrow::bit_util::FromLittleEndian(w24); + in += 8; + uint64_t w25 = util::SafeLoadAs(in); + w25 = arrow::bit_util::FromLittleEndian(w25); + in += 8; + uint64_t w26 = util::SafeLoadAs(in); + w26 = arrow::bit_util::FromLittleEndian(w26); + in += 8; + uint64_t w27 = util::SafeLoadAs(in); + w27 = arrow::bit_util::FromLittleEndian(w27); + in += 8; + uint64_t w28 = util::SafeLoadAs(in); + w28 = arrow::bit_util::FromLittleEndian(w28); + in += 8; + uint64_t w29 = util::SafeLoadAs(in); + w29 = arrow::bit_util::FromLittleEndian(w29); + in += 8; + uint64_t w30 = util::SafeLoadAs(in); + w30 = arrow::bit_util::FromLittleEndian(w30); + in += 8; + uint64_t w31 = util::SafeLoadAs(in); + w31 = arrow::bit_util::FromLittleEndian(w31); + in += 8; + out[0] = w0; + out[1] = w1; + out[2] = w2; + out[3] = w3; + out[4] = w4; + out[5] = w5; + out[6] = w6; + out[7] = w7; + out[8] = w8; + out[9] = w9; + out[10] = w10; + out[11] = w11; + out[12] = w12; + out[13] = w13; + out[14] = w14; + out[15] = w15; 
+ out[16] = w16; + out[17] = w17; + out[18] = w18; + out[19] = w19; + out[20] = w20; + out[21] = w21; + out[22] = w22; + out[23] = w23; + out[24] = w24; + out[25] = w25; + out[26] = w26; + out[27] = w27; + out[28] = w28; + out[29] = w29; + out[30] = w30; + out[31] = w31; + + return in; +} + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc index 9105aaa2af4..84f091594c1 100644 --- a/cpp/src/arrow/util/bpacking_avx2.cc +++ b/cpp/src/arrow/util/bpacking_avx2.cc @@ -15,17 +15,15 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/util/bpacking_avx2.h" +#include "arrow/util/bpacking_avx2_internal.h" #include "arrow/util/bpacking_simd256_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx2.h b/cpp/src/arrow/util/bpacking_avx2.h deleted file mode 100644 index 7a7d8bf8c44..00000000000 --- a/cpp/src/arrow/util/bpacking_avx2.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -namespace arrow { -namespace internal { - -int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking_avx2_internal.h b/cpp/src/arrow/util/bpacking_avx2_internal.h new file mode 100644 index 00000000000..b2c213fe2aa --- /dev/null +++ b/cpp/src/arrow/util/bpacking_avx2_internal.h @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/util/visibility.h" + +#include + +namespace arrow::internal { + +ARROW_EXPORT int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512.cc b/cpp/src/arrow/util/bpacking_avx512.cc index 3570bcc352b..35de0dd5b47 100644 --- a/cpp/src/arrow/util/bpacking_avx512.cc +++ b/cpp/src/arrow/util/bpacking_avx512.cc @@ -15,17 +15,15 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/util/bpacking_avx512.h" +#include "arrow/util/bpacking_avx512_internal.h" #include "arrow/util/bpacking_simd512_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512.h b/cpp/src/arrow/util/bpacking_avx512.h deleted file mode 100644 index 96723f803e0..00000000000 --- a/cpp/src/arrow/util/bpacking_avx512.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -namespace arrow { -namespace internal { - -int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking_avx512_internal.h b/cpp/src/arrow/util/bpacking_avx512_internal.h new file mode 100644 index 00000000000..847aa981433 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_avx512_internal.h @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/util/visibility.h" + +#include + +namespace arrow::internal { + +ARROW_EXPORT int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc new file mode 100644 index 00000000000..f0ac22910c6 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/testing/util.h" +#include "arrow/util/bpacking_internal.h" + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +# include "arrow/util/bpacking_avx2_internal.h" +# include "arrow/util/cpu_info.h" +#endif +#if defined(ARROW_HAVE_RUNTIME_AVX512) +# include "arrow/util/bpacking_avx512_internal.h" +#endif +#if defined(ARROW_HAVE_NEON) +# include "arrow/util/bpacking_neon_internal.h" +#endif + +namespace arrow::internal { +namespace { + +template +using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); + +/// Get the number of bytes associate with a packing. 
+constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { + const auto num_bits = num_values * bit_width; + if (num_bits % 8 != 0) { + throw std::invalid_argument("Must pack a multiple of 8 bits."); + } + return num_bits / 8; +} + +/// Generate random bytes as packed integers. +std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { + constexpr uint32_t kSeed = 3214; + const auto num_bytes = GetNumBytes(num_values, bit_width); + + std::vector out(num_bytes); + random_bytes(num_bytes, kSeed, out.data()); + + return out; +} + +const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { + auto addr = reinterpret_cast(ptr); + + if (addr % alignment == 0) { + return ptr; + } + + auto remainder = addr % alignment; + auto bytes_to_add = alignment - remainder; + + return ptr + bytes_to_add; +} + +template +void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bool skip, + std::string skip_msg) { + if (skip) { + state.SkipWithMessage(skip_msg); + } + + const auto bit_width = static_cast(state.range(0)); + const auto num_values = static_cast(state.range(1)); + + // Assume std::vector allocation is likely be aligned for greater than a byte. + // So we allocate more values than necessary and skip to the next byte with the + // desired (non) alignment to test the proper condition. + constexpr int32_t kExtraValues = sizeof(Int) * 8; + const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); + const uint8_t* packed_ptr = + GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 
0 : 1); + + std::vector unpacked(num_values, 0); + + for (auto _ : state) { + unpack(packed_ptr, unpacked.data(), num_values, bit_width); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(num_values * state.iterations()); +} + +constexpr int32_t kMinRange = 64; +constexpr int32_t kMaxRange = 32768; +constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; +constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; +static const std::vector> kBitWidthsNumValues32 = { + kBitWidths32, + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), +}; +static const std::vector> kBitWidthsNumValues64 = { + kBitWidths64, + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), +}; + +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. +void BM_UnpackUint32(benchmark::State& state, bool aligned, UnpackFunc unpack, + bool skip = false, std::string skip_msg = "") { + return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); +} +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. 
+void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc unpack, + bool skip = false, std::string skip_msg = "") { + return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); +} + +BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar) + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar) + ->ArgsProduct(kBitWidthsNumValues64); + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), + "Avx2 not available") + ->ArgsProduct(kBitWidthsNumValues32); +#endif + +#if defined(ARROW_HAVE_RUNTIME_AVX512) +BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, unpack32_avx512, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), + "Avx512 not available") + ->ArgsProduct(kBitWidthsNumValues32); +#endif + +#if defined(ARROW_HAVE_NEON) +BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, unpack32_neon) + ->ArgsProduct(kBitWidthsNumValues32); +#endif + +BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, unpack32) + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, unpack32) + ->ArgsProduct(kBitWidthsNumValues32); + +BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, unpack64) + ->ArgsProduct(kBitWidthsNumValues64); +BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, unpack64) + ->ArgsProduct(kBitWidthsNumValues64); + +} // namespace +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_default.h b/cpp/src/arrow/util/bpacking_default_internal.h similarity index 100% rename from cpp/src/arrow/util/bpacking_default.h rename to cpp/src/arrow/util/bpacking_default_internal.h diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h new file mode 100644 index 00000000000..e003cd8c0c6 --- /dev/null +++ 
b/cpp/src/arrow/util/bpacking_internal.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/visibility.h" + +#include + +namespace arrow::internal { + +/// The scalar 32 bit unpacking. +ARROW_EXPORT int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); + +/// The scalar 64 bit unpacking. +ARROW_EXPORT int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, + int num_bits); + +ARROW_EXPORT +int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); +ARROW_EXPORT +int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits); + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon.cc b/cpp/src/arrow/util/bpacking_neon.cc index 3ab6de75f4c..407b309b7e8 100644 --- a/cpp/src/arrow/util/bpacking_neon.cc +++ b/cpp/src/arrow/util/bpacking_neon.cc @@ -15,17 +15,15 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/util/bpacking_neon.h" +#include "arrow/util/bpacking_neon_internal.h" #include "arrow/util/bpacking_simd128_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon.h b/cpp/src/arrow/util/bpacking_neon.h deleted file mode 100644 index 9d02cd568ac..00000000000 --- a/cpp/src/arrow/util/bpacking_neon.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -namespace arrow { -namespace internal { - -int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking_neon_internal.h b/cpp/src/arrow/util/bpacking_neon_internal.h new file mode 100644 index 00000000000..683aa5cbc47 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_neon_internal.h @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/util/visibility.h" + +#include + +namespace arrow::internal { + +ARROW_EXPORT int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index 4b2c97c0a7d..5beecad4210 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -24,7 +24,7 @@ #include -#include "arrow/util/dispatch.h" +#include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h index 8b1756d3fc1..3dccb1745f7 100644 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h @@ -24,7 +24,7 @@ #include -#include "arrow/util/dispatch.h" +#include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h index deeb423353a..4f2aeaeeb4b 100644 --- a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h @@ -24,7 +24,7 @@ #include -#include "arrow/util/dispatch.h" +#include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 581a19a53e5..9464908c021 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -164,7 +164,7 @@ def main(simd_width): #include - #include "arrow/util/dispatch.h" + #include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" namespace arrow {{ diff --git 
a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h index fa5a6689a56..98e192e7cb0 100644 --- a/cpp/src/arrow/util/bpacking_simd_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_internal.h @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/util/dispatch.h" +#include "arrow/util/dispatch_internal.h" #include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc new file mode 100644 index 00000000000..c2dd4748a44 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +#include "arrow/result.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/logging.h" + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +# include "arrow/util/bpacking_avx2_internal.h" +# include "arrow/util/cpu_info.h" +#endif +#if defined(ARROW_HAVE_RUNTIME_AVX512) +# include "arrow/util/bpacking_avx512_internal.h" +#endif +#if defined(ARROW_HAVE_NEON) +# include "arrow/util/bpacking_neon_internal.h" +#endif + +namespace arrow::internal { + +template +using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); + +/// Get the number of bytes associated with a packing. +Result GetNumBytes(int32_t num_values, int32_t bit_width) { + const auto num_bits = num_values * bit_width; + if (num_bits % 8 != 0) { + return Status::NotImplemented( + "The unpack functions only work on a multiple of 8 bits."); + } + return num_bits / 8; +} + +/// Generate random bytes as packed integers. +std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { + constexpr uint32_t kSeed = 3214; + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + std::vector out(num_bytes); + random_bytes(num_bytes, kSeed, out.data()); + + return out; +} + +/// Convenience wrapper to unpack into a vector +template +std::vector UnpackValues(const uint8_t* packed, int32_t num_values, + int32_t bit_width, UnpackFunc unpack) { + std::vector out(num_values); + int values_read = unpack(packed, out.data(), num_values, bit_width); + ARROW_DCHECK_GE(values_read, 0); + out.resize(values_read); + return out; +} + +/// Use BitWriter to pack values into a vector.
+template +std::vector PackValues(const std::vector& values, int32_t num_values, + int32_t bit_width) { + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + std::vector out(static_cast(num_bytes)); + bit_util::BitWriter writer(out.data(), num_bytes); + for (const auto& v : values) { + bool written = writer.PutValue(v, bit_width); + if (!written) { + throw std::runtime_error("Cannot write move values"); + } + } + + return out; +} + +template +void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values, + int32_t bit_width, UnpackFunc unpack) { + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + const auto unpacked = UnpackValues(packed, num_values, bit_width, unpack); + EXPECT_EQ(unpacked.size(), num_values); + const auto roundtrip = PackValues(unpacked, num_values, bit_width); + EXPECT_EQ(num_bytes, roundtrip.size()); + for (int i = 0; i < num_bytes; ++i) { + EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i; + } +} + +const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { + auto addr = reinterpret_cast(ptr); + + if (addr % alignment == 0) { + return ptr; + } + + auto remainder = addr % alignment; + auto bytes_to_add = alignment - remainder; + + return ptr + bytes_to_add; +} + +struct TestUnpackSize { + int32_t num_values; + int32_t bit_width; +}; + +class TestUnpack : public ::testing::TestWithParam { + protected: + template + void TestRoundtripAlignment(UnpackFunc unpack, std::size_t alignment_offset) { + auto [num_values, bit_width] = GetParam(); + + // Assume std::vector allocation is likely to be aligned for greater than a byte. + // So we allocate more values than necessary and skip to the next byte with the + // desired (non) alignment to test the proper condition.
+ constexpr int32_t kExtraValues = sizeof(Int) * 8; + const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); + const uint8_t* packed_unaligned = + GetNextAlignedByte(packed.data(), sizeof(Int)) + alignment_offset; + + CheckUnpackPackRoundtrip(packed_unaligned, num_values, bit_width, unpack); + } + + template + void TestUnpackZeros(UnpackFunc unpack) { + auto [num_values, bit_width] = GetParam(); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + const std::vector packed(static_cast(num_bytes), uint8_t{0}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + const std::vector expected(static_cast(num_values), Int{0}); + EXPECT_EQ(unpacked, expected); + } + + template + void TestUnpackOnes(UnpackFunc unpack) { + auto [num_values, bit_width] = GetParam(); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + const std::vector packed(static_cast(num_bytes), uint8_t{0xFF}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + // Generate bit_width ones + Int expected_value = 0; + for (int i = 0; i < bit_width; ++i) { + expected_value = (expected_value << 1) | 1; + } + const std::vector expected(static_cast(num_values), expected_value); + EXPECT_EQ(unpacked, expected); + } + + template + void TestUnpackAlternating(UnpackFunc unpack) { + const auto [num_values, bit_width] = GetParam(); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + const std::vector packed(static_cast(num_bytes), uint8_t{0xAA}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + // Generate alternating bit sequence starting with either 0 or 1 + Int one_zero_value = 0; + Int zero_one_value = 0; + for (int i = 0; i < bit_width; ++i) { + zero_one_value = (zero_one_value << 1) | (i % 2); + one_zero_value = (one_zero_value << 1) | ((i + 1) % 2); + } + + std::vector
expected; + if (bit_width % 2 == 0) { + // For even bit_width, the same pattern repeats every time + expected.resize(static_cast(num_values), one_zero_value); + } else { + // For odd bit_width, we alternate a pattern leading with 0 and 1 + for (int i = 0; i < num_values; ++i) { + expected.push_back(i % 2 == 0 ? zero_one_value : one_zero_value); + } + } + EXPECT_EQ(unpacked, expected); + } + + template + void TestAll(UnpackFunc unpack) { + // Known values + TestUnpackZeros(unpack); + TestUnpackOnes(unpack); + TestUnpackAlternating(unpack); + + // Roundtrips + TestRoundtripAlignment(unpack, /* alignment_offset= */ 0); + TestRoundtripAlignment(unpack, /* alignment_offset= */ 1); + } +}; + +INSTANTIATE_TEST_SUITE_P( + UnpackMultiplesOf64Values, TestUnpack, + ::testing::Values(TestUnpackSize{64, 1}, TestUnpackSize{128, 1}, + TestUnpackSize{2048, 1}, TestUnpackSize{64, 31}, + TestUnpackSize{128, 31}, TestUnpackSize{2048, 1}, + TestUnpackSize{2048, 8}, TestUnpackSize{2048, 13}, + TestUnpackSize{2048, 16}, TestUnpackSize{2048, 31}, + TestUnpackSize{2048, 32})); + +TEST_P(TestUnpack, Unpack32Scalar) { this->TestAll(&unpack32_scalar); } +TEST_P(TestUnpack, Unpack64Scalar) { this->TestAll(&unpack64_scalar); } + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +TEST_P(TestUnpack, Unpack32Avx2) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) { + GTEST_SKIP() << "Test requires AVX2"; + } + this->TestAll(&unpack32_avx2); +} +#endif + +#if defined(ARROW_HAVE_RUNTIME_AVX512) +TEST_P(TestUnpack, Unpack32Avx512) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) { + GTEST_SKIP() << "Test requires AVX512"; + } + this->TestAll(&unpack32_avx512); +} +#endif + +#if defined(ARROW_HAVE_NEON) +TEST_P(TestUnpack, Unpack32Neon) { this->TestAll(&unpack32_neon); } +#endif + +TEST_P(TestUnpack, Unpack32) { this->TestAll(&unpack32); } +TEST_P(TestUnpack, Unpack64) { this->TestAll(&unpack64); } + +} // namespace arrow::internal diff --git 
a/cpp/src/arrow/util/byte_stream_split_internal.cc b/cpp/src/arrow/util/byte_stream_split_internal.cc new file mode 100644 index 00000000000..7e9e339cffd --- /dev/null +++ b/cpp/src/arrow/util/byte_stream_split_internal.cc @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/byte_stream_split_internal.h" +#include "arrow/util/dispatch_internal.h" + +#include + +namespace arrow::util::internal { + +using ::arrow::internal::DispatchLevel; +using ::arrow::internal::DynamicDispatch; + +/************************ + * Decode dispatching * + ************************/ + +template +struct ByteStreamSplitDecodeDynamic { + using FunctionType = decltype(&ByteStreamSplitDecodeScalar); + using Implementation = std::pair; + + constexpr static auto implementations() { + return std::array { + Implementation { + DispatchLevel::NONE, +#if defined(ARROW_HAVE_NEON) + // We always expect Neon to be available on Arm64 + &ByteStreamSplitDecodeSimd, +#elif defined(ARROW_HAVE_SSE4_2) + // We always expect SSE4.2 to be available on x86_64 + &ByteStreamSplitDecodeSimd, +#else + &ByteStreamSplitDecodeScalar, +#endif + } + , +#if defined(ARROW_HAVE_RUNTIME_AVX2) + Implementation{ + DispatchLevel::AVX2, + &ByteStreamSplitDecodeSimd, + }, +#endif + }; + } +}; + +template +void ByteStreamSplitDecodeSimdDispatch(const uint8_t* data, int width, int64_t num_values, + int64_t stride, uint8_t* out) { + static const DynamicDispatch> dispatch; + return dispatch.func(data, width, num_values, stride, out); +} + +template void ByteStreamSplitDecodeSimdDispatch<2>(const uint8_t*, int, int64_t, int64_t, + uint8_t*); +template void ByteStreamSplitDecodeSimdDispatch<4>(const uint8_t*, int, int64_t, int64_t, + uint8_t*); +template void ByteStreamSplitDecodeSimdDispatch<8>(const uint8_t*, int, int64_t, int64_t, + uint8_t*); + +/************************ + * Encode dispatching * + ************************/ + +template +struct ByteStreamSplitEncodeDynamic { + using FunctionType = decltype(&ByteStreamSplitEncodeScalar); + using Implementation = std::pair; + + constexpr static auto implementations() { + return std::array { + Implementation { + DispatchLevel::NONE, +#if defined(ARROW_HAVE_NEON) + // We always expect Neon to be available on Arm64 + 
&ByteStreamSplitEncodeSimd, +#elif defined(ARROW_HAVE_SSE4_2) + // We always expect SSE4.2 to be available on x86_64 + &ByteStreamSplitEncodeSimd, +#else + &ByteStreamSplitEncodeScalar, +#endif + } + , +#if defined(ARROW_HAVE_RUNTIME_AVX2) + Implementation{DispatchLevel::AVX2, &ByteStreamSplitEncodeAvx2}, +#endif + }; + } +}; + +template +void ByteStreamSplitEncodeSimdDispatch(const uint8_t* raw_values, int width, + const int64_t num_values, + uint8_t* output_buffer_raw) { + static const DynamicDispatch> dispatch; + return dispatch.func(raw_values, width, num_values, output_buffer_raw); +} + +template void ByteStreamSplitEncodeSimdDispatch<2>(const uint8_t*, int, const int64_t, + uint8_t*); +template void ByteStreamSplitEncodeSimdDispatch<4>(const uint8_t*, int, const int64_t, + uint8_t*); +template void ByteStreamSplitEncodeSimdDispatch<8>(const uint8_t*, int, const int64_t, + uint8_t*); + +} // namespace arrow::util::internal diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index d3214239ff9..70f9b87d6c7 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -18,11 +18,12 @@ #pragma once #include "arrow/util/endian.h" -#include "arrow/util/simd.h" +#include "arrow/util/math_internal.h" #include "arrow/util/small_vector.h" +#include "arrow/util/type_traits.h" #include "arrow/util/ubsan.h" +#include "arrow/util/visibility.h" -#include #include #include #include @@ -35,20 +36,39 @@ namespace arrow::util::internal { -// -// SIMD implementations -// +#if defined(ARROW_HAVE_SIMD_SPLIT) -#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2) -template -void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_values, - int64_t stride, uint8_t* out) { - using simd_batch = xsimd::make_sized_batch_t; +/*************************** + * xsimd implementations * + ***************************/ + +using 
::arrow::internal::ReversePow2; + +template +void ByteStreamSplitDecodeSimd(const uint8_t* data, int width, int64_t num_values, + int64_t stride, uint8_t* out) { + using simd_batch = xsimd::batch; + // For signed arithmetic + constexpr int kBatchSize = static_cast(simd_batch::size); + + static_assert(kBatchSize >= 16, "The smallest SIMD size is 128 bits"); + + if constexpr (kBatchSize > 16) { + if (num_values < kBatchSize) { + using Arch128 = xsimd::make_sized_batch_t::arch_type; + return ByteStreamSplitDecodeSimd(data, width, num_values, + stride, out); + } + } + static_assert(kNumStreams <= kBatchSize, + "The algorithm works when the number of streams is smaller than the SIMD " + "batch size."); assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2); - constexpr int64_t kBlockSize = sizeof(simd_batch) * kNumStreams; + constexpr int kNumStreamsLog2 = ReversePow2(kNumStreams); + static_assert(kNumStreamsLog2 != 0, + "The algorithm works for a number of streams being a power of two."); + constexpr int64_t kBlockSize = kBatchSize * kNumStreams; const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; @@ -63,7 +83,7 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va const int64_t byte_index = b * stride + i; gathered_byte_data[b] = data[byte_index]; } - memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams); + std::memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams); } // The blocks get processed hierarchically using the unpack intrinsics. 
@@ -71,262 +91,112 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va // Stage 1: AAAA BBBB CCCC DDDD // Stage 2: ACAC ACAC BDBD BDBD // Stage 3: ABCD ABCD ABCD ABCD - simd_batch stage[kNumStreamsLog2 + 1][kNumStreams]; constexpr int kNumStreamsHalf = kNumStreams / 2U; - for (int64_t i = 0; i < num_blocks; ++i) { - for (int j = 0; j < kNumStreams; ++j) { - stage[0][j] = - simd_batch::load_unaligned(&data[i * sizeof(simd_batch) + j * stride]); - } - for (int step = 0; step < kNumStreamsLog2; ++step) { - for (int j = 0; j < kNumStreamsHalf; ++j) { - stage[step + 1U][j * 2] = - xsimd::zip_lo(stage[step][j], stage[step][kNumStreamsHalf + j]); - stage[step + 1U][j * 2 + 1U] = - xsimd::zip_hi(stage[step][j], stage[step][kNumStreamsHalf + j]); - } - } - for (int j = 0; j < kNumStreams; ++j) { - xsimd::store_unaligned( - reinterpret_cast(out + (i * kNumStreams + j) * sizeof(simd_batch)), - stage[kNumStreamsLog2][j]); - } - } -} - -template -void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, - const int64_t num_values, uint8_t* output_buffer_raw) { - using simd_batch = xsimd::make_sized_batch_t; - - assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams; - - simd_batch stage[3][kNumStreams]; - simd_batch final_result[kNumStreams]; - - const int64_t size = num_values * kNumStreams; - const int64_t num_blocks = size / kBlockSize; - int8_t* output_buffer_streams[kNumStreams]; - for (int i = 0; i < kNumStreams; ++i) { - output_buffer_streams[i] = - reinterpret_cast(&output_buffer_raw[num_values * i]); - } - - // First handle suffix. 
- const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; - for (int64_t i = num_processed_elements; i < num_values; ++i) { - for (int j = 0; j < kNumStreams; ++j) { - const uint8_t byte_in_value = raw_values[i * kNumStreams + j]; - output_buffer_raw[j * num_values + i] = byte_in_value; - } - } - // The current shuffling algorithm diverges for float and double types but the compiler - // should be able to remove the branch since only one path is taken for each template - // instantiation. - // Example run for 32-bit variables: - // Step 0: copy from unaligned input bytes: - // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ... - // Step 1: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ... - // Step 2: apply simd_batch::zip_lo and simd_batch::zip_hi again: - // 0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ... - // Step 3: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ... - // Step 4: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ... for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { - // First copy the data to stage 0. + simd_batch stage[kNumStreamsLog2 + 1][kNumStreams]; + for (int i = 0; i < kNumStreams; ++i) { - stage[0][i] = simd_batch::load_unaligned( - reinterpret_cast(raw_values) + - (block_index * kNumStreams + i) * sizeof(simd_batch)); + stage[0][i] = + simd_batch::load_unaligned(&data[block_index * kBatchSize + i * stride]); } - // The shuffling of bytes is performed through the unpack intrinsics. - // In my measurements this gives better performance then an implementation - // which uses the shuffle intrinsics. 
- for (int stage_lvl = 0; stage_lvl < 2; ++stage_lvl) { - for (int i = 0; i < kNumStreams / 2; ++i) { - stage[stage_lvl + 1][i * 2] = - xsimd::zip_lo(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); - stage[stage_lvl + 1][i * 2 + 1] = - xsimd::zip_hi(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); - } - } - if constexpr (kNumStreams == 8) { - // This is the path for 64bits data. - simd_batch tmp[8]; - using int32_batch = xsimd::make_sized_batch_t; - // This is a workaround, see: https://github.com/xtensor-stack/xsimd/issues/735 - auto from_int32_batch = [](int32_batch from) -> simd_batch { - simd_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - auto to_int32_batch = [](simd_batch from) -> int32_batch { - int32_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - for (int i = 0; i < 4; ++i) { - tmp[i * 2] = from_int32_batch( - xsimd::zip_lo(to_int32_batch(stage[2][i]), to_int32_batch(stage[2][i + 4]))); - tmp[i * 2 + 1] = from_int32_batch( - xsimd::zip_hi(to_int32_batch(stage[2][i]), to_int32_batch(stage[2][i + 4]))); - } - for (int i = 0; i < 4; ++i) { - final_result[i * 2] = from_int32_batch( - xsimd::zip_lo(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4]))); - final_result[i * 2 + 1] = from_int32_batch( - xsimd::zip_hi(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4]))); - } - } else { - // This is the path for 32bits data. 
- using int64_batch = xsimd::make_sized_batch_t; - // This is a workaround, see: https://github.com/xtensor-stack/xsimd/issues/735 - auto from_int64_batch = [](int64_batch from) -> simd_batch { - simd_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - auto to_int64_batch = [](simd_batch from) -> int64_batch { - int64_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - simd_batch tmp[4]; - for (int i = 0; i < 2; ++i) { - tmp[i * 2] = xsimd::zip_lo(stage[2][i * 2], stage[2][i * 2 + 1]); - tmp[i * 2 + 1] = xsimd::zip_hi(stage[2][i * 2], stage[2][i * 2 + 1]); - } - for (int i = 0; i < 2; ++i) { - final_result[i * 2] = from_int64_batch( - xsimd::zip_lo(to_int64_batch(tmp[i]), to_int64_batch(tmp[i + 2]))); - final_result[i * 2 + 1] = from_int64_batch( - xsimd::zip_hi(to_int64_batch(tmp[i]), to_int64_batch(tmp[i + 2]))); + for (int step = 0; step < kNumStreamsLog2; ++step) { + for (int i = 0; i < kNumStreamsHalf; ++i) { + stage[step + 1U][i * 2] = + xsimd::zip_lo(stage[step][i], stage[step][kNumStreamsHalf + i]); + stage[step + 1U][i * 2 + 1U] = + xsimd::zip_hi(stage[step][i], stage[step][kNumStreamsHalf + i]); } } + for (int i = 0; i < kNumStreams; ++i) { - xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)], - final_result[i]); + xsimd::store_unaligned( + reinterpret_cast(out + (block_index * kNumStreams + i) * kBatchSize), + stage[kNumStreamsLog2][i]); } } } -#endif - -#if defined(ARROW_HAVE_AVX2) -template -void ByteStreamSplitDecodeAvx2(const uint8_t* data, int width, int64_t num_values, - int64_t stride, uint8_t* out) { - assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 
3 : 2); - constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams; - - const int64_t size = num_values * kNumStreams; - if (size < kBlockSize) // Back to SSE for small size - return ByteStreamSplitDecodeSimd128(data, width, num_values, stride, - out); - const int64_t num_blocks = size / kBlockSize; - - // First handle suffix. - const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; - for (int64_t i = num_processed_elements; i < num_values; ++i) { - uint8_t gathered_byte_data[kNumStreams]; - for (int b = 0; b < kNumStreams; ++b) { - const int64_t byte_index = b * stride + i; - gathered_byte_data[b] = data[byte_index]; - } - memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams); +// Like xsimd::zip_lo, but zip groups of kNumBytes at once. +template +auto zip_lo_n(const xsimd::batch& a, const xsimd::batch& b) + -> xsimd::batch { + using arrow::internal::SizedInt; + using simd_batch = xsimd::batch; + // For signed arithmetic + constexpr int kBatchSize = static_cast(simd_batch::size); + + if constexpr (kNumBytes == kBatchSize) { + return a; + } else if constexpr (kNumBytes <= 8) { + return xsimd::bitwise_cast( + xsimd::zip_lo(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); + } else if constexpr (kNumBytes == 16 && kBatchSize == 32) { + // No data type for 128 bits. + // This could be made generic by simply computing the shuffle permute constant + return xsimd::bitwise_cast( + xsimd::shuffle(xsimd::bitwise_cast(a), xsimd::bitwise_cast(b), + xsimd::batch_constant{})); } +} - // Processed hierarchically using unpack intrinsics, then permute intrinsics. 
- __m256i stage[kNumStreamsLog2 + 1][kNumStreams]; - __m256i final_result[kNumStreams]; - constexpr int kNumStreamsHalf = kNumStreams / 2; - - for (int64_t i = 0; i < num_blocks; ++i) { - for (int j = 0; j < kNumStreams; ++j) { - stage[0][j] = _mm256_loadu_si256( - reinterpret_cast(&data[i * sizeof(__m256i) + j * stride])); - } +// Like xsimd::zip_hi, but zip groups of kNumBytes at once. +template +auto zip_hi_n(const xsimd::batch& a, const xsimd::batch& b) + -> xsimd::batch { + using simd_batch = xsimd::batch; + using arrow::internal::SizedInt; + // For signed arithmetic + constexpr int kBatchSize = static_cast(simd_batch::size); + + if constexpr (kNumBytes == kBatchSize) { + return b; + } else if constexpr (kNumBytes <= 8) { + return xsimd::bitwise_cast( + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); + } else if constexpr (kNumBytes == 16 && kBatchSize == 32) { + // No data type for 128 bits + // This could be made generic by simply computing the shuffle permute constant + return xsimd::bitwise_cast( + xsimd::shuffle(xsimd::bitwise_cast(a), xsimd::bitwise_cast(b), + xsimd::batch_constant{})); + } +} - for (int step = 0; step < kNumStreamsLog2; ++step) { - for (int j = 0; j < kNumStreamsHalf; ++j) { - stage[step + 1][j * 2] = - _mm256_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); - stage[step + 1][j * 2 + 1] = - _mm256_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); - } - } +template +void ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width, + const int64_t num_values, uint8_t* output_buffer_raw) { + using simd_batch = xsimd::batch; + // For signed arithmetic + constexpr int kBatchSize = static_cast(simd_batch::size); - if constexpr (kNumStreams == 8) { - // path for double, 128i index: - // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B}, - // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F}, - final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0], - 
stage[kNumStreamsLog2][1], 0b00100000); - final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b00100000); - final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4], - stage[kNumStreamsLog2][5], 0b00100000); - final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6], - stage[kNumStreamsLog2][7], 0b00100000); - final_result[4] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b00110001); - final_result[5] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b00110001); - final_result[6] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4], - stage[kNumStreamsLog2][5], 0b00110001); - final_result[7] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6], - stage[kNumStreamsLog2][7], 0b00110001); - } else { - // path for float, 128i index: - // {0x00, 0x04}, {0x01, 0x05}, {0x02, 0x06}, {0x03, 0x07} - final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b00100000); - final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b00100000); - final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b00110001); - final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b00110001); - } + static_assert(kBatchSize >= 16, "The smallest SIMD size is 128 bits"); - for (int j = 0; j < kNumStreams; ++j) { - _mm256_storeu_si256( - reinterpret_cast<__m256i*>(out + (i * kNumStreams + j) * sizeof(__m256i)), - final_result[j]); + if constexpr (kBatchSize > 16) { + if (num_values < kBatchSize) { + using Arch128 = xsimd::make_sized_batch_t::arch_type; + return ByteStreamSplitEncodeSimd( + raw_values, width, num_values, output_buffer_raw); } } -} -template -void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, int width, - const int64_t num_values, uint8_t* 
output_buffer_raw) { assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kBlockSize = sizeof(__m256i) * kNumStreams; - - if constexpr (kNumStreams == 8) // Back to SSE, currently no path for double. - return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, - output_buffer_raw); + static_assert(kNumStreams <= kBatchSize, + "The algorithm works when the number of streams is smaller than the SIMD " + "batch size."); + constexpr int kBlockSize = kBatchSize * kNumStreams; + static_assert(ReversePow2(kNumStreams) != 0, + "The algorithm works for a number of streams being a power of two."); const int64_t size = num_values * kNumStreams; - if (size < kBlockSize) // Back to SSE for small size - return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, - output_buffer_raw); const int64_t num_blocks = size / kBlockSize; - const __m256i* raw_values_simd = reinterpret_cast(raw_values); - __m256i* output_buffer_streams[kNumStreams]; - + int8_t* output_buffer_streams[kNumStreams]; for (int i = 0; i < kNumStreams; ++i) { output_buffer_streams[i] = - reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]); + reinterpret_cast(&output_buffer_raw[num_values * i]); } // First handle suffix. @@ -338,74 +208,112 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, int width, } } - // Path for float. - // 1. Processed hierarchically to 32i block using the unpack intrinsics. - // 2. Pack 128i block using _mm256_permutevar8x32_epi32. - // 3. Pack final 256i block with _mm256_permute2x128_si256. 
- constexpr int kNumUnpack = 3; - __m256i stage[kNumUnpack + 1][kNumStreams]; - static const __m256i kPermuteMask = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - __m256i permute[kNumStreams]; - __m256i final_result[kNumStreams]; - + // Number of input values we can fit in a simd register + constexpr int kNumValuesInBatch = kBatchSize / kNumStreams; + static_assert(kNumValuesInBatch > 0); + // Number of bytes we'll bring together in the first byte-level part of the algorithm. + // Since we zip with the next batch, the number of values in a batch determines how many + // bytes end up together before we can use a larger type + constexpr int kNumBytes = 2 * kNumValuesInBatch; + // Number of steps in the first part of the algorithm with byte-level zipping + constexpr int kNumStepsByte = ReversePow2(kNumValuesInBatch) + 1; + // Number of steps in the first part of the algorithm with large data type zipping + constexpr int kNumStepsLarge = ReversePow2(static_cast(kBatchSize) / kNumBytes); + // Total number of steps + constexpr int kNumSteps = kNumStepsByte + kNumStepsLarge; + static_assert(kNumSteps == ReversePow2(kBatchSize)); + + // Two step shuffling algorithm that starts with bytes and ends with a larger data type. + // An algorithm similar to the decoding one with log2(kBatchSize) + 1 stages is + // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { + simd_batch stage[kNumSteps + 1][kNumStreams]; + + // First copy the data to stage 0. 
for (int i = 0; i < kNumStreams; ++i) { - stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]); + stage[0][i] = simd_batch::load_unaligned( + &raw_values[(block_index * kNumStreams + i) * kBatchSize]); } - for (int stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) { - for (int i = 0; i < kNumStreams / 2; ++i) { - stage[stage_lvl + 1][i * 2] = - _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); - stage[stage_lvl + 1][i * 2 + 1] = - _mm256_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); + // We first make byte-level shuffling, until we have gathered enough bytes together + // and in the correct order to use a bigger data type. + // + // Example with 32bit data on 128 bit register: + // + // 0: A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 | A4B4C4D4 A5B5C5D5 A6B6C6D6 A7B7C7D7 | ... + // 1: A0A4B0B4 C0C4D0D4 A1A5B1B5 C1C5D1D5 | A2A6B2B6 C2C6D2D6 A3A7B3B7 C3C7D3D7 | ... + // 2: A0A2A4A6 B0B2B4B6 C0C2C4C6 D0D2D4D6 | A1A3A5A7 B1B3B5B7 C1C3C5C7 D1D3D5D7 | ... + // 3: A0A1A2A3 A4A5A6A7 B0B1B2B3 B4B5B6B7 | C0C1C2C3 C4C5C6C7 D0D1D2D3 D4D5D6D7 | ... + // + // The shuffling of bytes is performed through the unpack intrinsics. + // In my measurements this gives better performance than an implementation + // which uses the shuffle intrinsics. 
+ // + // Loop order does not matter so we prefer higher locality + constexpr int kNumStreamsHalf = kNumStreams / 2; + for (int i = 0; i < kNumStreamsHalf; ++i) { + for (int step = 0; step < kNumStepsByte; ++step) { + stage[step + 1][i * 2] = + xsimd::zip_lo(stage[step][i * 2], stage[step][i * 2 + 1]); + stage[step + 1][i * 2 + 1] = + xsimd::zip_hi(stage[step][i * 2], stage[step][i * 2 + 1]); } } - for (int i = 0; i < kNumStreams; ++i) { - permute[i] = _mm256_permutevar8x32_epi32(stage[kNumUnpack][i], kPermuteMask); + // We now have the bytes packed in a larger data type and in the correct order to + // start using a bigger data type + // + // Example with 32bit data on 128 bit register. + // The large data type is int64_t with NumBytes=8 bytes: + // + // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... + for (int step = kNumStepsByte; step < kNumSteps; ++step) { + for (int i = 0; i < kNumStreamsHalf; ++i) { + stage[step + 1][i * 2] = + zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); + stage[step + 1][i * 2 + 1] = + zip_hi_n(stage[step][i], stage[step][i + kNumStreamsHalf]); + } } - final_result[0] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00100000); - final_result[1] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00110001); - final_result[2] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00100000); - final_result[3] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00110001); - + // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { - _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]); + xsimd::store_unaligned(&output_buffer_streams[i][block_index * kBatchSize], + stage[kNumSteps][i]); } } } -#endif // ARROW_HAVE_AVX2 -#if defined(ARROW_HAVE_SIMD_SPLIT) -template -void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int width, int64_t num_values, - int64_t stride, uint8_t* out) { -# if defined(ARROW_HAVE_AVX2) - return 
ByteStreamSplitDecodeAvx2(data, width, num_values, stride, out); -# elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) - return ByteStreamSplitDecodeSimd128(data, width, num_values, stride, out); -# else -# error "ByteStreamSplitDecodeSimd not implemented" -# endif -} +# if defined(ARROW_HAVE_RUNTIME_AVX2) + +// The extern template declarations are used internally and need export +// to be used in tests and benchmarks. + +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitDecodeSimd( + const uint8_t*, int, int64_t, int64_t, uint8_t*); +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitDecodeSimd( + const uint8_t*, int, int64_t, int64_t, uint8_t*); +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitDecodeSimd( + const uint8_t*, int, int64_t, int64_t, uint8_t*); template -void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width, - const int64_t num_values, - uint8_t* output_buffer_raw) { -# if defined(ARROW_HAVE_AVX2) - return ByteStreamSplitEncodeAvx2(raw_values, width, num_values, - output_buffer_raw); -# elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) - return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, - output_buffer_raw); -# else -# error "ByteStreamSplitEncodeSimd not implemented" +void ByteStreamSplitEncodeAvx2(const uint8_t*, int, const int64_t, uint8_t*); + +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitEncodeAvx2<2>(const uint8_t*, + int, + const int64_t, + uint8_t*); +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitEncodeAvx2<4>(const uint8_t*, + int, + const int64_t, + uint8_t*); +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitEncodeAvx2<8>(const uint8_t*, + int, + const int64_t, + uint8_t*); + # endif -} + #endif // @@ -543,52 +451,64 @@ inline void ByteStreamSplitDecodeScalarDynamic(const uint8_t* data, int width, DoMergeStreams(src_streams.data(), width, num_values, out); } +template +ARROW_EXPORT void ByteStreamSplitDecodeSimdDispatch(const 
uint8_t* data, int width, + int64_t num_values, int64_t stride, + uint8_t* out); + +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitDecodeSimdDispatch<2>( + const uint8_t*, int, int64_t, int64_t, uint8_t*); +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitDecodeSimdDispatch<4>( + const uint8_t*, int, int64_t, int64_t, uint8_t*); +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitDecodeSimdDispatch<8>( + const uint8_t*, int, int64_t, int64_t, uint8_t*); + +template +void ByteStreamSplitEncodeSimdDispatch(const uint8_t* raw_values, int width, + const int64_t num_values, + uint8_t* output_buffer_raw); + +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitEncodeSimdDispatch<2>( + const uint8_t*, int, const int64_t, uint8_t*); +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitEncodeSimdDispatch<4>( + const uint8_t*, int, const int64_t, uint8_t*); +extern template ARROW_TEMPLATE_EXPORT void ByteStreamSplitEncodeSimdDispatch<8>( + const uint8_t*, int, const int64_t, uint8_t*); + inline void ByteStreamSplitEncode(const uint8_t* raw_values, int width, const int64_t num_values, uint8_t* out) { -#if defined(ARROW_HAVE_SIMD_SPLIT) -# define ByteStreamSplitEncodePerhapsSimd ByteStreamSplitEncodeSimd -#else -# define ByteStreamSplitEncodePerhapsSimd ByteStreamSplitEncodeScalar -#endif switch (width) { case 1: - memcpy(out, raw_values, num_values); + std::memcpy(out, raw_values, num_values); return; case 2: - return ByteStreamSplitEncodeScalar<2>(raw_values, width, num_values, out); + return ByteStreamSplitEncodeSimdDispatch<2>(raw_values, width, num_values, out); case 4: - return ByteStreamSplitEncodePerhapsSimd<4>(raw_values, width, num_values, out); + return ByteStreamSplitEncodeSimdDispatch<4>(raw_values, width, num_values, out); case 8: - return ByteStreamSplitEncodePerhapsSimd<8>(raw_values, width, num_values, out); + return ByteStreamSplitEncodeSimdDispatch<8>(raw_values, width, num_values, out); case 16: return 
ByteStreamSplitEncodeScalar<16>(raw_values, width, num_values, out); } return ByteStreamSplitEncodeScalarDynamic(raw_values, width, num_values, out); -#undef ByteStreamSplitEncodePerhapsSimd } inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_values, int64_t stride, uint8_t* out) { -#if defined(ARROW_HAVE_SIMD_SPLIT) -# define ByteStreamSplitDecodePerhapsSimd ByteStreamSplitDecodeSimd -#else -# define ByteStreamSplitDecodePerhapsSimd ByteStreamSplitDecodeScalar -#endif switch (width) { case 1: - memcpy(out, data, num_values); + std::memcpy(out, data, num_values); return; case 2: - return ByteStreamSplitDecodeScalar<2>(data, width, num_values, stride, out); + return ByteStreamSplitDecodeSimdDispatch<2>(data, width, num_values, stride, out); case 4: - return ByteStreamSplitDecodePerhapsSimd<4>(data, width, num_values, stride, out); + return ByteStreamSplitDecodeSimdDispatch<4>(data, width, num_values, stride, out); case 8: - return ByteStreamSplitDecodePerhapsSimd<8>(data, width, num_values, stride, out); + return ByteStreamSplitDecodeSimdDispatch<8>(data, width, num_values, stride, out); case 16: return ByteStreamSplitDecodeScalar<16>(data, width, num_values, stride, out); } return ByteStreamSplitDecodeScalarDynamic(data, width, num_values, stride, out); -#undef ByteStreamSplitDecodePerhapsSimd } } // namespace arrow::util::internal diff --git a/cpp/src/arrow/util/byte_stream_split_internal_avx2.cc b/cpp/src/arrow/util/byte_stream_split_internal_avx2.cc new file mode 100644 index 00000000000..d94932e88c4 --- /dev/null +++ b/cpp/src/arrow/util/byte_stream_split_internal_avx2.cc @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/byte_stream_split_internal.h" +#include "arrow/util/math_internal.h" +#include "arrow/util/simd.h" + +#include +#include + +#include +#include +#include + +namespace arrow::util::internal { + +using ::arrow::internal::ReversePow2; + +template void ByteStreamSplitDecodeSimd(const uint8_t*, int, int64_t, + int64_t, uint8_t*); +template void ByteStreamSplitDecodeSimd(const uint8_t*, int, int64_t, + int64_t, uint8_t*); +template void ByteStreamSplitDecodeSimd(const uint8_t*, int, int64_t, + int64_t, uint8_t*); + +// Faster implementation in AVX2 using native intrinsics. +// Probably because the zip/unpack on AVX2 really work on two 128-bit lanes, +// which is not general enough for xsimd to abstract. 
+inline void ByteStreamSplitEncodeAvx2Impl4(const uint8_t* raw_values, int width, + const int64_t num_values, + uint8_t* output_buffer_raw) { + constexpr int kNumStreams = 4; + assert(width == kNumStreams); + constexpr int kBlockSize = sizeof(__m256i) * kNumStreams; + + const int64_t size = num_values * kNumStreams; + if (size < kBlockSize) // Back to SSE for small size + return ByteStreamSplitEncodeSimd( + raw_values, width, num_values, output_buffer_raw); + const int64_t num_blocks = size / kBlockSize; + const __m256i* raw_values_simd = reinterpret_cast(raw_values); + __m256i* output_buffer_streams[kNumStreams]; + + for (int i = 0; i < kNumStreams; ++i) { + output_buffer_streams[i] = + reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]); + } + + // First handle suffix. + const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; + for (int64_t i = num_processed_elements; i < num_values; ++i) { + for (int j = 0; j < kNumStreams; ++j) { + const uint8_t byte_in_value = raw_values[i * kNumStreams + j]; + output_buffer_raw[j * num_values + i] = byte_in_value; + } + } + + // Path for float. + // 1. Processed hierarchically to 32i block using the unpack intrinsics. + // 2. Pack 128i block using _mm256_permutevar8x32_epi32. + // 3. Pack final 256i block with _mm256_permute2x128_si256. + constexpr int kNumUnpack = 3; + __m256i stage[kNumUnpack + 1][kNumStreams]; + __m256i permute[kNumStreams]; + __m256i final_result[kNumStreams]; + + for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { + for (int i = 0; i < kNumStreams; ++i) { + stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]); + } + + // We first make byte-level shuffling, until we have gathered enough bytes together + // and in the correct order to use a bigger data type. 
+ // + // Loop order does not matter so we prefer higher locality + constexpr int kNumStreamsHalf = kNumStreams / 2; + for (int i = 0; i < kNumStreamsHalf; ++i) { + for (int stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) { + stage[stage_lvl + 1][i * 2] = + _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); + stage[stage_lvl + 1][i * 2 + 1] = + _mm256_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); + } + } + + for (int i = 0; i < kNumStreamsHalf; ++i) { + permute[i] = _mm256_permute2x128_si256( + stage[kNumUnpack][i], stage[kNumUnpack][i + kNumStreamsHalf], 0b00100000); + permute[i + kNumStreamsHalf] = _mm256_permute2x128_si256( + stage[kNumUnpack][i], stage[kNumUnpack][i + kNumStreamsHalf], 0b00110001); + } + + for (int i = 0; i < kNumStreams / 2; ++i) { + final_result[i * 2] = + _mm256_unpacklo_epi32(permute[i], permute[i + kNumStreamsHalf]); + final_result[i * 2 + 1] = + _mm256_unpackhi_epi32(permute[i], permute[i + kNumStreamsHalf]); + } + + for (int i = 0; i < kNumStreams; ++i) { + _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]); + } + } +} + +template +void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, int width, + const int64_t num_values, uint8_t* output_buffer_raw) { + // Only size with a different implementation + if constexpr (kNumStreams == 4) { + return ByteStreamSplitEncodeAvx2Impl4(raw_values, width, num_values, + output_buffer_raw); + } else { + return ByteStreamSplitEncodeSimd( + raw_values, width, num_values, output_buffer_raw); + } +} + +template void ByteStreamSplitEncodeAvx2<2>(const uint8_t*, int, const int64_t, uint8_t*); +template void ByteStreamSplitEncodeAvx2<4>(const uint8_t*, int, const int64_t, uint8_t*); +template void ByteStreamSplitEncodeAvx2<8>(const uint8_t*, int, const int64_t, uint8_t*); + +} // namespace arrow::util::internal diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index 
9755cd8b8d0..13a99d937cb 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -136,24 +136,30 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { return input; } - template + template static std::vector MakeDecodeFuncs() { std::vector funcs; funcs.push_back({"scalar_dynamic", &ByteStreamSplitDecodeScalarDynamic}); funcs.push_back({"scalar", &ByteStreamSplitDecodeScalar}); #if defined(ARROW_HAVE_SIMD_SPLIT) if constexpr (kSimdImplemented) { - funcs.push_back({"simd", &ByteStreamSplitDecodeSimd}); - funcs.push_back({"simd128", &ByteStreamSplitDecodeSimd128}); + funcs.push_back({"simd_dispatch", &ByteStreamSplitDecodeSimdDispatch}); +# if defined(ARROW_HAVE_NEON) + funcs.push_back({"xsimd_neon", &ByteStreamSplitDecodeSimd}); +# endif +# if defined(ARROW_HAVE_SSE4_2) + funcs.push_back( + {"xsimd_sse4_2", &ByteStreamSplitDecodeSimd}); +# endif # if defined(ARROW_HAVE_AVX2) - funcs.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); + funcs.push_back({"xsimd_avx2", &ByteStreamSplitDecodeSimd}); # endif } #endif // defined(ARROW_HAVE_SIMD_SPLIT) return funcs; } - template + template static std::vector MakeEncodeFuncs() { std::vector funcs; funcs.push_back({"reference", &ReferenceByteStreamSplitEncode}); @@ -161,10 +167,19 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { funcs.push_back({"scalar", &ByteStreamSplitEncodeScalar}); #if defined(ARROW_HAVE_SIMD_SPLIT) if constexpr (kSimdImplemented) { - funcs.push_back({"simd", &ByteStreamSplitEncodeSimd}); - funcs.push_back({"simd128", &ByteStreamSplitEncodeSimd128}); + funcs.push_back({"simd_dispatch", &ByteStreamSplitEncodeSimdDispatch}); +# if defined(ARROW_HAVE_NEON) + funcs.push_back({"xsimd_neon", &ByteStreamSplitEncodeSimd}); +# endif +# if defined(ARROW_HAVE_SSE4_2) + funcs.push_back( + {"xsimd_sse4_2", &ByteStreamSplitEncodeSimd}); +# endif # if defined(ARROW_HAVE_AVX2) - funcs.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); + 
funcs.push_back({"xsimd_avx2", &ByteStreamSplitEncodeSimd}); + if constexpr (kWidth == 4) { + funcs.push_back({"intrinsics_avx2", &ByteStreamSplitEncodeAvx2}); + } # endif } #endif // defined(ARROW_HAVE_SIMD_SPLIT) diff --git a/cpp/src/arrow/util/cancel.cc b/cpp/src/arrow/util/cancel.cc index 81a16dbe228..a83475fed0b 100644 --- a/cpp/src/arrow/util/cancel.cc +++ b/cpp/src/arrow/util/cancel.cc @@ -169,7 +169,7 @@ struct SignalStopState : public std::enable_shared_from_this { self_pipe_ptr_.store(nullptr); auto handlers = std::move(saved_handlers_); for (const auto& h : handlers) { - ARROW_CHECK_OK(SetSignalHandler(h.signum, h.handler).status()); + ARROW_CHECK_OK(SetSignalHandler(h.signum, h.handler)); } } diff --git a/cpp/src/arrow/util/compression_zlib.cc b/cpp/src/arrow/util/compression_zlib.cc index 157716c3679..b06cf2d2243 100644 --- a/cpp/src/arrow/util/compression_zlib.cc +++ b/cpp/src/arrow/util/compression_zlib.cc @@ -58,6 +58,16 @@ constexpr int GZIP_CODEC = 16; // Determine if this is libz or gzip from header. constexpr int DETECT_CODEC = 32; +// Default "memory level" +// +// Memory consumption when compressing is given by the formula: +// `(1 << (windowBits+2)) + (1 << (memLevel+9))` +// +// With windowBits=15 and memLevel=8 (default zlib values), 262 kB is used. 
+// +// (see `zconf.h` from zlib) +constexpr int kGzipDefaultMemLevel = 8; + constexpr int kGZipMinCompressionLevel = 1; constexpr int kGZipMaxCompressionLevel = 9; @@ -196,8 +206,8 @@ class GZipCompressor : public Compressor { int ret; // Initialize to run specified format int window_bits = CompressionWindowBitsForFormat(format, input_window_bits); - if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, - compression_level_, Z_DEFAULT_STRATEGY)) != Z_OK) { + if ((ret = deflateInit2(&stream_, compression_level_, Z_DEFLATED, window_bits, + kGzipDefaultMemLevel, Z_DEFAULT_STRATEGY)) != Z_OK) { return ZlibError("zlib deflateInit failed: "); } else { initialized_ = true; @@ -343,8 +353,8 @@ class GZipCodec : public Codec { int ret; // Initialize to run specified format int window_bits = CompressionWindowBitsForFormat(format_, window_bits_); - if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, - compression_level_, Z_DEFAULT_STRATEGY)) != Z_OK) { + if ((ret = deflateInit2(&stream_, compression_level_, Z_DEFLATED, window_bits, + kGzipDefaultMemLevel, Z_DEFAULT_STRATEGY)) != Z_OK) { return ZlibErrorPrefix("zlib deflateInit failed: ", stream_.msg); } compressor_initialized_ = true; diff --git a/cpp/src/arrow/util/counting_semaphore.cc b/cpp/src/arrow/util/counting_semaphore.cc index b3106a6f824..de9750ba9ee 100644 --- a/cpp/src/arrow/util/counting_semaphore.cc +++ b/cpp/src/arrow/util/counting_semaphore.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/util/counting_semaphore.h" +#include "arrow/util/counting_semaphore_internal.h" #include #include diff --git a/cpp/src/arrow/util/counting_semaphore.h b/cpp/src/arrow/util/counting_semaphore_internal.h similarity index 100% rename from cpp/src/arrow/util/counting_semaphore.h rename to cpp/src/arrow/util/counting_semaphore_internal.h diff --git a/cpp/src/arrow/util/counting_semaphore_test.cc b/cpp/src/arrow/util/counting_semaphore_test.cc index 4de11ce852a..6635b1ddd7d 100644 --- a/cpp/src/arrow/util/counting_semaphore_test.cc +++ b/cpp/src/arrow/util/counting_semaphore_test.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/util/counting_semaphore.h" +#include "arrow/util/counting_semaphore_internal.h" #include #include diff --git a/cpp/src/arrow/util/cpu_info.cc b/cpp/src/arrow/util/cpu_info.cc index 6dbf2c35c1e..8bcc814d480 100644 --- a/cpp/src/arrow/util/cpu_info.cc +++ b/cpp/src/arrow/util/cpu_info.cc @@ -82,7 +82,7 @@ void OsRetrieveCacheSize(std::array* cache_sizes) { typedef BOOL(WINAPI * GetLogicalProcessorInformationFuncPointer)(void*, void*); GetLogicalProcessorInformationFuncPointer func_pointer = (GetLogicalProcessorInformationFuncPointer)GetProcAddress( - GetModuleHandle("kernel32"), "GetLogicalProcessorInformation"); + GetModuleHandleW(L"kernel32"), "GetLogicalProcessorInformation"); if (!func_pointer) { ARROW_LOG(WARNING) << "Failed to find procedure GetLogicalProcessorInformation"; diff --git a/cpp/src/arrow/util/crc32_test.cc b/cpp/src/arrow/util/crc32_test.cc index 298c263c1ab..ac15fb9a63a 100644 --- a/cpp/src/arrow/util/crc32_test.cc +++ b/cpp/src/arrow/util/crc32_test.cc @@ -34,7 +34,7 @@ TEST(Crc32Test, Basic) { constexpr size_t TEST_CRC32_LENGTH = 9; std::array std_data = {0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39}; - size_t const std_data_len = sizeof(std_data) / sizeof(std_data[0]); + const size_t std_data_len = sizeof(std_data) / 
sizeof(std_data[0]); EXPECT_EQ(TEST_CRC32_RESULT, internal::crc32(0, &std_data[0], std_data_len)); for (size_t i = 1; i < std_data_len - 1; ++i) { diff --git a/cpp/src/arrow/util/date_internal.h b/cpp/src/arrow/util/date_internal.h new file mode 100644 index 00000000000..32f1cae966e --- /dev/null +++ b/cpp/src/arrow/util/date_internal.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/vendored/datetime.h" + +namespace arrow::internal { + +namespace date = arrow_vendored::date; + +// OffsetZone object is inspired by an example from date.h documentation: +// https://howardhinnant.github.io/date/tz.html#Examples + +class OffsetZone { + std::chrono::minutes offset_; + + public: + explicit OffsetZone(std::chrono::minutes offset) : offset_{offset} {} + + template + date::local_time to_local(date::sys_time tp) const { + return date::local_time{(tp + offset_).time_since_epoch()}; + } + + template + date::sys_time to_sys( + date::local_time tp, + [[maybe_unused]] date::choose = date::choose::earliest) const { + return date::sys_time{(tp - offset_).time_since_epoch()}; + } + + template + date::sys_info get_info(date::sys_time st) const { + return {date::sys_seconds::min(), date::sys_seconds::max(), offset_, + std::chrono::minutes(0), + offset_ >= std::chrono::minutes(0) ? "+" + date::format("%H%M", offset_) + : "-" + date::format("%H%M", -offset_)}; + } + + const OffsetZone* operator->() const { return this; } +}; + +} // namespace arrow::internal + +namespace arrow_vendored::date { +using arrow::internal::OffsetZone; + +template <> +struct zoned_traits { + static OffsetZone default_zone() { return OffsetZone{std::chrono::minutes{0}}; } + + static OffsetZone locate_zone(const std::string& name) { + throw std::runtime_error{"OffsetZone can't parse " + name}; + } +}; +} // namespace arrow_vendored::date diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 39c1fb067c6..9e075594d6a 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -45,6 +45,29 @@ using internal::SafeLeftShift; using internal::SafeSignedAdd; using internal::uint128_t; +namespace internal { + +Status ToArrowStatus(DecimalStatus dstatus) { + switch (dstatus) { + case DecimalStatus::kSuccess: + return Status::OK(); + + case DecimalStatus::kDivideByZero: + return Status::Invalid("Division by 0 in 
Decimal"); + + case DecimalStatus::kOverflow: + return Status::Invalid("Overflow occurred during Decimal operation"); + + case DecimalStatus::kRescaleDataLoss: + return Status::Invalid("Rescaling Decimal value would cause data loss"); + + default: + return Status::UnknownError("Unknown Decimal error"); + } +} + +} // namespace internal + namespace { struct BaseDecimalRealConversion { @@ -835,24 +858,6 @@ bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out) return pos == size; } -inline Status ToArrowStatus(DecimalStatus dstatus, int num_bits) { - switch (dstatus) { - case DecimalStatus::kSuccess: - return Status::OK(); - - case DecimalStatus::kDivideByZero: - return Status::Invalid("Division by 0 in Decimal", num_bits); - - case DecimalStatus::kOverflow: - return Status::Invalid("Overflow occurred during Decimal", num_bits, " operation."); - - case DecimalStatus::kRescaleDataLoss: - return Status::Invalid("Rescaling Decimal", num_bits, - " value would cause data loss"); - } - return Status::OK(); -} - template Status DecimalFromString(const char* type_name, std::string_view s, Decimal* out, int32_t* precision, int32_t* scale) { @@ -1105,11 +1110,7 @@ Result Decimal32::FromBigEndian(const uint8_t* bytes, int32_t length) return Decimal32(value); } -Status Decimal32::ToArrowStatus(DecimalStatus dstatus) const { - return arrow::ToArrowStatus(dstatus, 32); -} - -std::ostream& operator<<(std::ostream& os, const Decimal32& decimal) { +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal32& decimal) { os << decimal.ToIntegerString(); return os; } @@ -1132,11 +1133,7 @@ Result Decimal64::FromBigEndian(const uint8_t* bytes, int32_t length) return Decimal64(value); } -Status Decimal64::ToArrowStatus(DecimalStatus dstatus) const { - return arrow::ToArrowStatus(dstatus, 64); -} - -std::ostream& operator<<(std::ostream& os, const Decimal64& decimal) { +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal64& decimal) { 
os << decimal.ToIntegerString(); return os; } @@ -1194,11 +1191,7 @@ Result Decimal128::FromBigEndian(const uint8_t* bytes, int32_t lengt return Decimal128(high, static_cast(low)); } -Status Decimal128::ToArrowStatus(DecimalStatus dstatus) const { - return arrow::ToArrowStatus(dstatus, 128); -} - -std::ostream& operator<<(std::ostream& os, const Decimal128& decimal) { +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal128& decimal) { os << decimal.ToIntegerString(); return os; } @@ -1302,10 +1295,6 @@ Result Decimal256::FromBigEndian(const uint8_t* bytes, int32_t lengt return Decimal256(bit_util::little_endian::ToNative(little_endian_array)); } -Status Decimal256::ToArrowStatus(DecimalStatus dstatus) const { - return arrow::ToArrowStatus(dstatus, 256); -} - namespace { struct Decimal256RealConversion @@ -1451,7 +1440,7 @@ double Decimal256::ToDouble(int32_t scale) const { return Decimal256RealConversion::ToReal(*this, scale); } -std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) { +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) { os << decimal.ToIntegerString(); return os; } diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 640dc9aec15..bae0c4dd248 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -33,6 +33,18 @@ namespace arrow { class Decimal64; +namespace internal { + +ARROW_EXPORT +Status ToArrowStatus(DecimalStatus); + +} // namespace internal + +template <> +struct IntoStatus { + static inline Status ToStatus(DecimalStatus st) { return internal::ToArrowStatus(st); } +}; + /// Represents a signed 32-bit decimal value in two's complement. /// Calulations wrap around and overflow is ignored. 
/// The max decimal precision that can be safely represented is @@ -75,8 +87,7 @@ class ARROW_EXPORT Decimal32 : public BasicDecimal32 { /// \return the pair of the quotient and the remainder Result> Divide(const Decimal32& divisor) const { std::pair result; - auto dstatus = BasicDecimal32::Divide(divisor, &result.first, &result.second); - ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus)); + ARROW_RETURN_NOT_OK(BasicDecimal32::Divide(divisor, &result.first, &result.second)); return result; } @@ -108,14 +119,13 @@ class ARROW_EXPORT Decimal32 : public BasicDecimal32 { /// \brief Convert from a big-endian byte representation. The length must be /// between 1 and 4 - /// \return error statis if the length is an invalid value + /// \return error status if the length is an invalid value static Result FromBigEndian(const uint8_t* data, int32_t length); /// \brief Convert Decimal32 from one scale to another Result Rescale(int32_t original_scale, int32_t new_scale) const { Decimal32 out; - auto dstatus = BasicDecimal32::Rescale(original_scale, new_scale, &out); - ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus)); + ARROW_RETURN_NOT_OK(BasicDecimal32::Rescale(original_scale, new_scale, &out)); return out; } @@ -150,10 +160,6 @@ class ARROW_EXPORT Decimal32 : public BasicDecimal32 { ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, const Decimal32& decimal); - - private: - /// Converts internal error code to Status - Status ToArrowStatus(DecimalStatus dstatus) const; }; class ARROW_EXPORT Decimal64 : public BasicDecimal64 { @@ -189,8 +195,7 @@ class ARROW_EXPORT Decimal64 : public BasicDecimal64 { /// \return the pair of the quotient and the remainder Result> Divide(const Decimal64& divisor) const { std::pair result; - auto dstatus = BasicDecimal64::Divide(divisor, &result.first, &result.second); - ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus)); + ARROW_RETURN_NOT_OK(BasicDecimal64::Divide(divisor, &result.first, &result.second)); return result; } @@ -220,14 +225,13 @@ 
class ARROW_EXPORT Decimal64 : public BasicDecimal64 { /// \brief Convert from a big-endian byte representation. The length must be /// between 1 and 4 - /// \return error statis if the length is an invalid value + /// \return error status if the length is an invalid value static Result FromBigEndian(const uint8_t* data, int32_t length); /// \brief Convert Decimal64 from one scale to another Result Rescale(int32_t original_scale, int32_t new_scale) const { Decimal64 out; - auto dstatus = BasicDecimal64::Rescale(original_scale, new_scale, &out); - ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus)); + ARROW_RETURN_NOT_OK(BasicDecimal64::Rescale(original_scale, new_scale, &out)); return out; } @@ -262,10 +266,6 @@ class ARROW_EXPORT Decimal64 : public BasicDecimal64 { ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, const Decimal64& decimal); - - private: - /// Converts internal error code to Status - Status ToArrowStatus(DecimalStatus dstatus) const; }; /// Represents a signed 128-bit integer in two's complement. 
@@ -315,8 +315,7 @@ class ARROW_EXPORT Decimal128 : public BasicDecimal128 { /// \return the pair of the quotient and the remainder Result> Divide(const Decimal128& divisor) const { std::pair result; - auto dstatus = BasicDecimal128::Divide(divisor, &result.first, &result.second); - ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus)); + ARROW_RETURN_NOT_OK(BasicDecimal128::Divide(divisor, &result.first, &result.second)); return result; } @@ -353,8 +352,7 @@ class ARROW_EXPORT Decimal128 : public BasicDecimal128 { /// \brief Convert Decimal128 from one scale to another Result Rescale(int32_t original_scale, int32_t new_scale) const { Decimal128 out; - auto dstatus = BasicDecimal128::Rescale(original_scale, new_scale, &out); - ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus)); + ARROW_RETURN_NOT_OK(BasicDecimal128::Rescale(original_scale, new_scale, &out)); return out; } @@ -396,10 +394,6 @@ class ARROW_EXPORT Decimal128 : public BasicDecimal128 { ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, const Decimal128& decimal); - - private: - /// Converts internal error code to Status - Status ToArrowStatus(DecimalStatus dstatus) const; }; /// Represents a signed 256-bit integer in two's complement. 
@@ -453,8 +447,7 @@ class ARROW_EXPORT Decimal256 : public BasicDecimal256 { /// \brief Convert Decimal256 from one scale to another Result Rescale(int32_t original_scale, int32_t new_scale) const { Decimal256 out; - auto dstatus = BasicDecimal256::Rescale(original_scale, new_scale, &out); - ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus)); + ARROW_RETURN_NOT_OK(BasicDecimal256::Rescale(original_scale, new_scale, &out)); return out; } @@ -470,8 +463,7 @@ class ARROW_EXPORT Decimal256 : public BasicDecimal256 { /// \return the pair of the quotient and the remainder Result> Divide(const Decimal256& divisor) const { std::pair result; - auto dstatus = BasicDecimal256::Divide(divisor, &result.first, &result.second); - ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus)); + ARROW_RETURN_NOT_OK(BasicDecimal256::Divide(divisor, &result.first, &result.second)); return result; } @@ -503,10 +495,6 @@ class ARROW_EXPORT Decimal256 : public BasicDecimal256 { ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, const Decimal256& decimal); - - private: - /// Converts internal error code to Status - Status ToArrowStatus(DecimalStatus dstatus) const; }; /// For an integer type, return the max number of decimal digits diff --git a/cpp/src/arrow/util/delimiting.cc b/cpp/src/arrow/util/delimiting.cc index 4794293e0b4..0bc1b45f6bf 100644 --- a/cpp/src/arrow/util/delimiting.cc +++ b/cpp/src/arrow/util/delimiting.cc @@ -16,6 +16,7 @@ // under the License. #include "arrow/util/delimiting.h" + #include "arrow/buffer.h" #include "arrow/util/logging_internal.h" diff --git a/cpp/src/arrow/util/dict_util.cc b/cpp/src/arrow/util/dict_util.cc index feab2324a40..c93517140ca 100644 --- a/cpp/src/arrow/util/dict_util.cc +++ b/cpp/src/arrow/util/dict_util.cc @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/util/dict_util.h" +#include "arrow/util/dict_util_internal.h" + #include "arrow/array/array_dict.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/util/dict_util.h b/cpp/src/arrow/util/dict_util_internal.h similarity index 100% rename from cpp/src/arrow/util/dict_util.h rename to cpp/src/arrow/util/dict_util_internal.h diff --git a/cpp/src/arrow/util/dispatch.h b/cpp/src/arrow/util/dispatch.h deleted file mode 100644 index fae9293f9e7..00000000000 --- a/cpp/src/arrow/util/dispatch.h +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/cpu_info.h" - -namespace arrow { -namespace internal { - -enum class DispatchLevel : int { - // These dispatch levels, corresponding to instruction set features, - // are sorted in increasing order of preference. - NONE = 0, - SSE4_2, - AVX2, - AVX512, - NEON, - MAX -}; - -/* - A facility for dynamic dispatch according to available DispatchLevel. 
- - Typical use: - - static void my_function_default(...); - static void my_function_avx2(...); - - struct MyDynamicFunction { - using FunctionType = decltype(&my_function_default); - - static std::vector> implementations() { - return { - { DispatchLevel::NONE, my_function_default } - #if defined(ARROW_HAVE_RUNTIME_AVX2) - , { DispatchLevel::AVX2, my_function_avx2 } - #endif - }; - } - }; - - void my_function(...) { - static DynamicDispatch dispatch; - return dispatch.func(...); - } -*/ -template -class DynamicDispatch { - protected: - using FunctionType = typename DynamicFunction::FunctionType; - using Implementation = std::pair; - - public: - DynamicDispatch() { Resolve(DynamicFunction::implementations()); } - - FunctionType func = {}; - - protected: - // Use the Implementation with the highest DispatchLevel - void Resolve(const std::vector& implementations) { - Implementation cur{DispatchLevel::NONE, {}}; - - for (const auto& impl : implementations) { - if (impl.first >= cur.first && IsSupported(impl.first)) { - // Higher (or same) level than current - cur = impl; - } - } - - if (!cur.second) { - Status::Invalid("No appropriate implementation found").Abort(); - } - func = cur.second; - } - - private: - bool IsSupported(DispatchLevel level) const { - static const auto cpu_info = arrow::internal::CpuInfo::GetInstance(); - - switch (level) { - case DispatchLevel::NONE: - return true; - case DispatchLevel::SSE4_2: - return cpu_info->IsSupported(CpuInfo::SSE4_2); - case DispatchLevel::AVX2: - return cpu_info->IsSupported(CpuInfo::AVX2); - case DispatchLevel::AVX512: - return cpu_info->IsSupported(CpuInfo::AVX512); - default: - return false; - } - } -}; - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/dispatch_internal.h b/cpp/src/arrow/util/dispatch_internal.h new file mode 100644 index 00000000000..7ac19b0b244 --- /dev/null +++ b/cpp/src/arrow/util/dispatch_internal.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation 
(ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/cpu_info.h" + +namespace arrow { +namespace internal { + +enum class DispatchLevel : int { + // These dispatch levels, corresponding to instruction set features, + // are sorted in increasing order of preference. + NONE = 0, + SSE4_2, + AVX2, + AVX512, + NEON, + MAX +}; + +/* + A facility for dynamic dispatch according to available DispatchLevel. + + Typical use: + + static void my_function_default(...); + static void my_function_avx2(...); + + struct MyDynamicFunction { + using FunctionType = decltype(&my_function_default); + + static std::vector> implementations() { + return { + { DispatchLevel::NONE, my_function_default } + #if defined(ARROW_HAVE_RUNTIME_AVX2) + , { DispatchLevel::AVX2, my_function_avx2 } + #endif + }; + } + }; + + void my_function(...) 
{ + static DynamicDispatch dispatch; + return dispatch.func(...); + } +*/ +template +class DynamicDispatch { + protected: + using FunctionType = typename DynamicFunction::FunctionType; + using Implementation = std::pair; + + public: + DynamicDispatch() { Resolve(DynamicFunction::implementations()); } + + FunctionType func = {}; + + protected: + // Use the Implementation with the highest DispatchLevel + template + void Resolve(const Range& implementations) { + Implementation cur{DispatchLevel::NONE, {}}; + + for (const auto& impl : implementations) { + if (impl.first >= cur.first && IsSupported(impl.first)) { + // Higher (or same) level than current + cur = impl; + } + } + + if (!cur.second) { + Status::Invalid("No appropriate implementation found").Abort(); + } + func = cur.second; + } + + private: + bool IsSupported(DispatchLevel level) const { + static const auto cpu_info = arrow::internal::CpuInfo::GetInstance(); + + switch (level) { + case DispatchLevel::NONE: + return true; + case DispatchLevel::SSE4_2: + return cpu_info->IsSupported(CpuInfo::SSE4_2); + case DispatchLevel::AVX2: + return cpu_info->IsSupported(CpuInfo::AVX2); + case DispatchLevel::AVX512: + return cpu_info->IsSupported(CpuInfo::AVX512); + default: + return false; + } + } +}; + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/double_conversion.h b/cpp/src/arrow/util/double_conversion_internal.h similarity index 100% rename from cpp/src/arrow/util/double_conversion.h rename to cpp/src/arrow/util/double_conversion_internal.h diff --git a/cpp/src/arrow/util/endian.h b/cpp/src/arrow/util/endian.h index 9c603144a7f..fcc138828e7 100644 --- a/cpp/src/arrow/util/endian.h +++ b/cpp/src/arrow/util/endian.h @@ -24,7 +24,7 @@ # include // IWYU pragma: keep # elif defined(sun) || defined(__sun) # include // IWYU pragma: keep -# else +# elif !defined(_AIX) # include // IWYU pragma: keep # endif # diff --git a/cpp/src/arrow/util/float16.cc b/cpp/src/arrow/util/float16.cc index 
5c8b3d10ca0..d3dc91b849a 100644 --- a/cpp/src/arrow/util/float16.cc +++ b/cpp/src/arrow/util/float16.cc @@ -220,7 +220,9 @@ Float16 Float16::FromDouble(double d) { return FromBits(BinaryConverter::ToBinary16(d_bits)); } -std::ostream& operator<<(std::ostream& os, Float16 arg) { return (os << arg.ToFloat()); } +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, Float16 arg) { + return (os << arg.ToFloat()); +} } // namespace util } // namespace arrow diff --git a/cpp/src/arrow/util/float16.h b/cpp/src/arrow/util/float16.h index b7455dad537..b52145cdc0c 100644 --- a/cpp/src/arrow/util/float16.h +++ b/cpp/src/arrow/util/float16.h @@ -147,6 +147,9 @@ class ARROW_EXPORT Float16 { ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, Float16 arg); + static constexpr Float16 zero() { return FromBits(0); } + static constexpr Float16 one() { return FromBits(0x3c00); } + protected: uint16_t bits_; @@ -175,7 +178,9 @@ class ARROW_EXPORT Float16 { } }; +static_assert(std::is_standard_layout_v); static_assert(std::is_trivial_v); +static_assert(sizeof(Float16) == sizeof(uint16_t)); } // namespace util } // namespace arrow diff --git a/cpp/src/arrow/util/formatting.cc b/cpp/src/arrow/util/formatting.cc index 97567d86321..58dadd0b11e 100644 --- a/cpp/src/arrow/util/formatting.cc +++ b/cpp/src/arrow/util/formatting.cc @@ -17,7 +17,7 @@ #include "arrow/util/formatting.h" #include "arrow/util/config.h" -#include "arrow/util/double_conversion.h" +#include "arrow/util/double_conversion_internal.h" #include "arrow/util/float16.h" #include "arrow/util/logging_internal.h" diff --git a/cpp/src/arrow/util/formatting.h b/cpp/src/arrow/util/formatting.h index f2e3622ce60..844b6fb91a8 100644 --- a/cpp/src/arrow/util/formatting.h +++ b/cpp/src/arrow/util/formatting.h @@ -30,9 +30,8 @@ #include #include "arrow/status.h" -#include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/type_traits.h" -#include "arrow/util/double_conversion.h" #include 
"arrow/util/macros.h" #include "arrow/util/string.h" #include "arrow/util/time.h" diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 029f6bcd6c2..ac3beea2660 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -40,6 +40,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/endian.h" +#include "arrow/util/float16.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" @@ -143,6 +144,21 @@ struct ScalarHelper:: } }; +template +struct ScalarHelper>> + : public ScalarHelperBase { + // ScalarHelper specialization for Float16 + + static bool CompareScalars(Scalar u, Scalar v) { + if (u.is_nan()) { + // XXX should we do a bit-precise comparison? + return v.is_nan(); + } + return u == v; + } +}; + template hash_t ComputeStringHash(const void* data, int64_t length) { if (ARROW_PREDICT_TRUE(length <= 16)) { @@ -406,7 +422,9 @@ class ScalarMemoTable : public MemoTable { explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0) : hash_table_(pool, static_cast(entries)) {} - int32_t Get(const Scalar& value) const { + template + int32_t Get(Value&& v) const { + const Scalar value(std::forward(v)); auto cmp_func = [value](const Payload* payload) -> bool { return ScalarHelper::CompareScalars(payload->value, value); }; @@ -419,9 +437,10 @@ class ScalarMemoTable : public MemoTable { } } - template - Status GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found, + template + Status GetOrInsert(Value&& v, Func1&& on_found, Func2&& on_not_found, int32_t* out_memo_index) { + const Scalar value(std::forward(v)); auto cmp_func = [value](const Payload* payload) -> bool { return ScalarHelper::CompareScalars(value, payload->value); }; @@ -440,7 +459,8 @@ class ScalarMemoTable : public MemoTable { return Status::OK(); } - Status GetOrInsert(const Scalar& value, int32_t* out_memo_index) { + template + Status GetOrInsert(Value&& 
value, int32_t* out_memo_index) { return GetOrInsert( value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index); } @@ -470,23 +490,30 @@ class ScalarMemoTable : public MemoTable { } // Copy values starting from index `start` into `out_data` - void CopyValues(int32_t start, Scalar* out_data) const { + template + void CopyValues(int32_t start, Value* out_data) const { + // So that both uint16_t and Float16 are allowed + static_assert(sizeof(Value) == sizeof(Scalar)); + Scalar* out = reinterpret_cast(out_data); hash_table_.VisitEntries([=](const HashTableEntry* entry) { int32_t index = entry->payload.memo_index - start; if (index >= 0) { - out_data[index] = entry->payload.value; + out[index] = entry->payload.value; } }); // Zero-initialize the null entry if (null_index_ != kKeyNotFound) { int32_t index = null_index_ - start; if (index >= 0) { - out_data[index] = Scalar{}; + out[index] = Scalar{}; } } } - void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); } + template + void CopyValues(Value* out_data) const { + CopyValues(0, out_data); + } protected: struct Payload { @@ -903,6 +930,11 @@ struct HashTraits::value && !is_8bit_int::value> using MemoTableType = ScalarMemoTable; }; +template <> +struct HashTraits { + using MemoTableType = ScalarMemoTable<::arrow::util::Float16>; +}; + template struct HashTraits::value && !std::is_base_of::value>> { diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index 1ed28d717d3..50f3bd9a15e 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -115,6 +115,7 @@ #elif __linux__ # include # include +# include #endif #ifdef _WIN32 @@ -1069,8 +1070,11 @@ Result FileOpenReadable(const PlatformFilename& file_name) { } fd = FileDescriptor(ret); #else - int ret = open(file_name.ToNative().c_str(), O_RDONLY); - if (ret < 0) { + int ret; + do { + ret = open(file_name.ToNative().c_str(), O_RDONLY); + } while (ret == -1 && errno == EINTR); + if (ret == -1) { return 
IOErrorFromErrno(errno, "Failed to open local file '", file_name.ToString(), "'"); } @@ -1136,7 +1140,10 @@ Result FileOpenWritable(const PlatformFilename& file_name, oflag |= O_RDWR; } - int ret = open(file_name.ToNative().c_str(), oflag, 0666); + int ret; + do { + ret = open(file_name.ToNative().c_str(), oflag, 0666); + } while (ret == -1 && errno == EINTR); if (ret == -1) { return IOErrorFromErrno(errno, "Failed to open local file '", file_name.ToString(), "'"); @@ -1447,7 +1454,7 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, SetFilePointer(h, new_size_low, &new_size_high, FILE_BEGIN); SetEndOfFile(h); - fm = CreateFileMapping(h, NULL, PAGE_READWRITE, 0, 0, ""); + fm = CreateFileMappingW(h, NULL, PAGE_READWRITE, 0, 0, L""); if (fm == NULL) { return StatusFromMmapErrno("CreateFileMapping failed"); } @@ -2141,11 +2148,6 @@ uint64_t GetThreadId() { return equiv; } -uint64_t GetOptionalThreadId() { - auto tid = GetThreadId(); - return (tid == 0) ? tid - 1 : tid; -} - // Returns the current resident set size (physical memory use) measured // in bytes, or zero if the value cannot be determined on this OS. 
int64_t GetCurrentRSS() { @@ -2224,6 +2226,22 @@ int64_t GetTotalMemoryBytes() { #endif } +Result GetNumAffinityCores() { +#if defined(__linux__) + cpu_set_t mask; + if (sched_getaffinity(0, sizeof(mask), &mask) == 0) { + auto count = CPU_COUNT(&mask); + if (count > 0 && + static_cast(count) < std::numeric_limits::max()) { + return static_cast(count); + } + } + return IOErrorFromErrno(errno, "Could not read the CPU affinity."); +#else + return Status::NotImplemented("Only implemented for Linux"); +#endif +} + Result LoadDynamicLibrary(const char* path) { #ifdef _WIN32 ARROW_ASSIGN_OR_RAISE(auto platform_path, PlatformFilename::FromString(path)); diff --git a/cpp/src/arrow/util/io_util.h b/cpp/src/arrow/util/io_util.h index 892641d4bc5..e9f218b5205 100644 --- a/cpp/src/arrow/util/io_util.h +++ b/cpp/src/arrow/util/io_util.h @@ -419,6 +419,12 @@ int64_t GetCurrentRSS(); ARROW_EXPORT int64_t GetTotalMemoryBytes(); +/// \brief Get the number of affinity cores on the system. +/// +/// This is only implemented on Linux. +/// If a value is returned, it is guaranteed to be greater than or equal to one. +ARROW_EXPORT Result GetNumAffinityCores(); + /// \brief Load a dynamic library /// /// This wraps dlopen() except on Windows, where LoadLibrary() is called. 
diff --git a/cpp/src/arrow/util/io_util_test.cc b/cpp/src/arrow/util/io_util_test.cc index 1ff8fcf7adb..885f2355f4e 100644 --- a/cpp/src/arrow/util/io_util_test.cc +++ b/cpp/src/arrow/util/io_util_test.cc @@ -1123,5 +1123,16 @@ TEST(Memory, TotalMemory) { #endif } +TEST(CpuAffinity, NumberOfCores) { + auto maybe_affinity_cores = GetNumAffinityCores(); +#ifdef __linux__ + ASSERT_OK_AND_ASSIGN(auto affinity_cores, maybe_affinity_cores); + ASSERT_GE(affinity_cores, 1); + ASSERT_LE(affinity_cores, std::thread::hardware_concurrency()); +#else + ASSERT_RAISES(NotImplemented, maybe_affinity_cores); +#endif +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/iterator.h b/cpp/src/arrow/util/iterator.h index d18cb90f042..dc7fd1d84cc 100644 --- a/cpp/src/arrow/util/iterator.h +++ b/cpp/src/arrow/util/iterator.h @@ -165,7 +165,7 @@ class Iterator : public util::EqualityComparable> { } Result operator*() { - ARROW_RETURN_NOT_OK(value_.status()); + ARROW_RETURN_NOT_OK(value_); auto value = std::move(value_); value_ = IterationTraits::End(); diff --git a/cpp/src/arrow/util/iterator_test.cc b/cpp/src/arrow/util/iterator_test.cc index a247ba13aef..64148e58402 100644 --- a/cpp/src/arrow/util/iterator_test.cc +++ b/cpp/src/arrow/util/iterator_test.cc @@ -350,10 +350,10 @@ TEST(TestFunctionIterator, RangeForLoop) { int expected_i = 0; for (auto maybe_i : fails_at_3) { if (expected_i < 3) { - ASSERT_OK(maybe_i.status()); + ASSERT_OK(maybe_i); ASSERT_EQ(*maybe_i, expected_i); } else if (expected_i == 3) { - ASSERT_RAISES(IndexError, maybe_i.status()); + ASSERT_RAISES(IndexError, maybe_i); } ASSERT_LE(expected_i, 3) << "iteration stops after an error is encountered"; ++expected_i; @@ -499,7 +499,7 @@ TEST(ReadaheadIterator, NextError) { ASSERT_OK_AND_ASSIGN( auto it, MakeReadaheadIterator(Iterator(std::move(tracing_it)), 2)); - ASSERT_RAISES(IOError, it.Next().status()); + ASSERT_RAISES(IOError, it.Next()); AssertIteratorExhausted(it); SleepABit(); diff 
--git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index 4390a4cb795..48e02c61202 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -30,7 +30,7 @@ #include "arrow/status.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging_internal.h" -#include "arrow/util/sort.h" +#include "arrow/util/sort_internal.h" using std::size_t; diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index e8c24892aab..460888f6d75 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -70,7 +70,7 @@ enum class ArrowLogLevel : int { // of 'msg' followed by the status. # define ARROW_CHECK_OK_PREPEND(to_call, msg, level) \ do { \ - ::arrow::Status _s = (to_call); \ + ::arrow::Status _s = ::arrow::ToStatus(to_call); \ ARROW_CHECK_OR_LOG(_s.ok(), level) \ << "Operation failed: " << ARROW_STRINGIFY(to_call) << "\n" \ << (msg) << ": " << _s.ToString(); \ diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index af29fd636b5..55bc1eeb1d2 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -171,6 +171,19 @@ // ---------------------------------------------------------------------- +// Macros to disable warnings about undeclared global functions +#if defined(__GNUC__) +# define ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING \ + _Pragma("GCC diagnostic push"); \ + _Pragma("GCC diagnostic ignored \"-Wmissing-declarations\"") +# define ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING _Pragma("GCC diagnostic pop") +#else +# define ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING +# define ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING +#endif + +// ---------------------------------------------------------------------- + // macros to disable padding // these macros are portable across different compilers and platforms //[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355] diff 
--git a/cpp/src/arrow/util/map.h b/cpp/src/arrow/util/map_internal.h similarity index 100% rename from cpp/src/arrow/util/map.h rename to cpp/src/arrow/util/map_internal.h diff --git a/cpp/src/arrow/util/math_internal.h b/cpp/src/arrow/util/math_internal.h index 3ff30cabf2e..a57083ce7f8 100644 --- a/cpp/src/arrow/util/math_internal.h +++ b/cpp/src/arrow/util/math_internal.h @@ -59,4 +59,20 @@ double NeumaierSum(Range&& inputs) { return sum + c; } +/// Base-2 logarithm that works only on powers of two. +template +constexpr T ReversePow2(T x) { + constexpr T kByteLen = 8; + for (T n = 0, y = 1; n <= (kByteLen * static_cast(sizeof(T))); ++n, y = y * 2) { + if (y == x) { + return n; + } + } + return 0; +} + +static_assert(ReversePow2(8) == 3); +static_assert(ReversePow2(4) == 2); +static_assert(ReversePow2(2) == 1); + } // namespace arrow::internal diff --git a/cpp/src/arrow/util/memory.cc b/cpp/src/arrow/util/memory.cc index e91009d5860..89e9b32ee14 100644 --- a/cpp/src/arrow/util/memory.cc +++ b/cpp/src/arrow/util/memory.cc @@ -18,7 +18,7 @@ #include #include "arrow/util/logging.h" -#include "arrow/util/memory.h" +#include "arrow/util/memory_internal.h" #include "arrow/util/thread_pool.h" namespace arrow { @@ -29,10 +29,14 @@ inline uint8_t* pointer_logical_and(const uint8_t* address, uintptr_t bits) { return reinterpret_cast(value & bits); } +namespace { + // This function is just for avoiding MinGW-w64 32bit crash. // See also: https://sourceforge.net/p/mingw-w64/bugs/767/ void* wrap_memcpy(void* dst, const void* src, size_t n) { return memcpy(dst, src, n); } +} // namespace + void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes, uintptr_t block_size, int num_threads) { // XXX This function is really using `num_threads + 1` threads. 
diff --git a/cpp/src/arrow/util/memory.h b/cpp/src/arrow/util/memory_internal.h similarity index 100% rename from cpp/src/arrow/util/memory.h rename to cpp/src/arrow/util/memory_internal.h diff --git a/cpp/src/arrow/util/meson.build b/cpp/src/arrow/util/meson.build index be93945dc65..2fbbedbb931 100644 --- a/cpp/src/arrow/util/meson.build +++ b/cpp/src/arrow/util/meson.build @@ -37,23 +37,23 @@ conf_data.set('UPPERCASE_BUILD_TYPE', get_option('buildtype').to_upper()) conf_data.set('ARROW_PACKAGE_KIND', get_option('package_kind')) -conf_data.set('ARROW_COMPUTE', false) +conf_data.set('ARROW_COMPUTE', needs_compute) conf_data.set('ARROW_CSV', false) conf_data.set('ARROW_CUDA', false) conf_data.set('ARROW_DATASET', false) conf_data.set('ARROW_FILESYSTEM', false) conf_data.set('ARROW_FLIGHT', false) conf_data.set('ARROW_FLIGHT_SQL', false) -conf_data.set('ARROW_IPC', false) +conf_data.set('ARROW_IPC', needs_ipc) conf_data.set('ARROW_JEMALLOC', false) conf_data.set('ARROW_JEMALLOC_VENDORED', false) -conf_data.set('ARROW_JSON', false) +conf_data.set('ARROW_JSON', needs_json) conf_data.set('ARROW_MIMALLOC', false) conf_data.set('ARROW_ORC', false) -conf_data.set('ARROW_PARQUET', false) +conf_data.set('ARROW_PARQUET', needs_parquet) conf_data.set('ARROW_SUBSTRAIT', false) conf_data.set('ARROW_AZURE', false) -conf_data.set('ARROW_ENABLE_THREADING', false) +conf_data.set('ARROW_ENABLE_THREADING', true) conf_data.set('ARROW_GCS', false) conf_data.set('ARROW_HDFS', false) conf_data.set('ARROW_S3', false) @@ -62,18 +62,18 @@ conf_data.set('ARROW_USE_GLOG', false) has_int128 = cpp_compiler.has_define('__SIZEOF_INT128__') conf_data.set('ARROW_USE_NATIVE_INT128', has_int128) -conf_data.set('ARROW_WITH_BROTLI', false) -conf_data.set('ARROW_WITH_BZ2', false) -conf_data.set('ARROW_WITH_LZ4', false) +conf_data.set('ARROW_WITH_BROTLI', needs_brotli) +conf_data.set('ARROW_WITH_BZ2', needs_bz2) +conf_data.set('ARROW_WITH_LZ4', needs_lz4) conf_data.set('ARROW_WITH_MUSL', false) 
conf_data.set('ARROW_WITH_OPENTELEMETRY', false) conf_data.set('ARROW_WITH_RE2', false) -conf_data.set('ARROW_WITH_SNAPPY', false) +conf_data.set('ARROW_WITH_SNAPPY', needs_snappy) conf_data.set('ARROW_WITH_UCX', false) conf_data.set('ARROW_WITH_UTF8PROC', false) -conf_data.set('ARROW_WITH_ZLIB', false) -conf_data.set('ARROW_WITH_ZSTD', false) -conf_data.set('PARQUET_REQUIRE_ENCRYPTION', false) +conf_data.set('ARROW_WITH_ZLIB', needs_zlib) +conf_data.set('ARROW_WITH_ZSTD', needs_zstd) +conf_data.set('PARQUET_REQUIRE_ENCRYPTION', needs_parquet_encryption) configure_file( input: 'config.h.cmake', @@ -117,14 +117,7 @@ install_headers( 'bitmap_visit.h', 'bitmap_writer.h', 'bit_run_reader.h', - 'bitset_stack.h', 'bit_util.h', - 'bpacking64_default.h', - 'bpacking_avx2.h', - 'bpacking_avx512.h', - 'bpacking_default.h', - 'bpacking.h', - 'bpacking_neon.h', 'byte_size.h', 'cancel.h', 'checked_cast.h', @@ -132,15 +125,11 @@ install_headers( 'compression.h', 'concurrent_map.h', 'converter.h', - 'counting_semaphore.h', 'cpu_info.h', 'crc32.h', 'debug.h', 'decimal.h', 'delimiting.h', - 'dict_util.h', - 'dispatch.h', - 'double_conversion.h', 'endian.h', 'float16.h', 'formatting.h', @@ -158,14 +147,11 @@ install_headers( 'logger.h', 'logging.h', 'macros.h', - 'map.h', 'math_constants.h', - 'memory.h', 'mutex.h', 'parallel.h', 'pcg_random.h', 'prefetch.h', - 'print.h', 'queue.h', 'range.h', 'ree_util.h', @@ -173,19 +159,14 @@ install_headers( 'rows_to_batches.h', 'simd.h', 'small_vector.h', - 'sort.h', - 'spaced.h', 'span.h', - 'stopwatch.h', - 'string_builder.h', + 'string_util.h', 'string.h', 'task_group.h', - 'tdigest.h', 'test_common.h', 'thread_pool.h', 'time.h', 'tracing.h', - 'trie.h', 'type_fwd.h', 'type_traits.h', 'ubsan.h', @@ -246,17 +227,16 @@ if host_machine.system() == 'windows' # This manifest enables long file paths on Windows 10+ # See https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#enable-long-paths-in-windows-10-version-1607-and-later if 
cpp_compiler.get_id() == 'msvc' - utility_test_sources += ['io_util_test.manifest'] + utility_test_srcs += ['io_util_test.manifest'] else - utility_test_sources += ['io_util_test.rc'] + utility_test_srcs += ['io_util_test.rc'] endif endif exc = executable( 'arrow-utility-test', sources: utility_test_srcs, - dependencies: [arrow_dep, filesystem_dep, gtest_dep, gmock_dep], - link_with: [arrow_test_lib], + dependencies: arrow_test_dep_no_main, implicit_include_directories: false, ) test('arrow-utility-test', exc) diff --git a/cpp/src/arrow/util/print.h b/cpp/src/arrow/util/print_internal.h similarity index 100% rename from cpp/src/arrow/util/print.h rename to cpp/src/arrow/util/print_internal.h diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 6f133f15b00..2420270f3ab 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -21,18 +21,17 @@ #pragma once #include -#include +#include #include -#include +#include +#include -#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_stream_utils_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/macros.h" -namespace arrow { -namespace util { +namespace arrow::util { /// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs /// are sufficiently long, RLE is used, otherwise, the values are just bit-packed @@ -82,80 +81,411 @@ namespace util { /// 200 ints = 25 groups of 8 /// <25 bytes of values, bitpacked> /// (total 26 bytes, 1 byte overhead) -// -/// Decoder class for RLE encoded data. -class RleDecoder { +/// The type for an encoded Rle of BitPacked run size, between 1 and 2^31-1 as per Parquet +/// spec. +/// This is also pragmatically used for other integer used in the Rle and BitPacked runs +/// and decoder to avoid conversions. +/// It can therefore be referred to as a "typical" size for Rle and BitPacked logic. 
+using rle_size_t = int32_t; + +template +class RleRunDecoder; + +/// A Single Run Length Encoded run. +/// +/// Consists of a single value repeated multiple times. +/// A previous version of this class also stored the value bit width to be self-contained, +/// removing it and passing it explicitly when needed proved to speed up decoding up to +/// 10 % on some benchmarks. +class RleRun { public: - /// Create a decoder object. buffer/buffer_len is the decoded data. - /// bit_width is the width of each value (before encoding). - RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) - : bit_reader_(buffer, buffer_len), - bit_width_(bit_width), - current_value_(0), - repeat_count_(0), - literal_count_(0) { - ARROW_DCHECK_GE(bit_width_, 0); - ARROW_DCHECK_LE(bit_width_, 64); + /// The decoder class used to decode a single run in the given type. + template + using DecoderType = RleRunDecoder; + + constexpr RleRun() noexcept = default; + + explicit RleRun(const uint8_t* data, rle_size_t values_count, + rle_size_t value_bit_width) noexcept + : values_count_(values_count) { + ARROW_DCHECK_GE(value_bit_width, 0); + ARROW_DCHECK_GE(values_count, 0); + std::copy(data, data + raw_data_size(value_bit_width), data_.begin()); } - RleDecoder() : bit_width_(-1) {} + /// The number of repeated values in this run. + constexpr rle_size_t values_count() const noexcept { return values_count_; } - void Reset(const uint8_t* buffer, int buffer_len, int bit_width) { - ARROW_DCHECK_GE(bit_width, 0); - ARROW_DCHECK_LE(bit_width, 64); - bit_reader_.Reset(buffer, buffer_len); - bit_width_ = bit_width; - current_value_ = 0; - repeat_count_ = 0; - literal_count_ = 0; + /// A pointer to the repeated value raw bytes. + constexpr const uint8_t* raw_data_ptr() const noexcept { return data_.data(); } + + /// The number of bytes used for the raw repeated value. 
+ constexpr rle_size_t raw_data_size(rle_size_t value_bit_width) const noexcept { + auto out = bit_util::BytesForBits(value_bit_width); + ARROW_DCHECK_LE(out, std::numeric_limits::max()); + return static_cast(out); } - /// Gets the next value. Returns false if there are no more. - template - bool Get(T* val); + private: + /// The repeated value raw bytes stored inside the class with enough space to store + /// up to a 64 bit value. + std::array data_ = {}; + /// The number of times the value is repeated. + rle_size_t values_count_ = 0; +}; - /// Gets a batch of values. Returns the number of decoded elements. +template +class BitPackedRunDecoder; + +/// A single bit packed run. +/// +/// Consists of a view on a buffer of bytes that encode integers on ``value_bit_width`` +/// bits (that is the numbers are small enough that high order bits are all zeros and can +/// be omitted). +/// A previous version of this class also stored the value bit width to be self-contained, +/// removing it and passing it explicitly when needed proved to speed up decoding up to +/// 10 % on some benchmarks. +class BitPackedRun { + public: + /// The decoder class used to decode a single run in the given type. 
template - int GetBatch(T* values, int batch_size); + using DecoderType = BitPackedRunDecoder; - /// Like GetBatch but add spacing for null entries - template - int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, T* out); + constexpr BitPackedRun() noexcept = default; + + constexpr BitPackedRun(const uint8_t* data, rle_size_t values_count, + rle_size_t value_bit_width) noexcept + : data_(data), values_count_(values_count) { + ARROW_CHECK_GE(value_bit_width, 0); + ARROW_CHECK_GE(values_count_, 0); + } + + constexpr rle_size_t values_count() const noexcept { return values_count_; } + + constexpr const uint8_t* raw_data_ptr() const noexcept { return data_; } + + constexpr rle_size_t raw_data_size(rle_size_t value_bit_width) const noexcept { + auto out = bit_util::BytesForBits(static_cast(value_bit_width) * + static_cast(values_count_)); + ARROW_CHECK_LE(out, std::numeric_limits::max()); + return static_cast(out); + } + + private: + /// The pointer to the beginning of the run + const uint8_t* data_ = nullptr; + /// Number of values in this run. + rle_size_t values_count_ = 0; +}; + +/// A parser that emits either a ``BitPackedRun`` or a ``RleRun``. +class RleBitPackedParser { + public: + /// The different types of runs emitted by the parser + using dynamic_run_type = std::variant; + + constexpr RleBitPackedParser() noexcept = default; + + constexpr RleBitPackedParser(const uint8_t* data, rle_size_t data_size, + rle_size_t value_bit_width) noexcept + : data_(data), data_size_(data_size), value_bit_width_(value_bit_width) {} + + constexpr void Reset(const uint8_t* data, rle_size_t data_size, + rle_size_t value_bit_width) noexcept { + *this = {data, data_size, value_bit_width}; + } + + /// Whether there is still runs to iterate over. + /// + /// WARN: Due to simplistic error handling, iteration with Next and Peek could + /// fail to return data while the parser is not exhausted. 
+ /// This is how one can check for errors. + bool exhausted() const { return data_size_ == 0; } + + /// Enum to return from an ``Parse`` handler. + /// + /// Since a callback has no way to know when to stop, the handler must return + /// a value indicating to the ``Parse`` function whether to stop or continue. + enum class ControlFlow { + Continue, + Break, + }; + + /// A callback approach to parsing. + /// + /// This approach is used to reduce the number of dynamic lookups involved with using a + /// variant. + /// + /// The handler must be of the form + /// ```cpp + /// struct Handler { + /// ControlFlow OnBitPackedRun(BitPackedRun run); + /// + /// ControlFlow OnRleRun(RleRun run); + /// }; + /// ``` + template + void Parse(Handler&& handler); + + private: + /// The pointer to the beginning of the run + const uint8_t* data_ = nullptr; + /// Size in bytes of the run. + rle_size_t data_size_ = 0; + /// The size in bit of a packed value in the run + rle_size_t value_bit_width_ = 0; + + /// Run the handler on the run read and return the number of values read. + /// Does not advance the parser. + template + std::pair PeekImpl(Handler&&) const; +}; + +/// Decoder class for a single run of RLE encoded data. +template +class RleRunDecoder { + public: + /// The type in which the data should be decoded. + using value_type = T; + /// The type of run that can be decoded. + using RunType = RleRun; + + constexpr RleRunDecoder() noexcept = default; + + explicit RleRunDecoder(const RunType& run, rle_size_t value_bit_width) noexcept { + Reset(run, value_bit_width); + } + + void Reset(const RunType& run, rle_size_t value_bit_width) noexcept { + remaining_count_ = run.values_count(); + if constexpr (std::is_same_v) { + // ARROW-18031: just check the LSB of the next byte and move on. + // If we memcpy + FromLittleEndian, we have potential undefined behavior + // if the bool value isn't 0 or 1. 
+ value_ = *run.raw_data_ptr() & 1; + } else { + // Memcopy is required to avoid undefined behavior. + value_ = {}; + std::memcpy(&value_, run.raw_data_ptr(), run.raw_data_size(value_bit_width)); + value_ = ::arrow::bit_util::FromLittleEndian(value_); + } + } + + /// Return the number of values that can be advanced. + rle_size_t remaining() const { return remaining_count_; } + + /// Return the repeated value of this decoder. + constexpr value_type value() const { return value_; } + + /// Try to advance by as many values as provided. + /// Return the number of values skipped. + /// May advance by less than asked for if there are not enough values left. + [[nodiscard]] rle_size_t Advance(rle_size_t batch_size, rle_size_t value_bit_width) { + const auto steps = std::min(batch_size, remaining_count_); + remaining_count_ -= steps; + return steps; + } + + /// Get the next value and return false if there are no more. + [[nodiscard]] constexpr bool Get(value_type* out_value, rle_size_t value_bit_width) { + return GetBatch(out_value, 1, value_bit_width) == 1; + } + + /// Get a batch of values return the number of decoded elements. + /// May write fewer elements to the output than requested if there are not enough values + /// left. + [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size, + rle_size_t value_bit_width) { + if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { + return 0; + } + + const auto to_read = std::min(remaining_count_, batch_size); + std::fill(out, out + to_read, value_); + remaining_count_ -= to_read; + return to_read; + } + + private: + value_type value_ = {}; + rle_size_t remaining_count_ = 0; + + static_assert(std::is_integral_v, + "This class is meant to decode positive integers"); +}; + +/// Decoder class for single run of bit-packed encoded data. +template +class BitPackedRunDecoder { + public: + /// The type in which the data should be decoded. + using value_type = T; + /// The type of run that can be decoded. 
+ using RunType = BitPackedRun; + + BitPackedRunDecoder() noexcept = default; + + explicit BitPackedRunDecoder(const RunType& run, rle_size_t value_bit_width) noexcept { + Reset(run, value_bit_width); + } + + void Reset(const RunType& run, rle_size_t value_bit_width) noexcept { + remaining_count_ = run.values_count(); + ARROW_DCHECK_GE(value_bit_width, 0); + ARROW_DCHECK_LE(value_bit_width, 64); + bit_reader_.Reset(run.raw_data_ptr(), run.raw_data_size(value_bit_width)); + } + + /// Return the number of values that can be advanced. + constexpr rle_size_t remaining() const { return remaining_count_; } + + /// Try to advance by as many values as provided. + /// Return the number of values skipped or 0 if it fail to advance. + /// May advance by less than asked for if there are not enough values left. + [[nodiscard]] rle_size_t Advance(rle_size_t batch_size, rle_size_t value_bit_width) { + const auto steps = std::min(batch_size, remaining_count_); + if (bit_reader_.Advance(steps * value_bit_width)) { + remaining_count_ -= steps; + return steps; + } + return 0; + } + + /// Get the next value and return false if there are no more or an error occurred. + [[nodiscard]] constexpr bool Get(value_type* out_value, rle_size_t value_bit_width) { + return GetBatch(out_value, 1, value_bit_width) == 1; + } + + /// Get a batch of values return the number of decoded elements. + /// May write fewer elements to the output than requested if there are not enough values + /// left or if an error occurred. + [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size, + rle_size_t value_bit_width) { + if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) { + return 0; + } + + const auto to_read = std::min(remaining_count_, batch_size); + const auto actual_read = bit_reader_.GetBatch(value_bit_width, out, to_read); + // There should not be any reason why the actual read would be different + // but this is error resistant. 
+ remaining_count_ -= actual_read; + return actual_read; + } + + private: + ::arrow::bit_util::BitReader bit_reader_ = {}; + rle_size_t remaining_count_ = 0; + + static_assert(std::is_integral_v, + "This class is meant to decode positive integers"); +}; + +/// Decoder class for Parquet RLE bit-packed data. +template +class RleBitPackedDecoder { + public: + /// The type in which the data should be decoded. + using value_type = T; + using DynamicRun = RleBitPackedParser::dynamic_run_type; + + RleBitPackedDecoder() noexcept = default; + + /// Create a decoder object. + /// + /// data and data_size are the raw bytes to decode. + /// value_bit_width is the size in bits of each encoded value. + RleBitPackedDecoder(const uint8_t* data, rle_size_t data_size, + rle_size_t value_bit_width) noexcept { + Reset(data, data_size, value_bit_width); + } + + void Reset(const uint8_t* data, rle_size_t data_size, + rle_size_t value_bit_width) noexcept { + ARROW_DCHECK_GE(value_bit_width, 0); + ARROW_DCHECK_LE(value_bit_width, 64); + parser_.Reset(data, data_size, value_bit_width); + decoder_ = {}; + value_bit_width_ = value_bit_width; + } + + /// Whether there is still runs to iterate over. + /// + /// WARN: Due to lack of proper error handling, iteration with Get methods could return + /// no data while the parser is not exhausted. + /// This is how one can check for errors. + bool exhausted() const { return (run_remaining() == 0) && parser_.exhausted(); } + + /// Gets the next value or returns false if there are no more or an error occurred. + /// + /// NB: Because the encoding only supports literal runs with lengths + /// that are multiples of 8, RleEncoder sometimes pads the end of its + /// input with zeros. Since the encoding does not differentiate between + /// input values and padding, Get() returns true even for these padding + /// values. + [[nodiscard]] bool Get(value_type* val); + + /// Get a batch of values return the number of decoded elements. 
+ /// May write fewer elements to the output than requested if there are not enough values
+ /// left or if an error occurred.
+ [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size);
+
+ /// Like GetBatch but add spacing for null entries.
+ ///
+ /// Null entries will be set to an arbitrary value to avoid leaking private data.
+ /// May write fewer elements to the output than requested if there are not enough values
+ /// left or if an error occurred.
+ [[nodiscard]] rle_size_t GetBatchSpaced(rle_size_t batch_size, rle_size_t null_count,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset, value_type* out);
 /// Like GetBatch but the values are then decoded using the provided dictionary
- template
- int GetBatchWithDict(const T* dictionary, int32_t dictionary_length, T* values,
- int batch_size);
+ ///
+ /// May write fewer elements to the output than requested if there are not enough values
+ /// left or if an error occurred.
+ template
+ [[nodiscard]] rle_size_t GetBatchWithDict(const V* dictionary,
+ int32_t dictionary_length, V* out,
+ rle_size_t batch_size);
 /// Like GetBatchWithDict but add spacing for null entries
 ///
- /// Null entries will be zero-initialized in `values` to avoid leaking
- /// private data.
- template
- int GetBatchWithDictSpaced(const T* dictionary, int32_t dictionary_length, T* values,
- int batch_size, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset);
-
- protected:
- ::arrow::bit_util::BitReader bit_reader_;
- /// Number of bits needed to encode the value. Must be between 0 and 64.
- int bit_width_;
- uint64_t current_value_;
- int32_t repeat_count_;
- int32_t literal_count_;
+ /// Null entries will be set to an arbitrary value to avoid leaking private data.
+ /// May write fewer elements to the output than requested if there are not enough values
+ /// left or if an error occurred.
+ template + [[nodiscard]] rle_size_t GetBatchWithDictSpaced( + const V* dictionary, int32_t dictionary_length, V* out, rle_size_t batch_size, + rle_size_t null_count, const uint8_t* valid_bits, int64_t valid_bits_offset); private: - /// Fills literal_count_ and repeat_count_ with next values. Returns false if there - /// are no more. - template - bool NextCounts(); + RleBitPackedParser parser_ = {}; + std::variant, BitPackedRunDecoder> decoder_ = {}; + rle_size_t value_bit_width_; + + /// Return the number of values that are remaining in the current run. + rle_size_t run_remaining() const { + return std::visit([](const auto& dec) { return dec.remaining(); }, decoder_); + } + + /// Get a batch of values from the current run and return the number elements read. + [[nodiscard]] rle_size_t RunGetBatch(value_type* out, rle_size_t batch_size) { + return std::visit( + [&](auto& dec) { return dec.GetBatch(out, batch_size, value_bit_width_); }, + decoder_); + } + + /// Call the parser with a single callable for all event types. + template + void ParseWithCallable(Callable&& func); /// Utility methods for retrieving spaced values. - template - int GetSpaced(Converter converter, int batch_size, int null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset, T* out); + template + [[nodiscard]] rle_size_t GetSpaced(Converter converter, + typename Converter::out_type* out, + rle_size_t batch_size, const uint8_t* valid_bits, + int64_t valid_bits_offset, rle_size_t null_count); }; /// Class to incrementally build the rle data. This class does not allocate any memory. @@ -164,7 +494,7 @@ class RleDecoder { /// This class does so by buffering 8 values at a time. If they are not all the same /// they are added to the literal run. If they are the same, they are added to the /// repeated run. When we switch modes, the previous run is flushed out. -class RleEncoder { +class RleBitPackedEncoder { public: /// buffer/buffer_len: preallocated output buffer. 
/// bit_width: max number of bits for value. @@ -172,7 +502,7 @@ class RleEncoder { /// when values should be encoded as repeated runs. Currently this is derived /// based on the bit_width, which can determine a storage optimal choice. /// TODO: allow 0 bit_width (and have dict encoder use it) - RleEncoder(uint8_t* buffer, int buffer_len, int bit_width) + RleBitPackedEncoder(uint8_t* buffer, int buffer_len, int bit_width) : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { ARROW_DCHECK_GE(bit_width_, 0); ARROW_DCHECK_LE(bit_width_, 64); @@ -185,12 +515,12 @@ class RleEncoder { /// This is the maximum length of a single run for 'bit_width'. /// It is not valid to pass a buffer less than this length. static int MinBufferSize(int bit_width) { - /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. + // 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. int max_literal_run_size = 1 + static_cast(::arrow::bit_util::BytesForBits( MAX_VALUES_PER_LITERAL_RUN * bit_width)); - /// Up to kMaxVlqByteLength indicator and a single 'bit_width' value. + // Up to kMaxVlqByteLength indicator and a single 'bit_width' value. int max_repeated_run_size = - ::arrow::bit_util::BitReader::kMaxVlqByteLength + + bit_util::kMaxLEB128ByteLenFor + static_cast(::arrow::bit_util::BytesForBits(bit_width)); return std::max(max_literal_run_size, max_repeated_run_size); } @@ -293,385 +623,674 @@ class RleEncoder { uint8_t* literal_indicator_byte_; }; +/************************ + * RleBitPackedParser * + ************************/ + +template +void RleBitPackedParser::Parse(Handler&& handler) { + while (!exhausted()) { + auto [read, control] = PeekImpl(handler); + data_ += read; + data_size_ -= read; + if (ARROW_PREDICT_FALSE(control == ControlFlow::Break)) { + break; + } + } +} + +namespace internal { +/// The maximal unsigned size that a variable can fit. 
+template +constexpr auto max_size_for_v = + static_cast>(std::numeric_limits::max()); + +} // namespace internal + +template +auto RleBitPackedParser::PeekImpl(Handler&& handler) const + -> std::pair { + ARROW_DCHECK(!exhausted()); + + constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor; + uint32_t run_len_type = 0; + const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); + + if (ARROW_PREDICT_FALSE(header_bytes == 0)) { + // Malformed LEB128 data + return {0, ControlFlow::Break}; + } + + const bool is_bit_packed = run_len_type & 1; + const uint32_t count = run_len_type >> 1; + if (is_bit_packed) { + // Bit-packed run + constexpr auto kMaxCount = bit_util::CeilDiv(internal::max_size_for_v, 8); + if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) { + // Illegal number of encoded values + return {0, ControlFlow::Break}; + } + + ARROW_DCHECK_LT(static_cast(count) * 8, + internal::max_size_for_v); + // Count Already divided by 8 for byte size calculations + const auto bytes_read = header_bytes + static_cast(count) * value_bit_width_; + if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) { + // Bit-packed run would overflow data buffer + return {0, ControlFlow::Break}; + } + const auto values_count = static_cast(count * 8); + + auto control = handler.OnBitPackedRun( + BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)); + + return {static_cast(bytes_read), control}; + } + + // RLE run + if (ARROW_PREDICT_FALSE(count == 0)) { + // Illegal number of encoded values + return {0, ControlFlow::Break}; + } + + // Safe because created from right shift + const auto values_count = static_cast(count); + const auto value_bytes = bit_util::BytesForBits(value_bit_width_); + ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v); + const auto bytes_read = header_bytes + static_cast(value_bytes); + + if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) { + // RLE run would overflow data buffer + return {0, ControlFlow::Break}; + 
} + + auto control = + handler.OnRleRun(RleRun(data_ + header_bytes, values_count, value_bit_width_)); + + return {bytes_read, control}; +} + +/************************* + * RleBitPackedDecoder * + *************************/ + template -inline bool RleDecoder::Get(T* val) { +template +void RleBitPackedDecoder::ParseWithCallable(Callable&& func) { + struct { + Callable func; + auto OnBitPackedRun(BitPackedRun run) { return func(std::move(run)); } + auto OnRleRun(RleRun run) { return func(std::move(run)); } + } handler{std::move(func)}; + + parser_.Parse(std::move(handler)); +} + +template +bool RleBitPackedDecoder::Get(value_type* val) { return GetBatch(val, 1) == 1; } template -inline int RleDecoder::GetBatch(T* values, int batch_size) { - ARROW_DCHECK_GE(bit_width_, 0); - int values_read = 0; - - auto* out = values; - - while (values_read < batch_size) { - int remaining = batch_size - values_read; - - if (repeat_count_ > 0) { // Repeated value case. - int repeat_batch = std::min(remaining, repeat_count_); - std::fill(out, out + repeat_batch, static_cast(current_value_)); - - repeat_count_ -= repeat_batch; - values_read += repeat_batch; - out += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = std::min(remaining, literal_count_); - int actual_read = bit_reader_.GetBatch(bit_width_, out, literal_batch); - if (actual_read != literal_batch) { - return values_read; - } +auto RleBitPackedDecoder::GetBatch(value_type* out, rle_size_t batch_size) + -> rle_size_t { + using ControlFlow = RleBitPackedParser::ControlFlow; - literal_count_ -= literal_batch; - values_read += literal_batch; - out += literal_batch; - } else { - if (!NextCounts()) return values_read; + rle_size_t values_read = 0; + + // Remaining from a previous call that would have left some unread data from a run. 
+ if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { + const auto read = RunGetBatch(out, batch_size); + values_read += read; + out += read; + + // Either we fulfilled all the batch to be read or we finished remaining run. + if (ARROW_PREDICT_FALSE(values_read == batch_size)) { + return values_read; } + ARROW_DCHECK(run_remaining() == 0); } + ParseWithCallable([&](auto run) { + using RunDecoder = typename decltype(run)::template DecoderType; + + ARROW_DCHECK_LT(values_read, batch_size); + RunDecoder decoder(run, value_bit_width_); + const auto read = decoder.GetBatch(out, batch_size - values_read, value_bit_width_); + ARROW_DCHECK_LE(read, batch_size - values_read); + values_read += read; + out += read; + + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(values_read == batch_size || read == 0)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; + } + + return ControlFlow::Continue; + }); + return values_read; } -template -inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset, - T* out) { - if (ARROW_PREDICT_FALSE(null_count == batch_size)) { - converter.FillZero(out, out + batch_size); - return batch_size; - } - - ARROW_DCHECK_GE(bit_width_, 0); - int values_read = 0; - int values_remaining = batch_size - null_count; - - // Assume no bits to start. - arrow::internal::BitRunReader bit_reader(valid_bits, valid_bits_offset, - /*length=*/batch_size); - arrow::internal::BitRun valid_run = bit_reader.NextRun(); - while (values_read < batch_size) { - if (ARROW_PREDICT_FALSE(valid_run.length == 0)) { - valid_run = bit_reader.NextRun(); +namespace internal { + +/// Utility class to safely handle values and null count without too error-prone +/// verbosity. 
+class BatchCounter { + public: + using size_type = rle_size_t; + + static constexpr BatchCounter FromBatchSizeAndNulls(size_type batch_size, + size_type null_count) { + ARROW_DCHECK_LE(null_count, batch_size); + return {batch_size - null_count, null_count}; + } + + constexpr BatchCounter(size_type values_count, size_type null_count) noexcept + : values_count_(values_count), null_count_(null_count) {} + + constexpr size_type values_count() const noexcept { return values_count_; } + + constexpr size_type values_read() const noexcept { return values_read_; } + + constexpr size_type values_remaining() const noexcept { + ARROW_DCHECK_LE(values_read_, values_count_); + return values_count_ - values_read_; + } + + constexpr void AccrueReadValues(size_type to_read) noexcept { + ARROW_DCHECK_LE(to_read, values_remaining()); + values_read_ += to_read; + } + + constexpr size_type null_count() const noexcept { return null_count_; } + + constexpr size_type null_read() const noexcept { return null_read_; } + + constexpr size_type null_remaining() const noexcept { + ARROW_DCHECK_LE(null_read_, null_count_); + return null_count_ - null_read_; + } + + constexpr void AccrueReadNulls(size_type to_read) noexcept { + ARROW_DCHECK_LE(to_read, null_remaining()); + null_read_ += to_read; + } + + constexpr size_type total_remaining() const noexcept { + return values_remaining() + null_remaining(); + } + + constexpr size_type total_read() const noexcept { return values_read_ + null_read_; } + + constexpr bool is_fully_null() const noexcept { return values_remaining() == 0; } + + constexpr bool is_done() const noexcept { return total_remaining() == 0; } + + private: + size_type values_count_ = 0; + size_type values_read_ = 0; + size_type null_count_ = 0; + size_type null_read_ = 0; +}; + +template +struct GetSpacedResult { + Int values_read; + Int null_read; +}; + +/// Overload for GetSpaced for a single run in a RleDecoder +template +auto RunGetSpaced(Converter* converter, typename 
Converter::out_type* out, + rle_size_t batch_size, rle_size_t null_count, + rle_size_t value_bit_width, BitRunReader* validity_reader, + BitRun* validity_run, RleRunDecoder* decoder) + -> GetSpacedResult { + ARROW_DCHECK_GT(batch_size, 0); + // The equality case is handled in the main loop in GetSpaced + ARROW_DCHECK_LT(null_count, batch_size); + + auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); + + const rle_size_t values_available = decoder->remaining(); + ARROW_DCHECK_GT(values_available, 0); + auto values_remaining_run = [&]() { + auto out = values_available - batch.values_read(); + ARROW_DCHECK_GE(out, 0); + return out; + }; + + // Consume as much as possible from the repeated run. + // We only need to count the number of nulls and non-nulls because we can fill in the + // same value for nulls and non-nulls. + // This proves to be a big efficiency win. + while (values_remaining_run() > 0 && !batch.is_done()) { + ARROW_DCHECK_GE(validity_run->length, 0); + ARROW_DCHECK_LT(validity_run->length, max_size_for_v); + ARROW_DCHECK_LE(validity_run->length, batch.total_remaining()); + const auto& validity_run_size = static_cast(validity_run->length); + + if (validity_run->set) { + // We may end the current RLE run in the middle of the validity run + auto update_size = std::min(validity_run_size, values_remaining_run()); + batch.AccrueReadValues(update_size); + validity_run->length -= update_size; + } else { + // We can consume all nulls here because it does not matter if we consume on this + // RLE run, or an a next encoded run. The value filled does not matter. 
+ auto update_size = std::min(validity_run_size, batch.null_remaining()); + batch.AccrueReadNulls(update_size); + validity_run->length -= update_size; } - ARROW_DCHECK_GT(batch_size, 0); - ARROW_DCHECK_GT(valid_run.length, 0); + if (ARROW_PREDICT_TRUE(validity_run->length == 0)) { + *validity_run = validity_reader->NextRun(); + } + } + + const value_type value = decoder->value(); + if (ARROW_PREDICT_FALSE(!converter->InputIsValid(value))) { + return {0, 0}; + } + converter->WriteRepeated(out, out + batch.total_read(), value); + const auto actual_values_read = decoder->Advance(batch.values_read(), value_bit_width); + // We always cropped the number of values_read by the remaining values in the run. + // What's more the RLE decoder should not encounter any errors. + ARROW_DCHECK_EQ(actual_values_read, batch.values_read()); + + return {/* .values_read= */ batch.values_read(), /* .null_read= */ batch.null_read()}; +} + +template +auto RunGetSpaced(Converter* converter, typename Converter::out_type* out, + rle_size_t batch_size, rle_size_t null_count, + rle_size_t value_bit_width, BitRunReader* validity_reader, + BitRun* validity_run, BitPackedRunDecoder* decoder) + -> GetSpacedResult { + ARROW_DCHECK_GT(batch_size, 0); + // The equality case is handled in the main loop in GetSpaced + ARROW_DCHECK_LT(null_count, batch_size); + + auto batch = BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); + + const rle_size_t values_available = decoder->remaining(); + ARROW_DCHECK_GT(values_available, 0); + auto run_values_remaining = [&]() { + auto out = values_available - batch.values_read(); + ARROW_DCHECK_GE(out, 0); + return out; + }; + + while (run_values_remaining() > 0 && batch.values_remaining() > 0) { + // Pull a batch of values from the bit packed encoded data and store it in a local + // buffer to benefit from unpacking intrinsics and data locality. 
+ // Quick benchmarking on a linux x86-64 cloud instance show that this previously + // hard-coded value is appropriate. + static constexpr rle_size_t kBufferCapacity = 1024; + std::array buffer = {}; + + rle_size_t buffer_start = 0; + rle_size_t buffer_end = 0; + auto buffer_size = [&]() { + auto out = buffer_end - buffer_start; + ARROW_DCHECK_GE(out, 0); + return out; + }; + + // buffer_start is 0 at this point so size is end + buffer_end = std::min(std::min(run_values_remaining(), batch.values_remaining()), + kBufferCapacity); + buffer_end = decoder->GetBatch(buffer.data(), buffer_size(), value_bit_width); + ARROW_DCHECK_LE(buffer_size(), kBufferCapacity); + + if (ARROW_PREDICT_FALSE(!converter->InputIsValid(buffer.data(), buffer_size()))) { + return {batch.values_read(), batch.null_read()}; + } - if (valid_run.set) { - if ((repeat_count_ == 0) && (literal_count_ == 0)) { - if (!NextCounts()) return values_read; - ARROW_DCHECK((repeat_count_ > 0) ^ (literal_count_ > 0)); + // Copy chunks of valid values into the output, while adjusting spacing for null + // values. 
+ while (buffer_size() > 0) { + ARROW_DCHECK_GE(validity_run->length, 0); + ARROW_DCHECK_LT(validity_run->length, max_size_for_v); + ARROW_DCHECK_LE(validity_run->length, batch.total_remaining()); + const auto validity_run_length = static_cast(validity_run->length); + + // Copy as much as possible from the buffer into the output while not exceeding + // validity run + if (validity_run->set) { + const auto update_size = std::min(validity_run_length, buffer_size()); + converter->WriteRange(out, buffer.data() + buffer_start, update_size); + buffer_start += update_size; + batch.AccrueReadValues(update_size); + out += update_size; + validity_run->length -= update_size; + // Simply write zeros in the output + } else { + const auto update_size = std::min(validity_run_length, batch.null_remaining()); + converter->WriteZero(out, out + update_size); + batch.AccrueReadNulls(update_size); + out += update_size; + validity_run->length -= update_size; } - if (repeat_count_ > 0) { - int repeat_batch = 0; - // Consume the entire repeat counts incrementing repeat_batch to - // be the total of nulls + values consumed, we only need to - // get the total count because we can fill in the same value for - // nulls and non-nulls. This proves to be a big efficiency win. - while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) { - ARROW_DCHECK_GT(valid_run.length, 0); - if (valid_run.set) { - int update_size = std::min(static_cast(valid_run.length), repeat_count_); - repeat_count_ -= update_size; - repeat_batch += update_size; - valid_run.length -= update_size; - values_remaining -= update_size; - } else { - // We can consume all nulls here because we would do so on - // the next loop anyways. 
- repeat_batch += static_cast(valid_run.length); - valid_run.length = 0; - } - if (valid_run.length == 0) { - valid_run = bit_reader.NextRun(); - } - } - RunType current_value = static_cast(current_value_); - if (ARROW_PREDICT_FALSE(!converter.IsValid(current_value))) { - return values_read; - } - converter.Fill(out, out + repeat_batch, current_value); - out += repeat_batch; - values_read += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = std::min(values_remaining, literal_count_); - ARROW_DCHECK_GT(literal_batch, 0); - - // Decode the literals - constexpr int kBufferSize = 1024; - RunType indices[kBufferSize]; - literal_batch = std::min(literal_batch, kBufferSize); - int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch); - if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) { - return values_read; - } - if (!converter.IsValid(indices, /*length=*/actual_read)) { - return values_read; - } - int skipped = 0; - int literals_read = 0; - while (literals_read < literal_batch) { - if (valid_run.set) { - int update_size = std::min(literal_batch - literals_read, - static_cast(valid_run.length)); - converter.Copy(out, indices + literals_read, update_size); - literals_read += update_size; - out += update_size; - valid_run.length -= update_size; - } else { - converter.FillZero(out, out + valid_run.length); - out += valid_run.length; - skipped += static_cast(valid_run.length); - valid_run.length = 0; - } - if (valid_run.length == 0) { - valid_run = bit_reader.NextRun(); - } - } - literal_count_ -= literal_batch; - values_remaining -= literal_batch; - values_read += literal_batch + skipped; + if (validity_run->length == 0) { + *validity_run = validity_reader->NextRun(); } - } else { - converter.FillZero(out, out + valid_run.length); - out += valid_run.length; - values_read += static_cast(valid_run.length); - valid_run.length = 0; } + + ARROW_DCHECK_EQ(buffer_size(), 0); } - ARROW_DCHECK_EQ(valid_run.length, 0); - 
ARROW_DCHECK_EQ(values_remaining, 0); - return values_read; + + ARROW_DCHECK_EQ(values_available - decoder->remaining(), batch.values_read()); + ARROW_DCHECK_LE(batch.total_read(), batch_size); + ARROW_DCHECK_LE(batch.null_read(), batch.null_count()); + + return {/* .values_read= */ batch.values_read(), /* .null_read= */ batch.null_read()}; } +/// Overload for GetSpaced for a single run in a decoder variant +template +auto RunGetSpaced( + Converter* converter, typename Converter::out_type* out, rle_size_t batch_size, + rle_size_t null_count, rle_size_t value_bit_width, BitRunReader* validity_reader, + BitRun* validity_run, + std::variant, BitPackedRunDecoder>* decoder) + -> GetSpacedResult { + return std::visit( + [&](auto& dec) { + ARROW_DCHECK_GT(dec.remaining(), 0); + return RunGetSpaced(converter, out, batch_size, null_count, value_bit_width, + validity_reader, validity_run, &dec); + }, + *decoder); +} + +} // namespace internal + +template +template +auto RleBitPackedDecoder::GetSpaced(Converter converter, + typename Converter::out_type* out, + rle_size_t batch_size, + const uint8_t* validity_bits, + int64_t validity_bits_offset, + rle_size_t null_count) -> rle_size_t { + using ControlFlow = RleBitPackedParser::ControlFlow; + + ARROW_DCHECK_GT(batch_size, 0); + + auto batch = internal::BatchCounter::FromBatchSizeAndNulls(batch_size, null_count); + + if (ARROW_PREDICT_FALSE(batch.is_fully_null())) { + converter.WriteZero(out, out + batch.null_remaining()); + return batch.null_remaining(); + } + + arrow::internal::BitRunReader validity_reader(validity_bits, validity_bits_offset, + /*length=*/batch.total_remaining()); + arrow::internal::BitRun validity_run = validity_reader.NextRun(); + + const auto check_and_handle_fully_null_remaining = [&]() { + if (batch.is_fully_null()) { + ARROW_DCHECK(validity_run.length == 0 || !validity_run.set); + ARROW_DCHECK_GE(validity_run.length, batch.null_remaining()); + + converter.WriteZero(out, out + batch.null_remaining()); + 
out += batch.null_remaining(); + batch.AccrueReadNulls(batch.null_remaining()); + } + }; + + // Remaining from a previous call that would have left some unread data from a run. + if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { + const auto read = internal::RunGetSpaced(&converter, out, batch.total_remaining(), + batch.null_remaining(), value_bit_width_, + &validity_reader, &validity_run, &decoder_); + + batch.AccrueReadNulls(read.null_read); + batch.AccrueReadValues(read.values_read); + out += read.values_read + read.null_read; + + // Either we fulfilled all the batch values to be read + if (ARROW_PREDICT_FALSE(batch.values_remaining() == 0)) { + // There may be remaining null if they are not greedily filled + check_and_handle_fully_null_remaining(); + return batch.total_read(); + } + + // We finished the remaining run + ARROW_DCHECK(run_remaining() == 0); + } + + ParseWithCallable([&](auto run) { + using RunDecoder = typename decltype(run)::template DecoderType; + + RunDecoder decoder(run, value_bit_width_); + + const auto read = internal::RunGetSpaced(&converter, out, batch.total_remaining(), + batch.null_remaining(), value_bit_width_, + &validity_reader, &validity_run, &decoder); + + batch.AccrueReadNulls(read.null_read); + batch.AccrueReadValues(read.values_read); + out += read.values_read + read.null_read; + + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(read.values_read == 0 || batch.values_remaining() == 0)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; + } + + return ControlFlow::Continue; + }); + + // There may be remaining null if they are not greedily filled by either decoder calls + check_and_handle_fully_null_remaining(); + + return batch.total_read(); +} + +namespace internal { + // Converter for GetSpaced that handles runs that get returned // directly as output. 
template -struct PlainRleConverter { - T kZero = {}; - inline bool IsValid(const T& values) const { return true; } - inline bool IsValid(const T* values, int32_t length) const { return true; } - inline void Fill(T* begin, T* end, const T& run_value) const { +struct NoOpConverter { + using in_type = T; + using out_type = T; + using size_type = rle_size_t; + + static constexpr bool InputIsValid(const in_type& values) { return true; } + + static constexpr bool InputIsValid(const in_type* values, size_type length) { + return true; + } + + static void WriteRepeated(out_type* begin, out_type* end, in_type run_value) { std::fill(begin, end, run_value); } - inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); } - inline void Copy(T* out, const T* values, int length) const { - std::memcpy(out, values, length * sizeof(T)); + + static void WriteZero(out_type* begin, out_type* end) { + std::fill(begin, end, out_type{}); + } + + static void WriteRange(out_type* out, const in_type* values, size_type length) { + std::memcpy(out, values, length * sizeof(out_type)); } }; +} // namespace internal + template -inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset, T* out) { +auto RleBitPackedDecoder::GetBatchSpaced(rle_size_t batch_size, rle_size_t null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset, value_type* out) + -> rle_size_t { if (null_count == 0) { - return GetBatch(out, batch_size); + return GetBatch(out, batch_size); } - PlainRleConverter converter; - arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset, - batch_size); + internal::NoOpConverter converter; - int total_processed = 0; - int processed = 0; - arrow::internal::BitBlockCount block; - - do { - block = block_counter.NextFourWords(); - if (block.length == 0) { - break; - } - if (block.AllSet()) { - processed = GetBatch(out, block.length); - } else if (block.NoneSet()) { - 
converter.FillZero(out, out + block.length); - processed = block.length; - } else { - processed = GetSpaced>( - converter, block.length, block.length - block.popcount, valid_bits, - valid_bits_offset, out); - } - total_processed += processed; - out += block.length; - valid_bits_offset += block.length; - } while (processed == block.length); - return total_processed; + return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count); } -static inline bool IndexInRange(int32_t idx, int32_t dictionary_length) { - return idx >= 0 && idx < dictionary_length; +namespace internal { + +template +bool IndexInRange(I idx, int32_t dictionary_length) { + ARROW_DCHECK_GT(dictionary_length, 0); + using T = std::common_type_t; + return idx >= 0 && static_cast(idx) < static_cast(dictionary_length); } // Converter for GetSpaced that handles runs of returned dictionary // indices. -template +template struct DictionaryConverter { - T kZero = {}; - const T* dictionary; - int32_t dictionary_length; - - inline bool IsValid(int32_t value) { return IndexInRange(value, dictionary_length); } - - inline bool IsValid(const int32_t* values, int32_t length) const { - using IndexType = int32_t; - IndexType min_index = std::numeric_limits::max(); - IndexType max_index = std::numeric_limits::min(); - for (int x = 0; x < length; x++) { - min_index = std::min(values[x], min_index); - max_index = std::max(values[x], max_index); + using out_type = V; + using in_type = I; + using size_type = rle_size_t; + + static constexpr bool kIsIdentity = false; + + const out_type* dictionary; + size_type dictionary_length; + + bool InputIsValid(in_type idx) const { return IndexInRange(idx, dictionary_length); } + + bool InputIsValid(const in_type* indices, size_type length) const { + ARROW_DCHECK(length > 0); + + in_type min_index = std::numeric_limits::max(); + in_type max_index = std::numeric_limits::min(); + for (size_type x = 0; x < length; x++) { + min_index = std::min(indices[x], 
min_index); + max_index = std::max(indices[x], max_index); } return IndexInRange(min_index, dictionary_length) && IndexInRange(max_index, dictionary_length); } - inline void Fill(T* begin, T* end, const int32_t& run_value) const { + + void WriteRepeated(out_type* begin, out_type* end, in_type run_value) const { std::fill(begin, end, dictionary[run_value]); } - inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); } - inline void Copy(T* out, const int32_t* values, int length) const { - for (int x = 0; x < length; x++) { + static void WriteZero(out_type* begin, out_type* end) { + std::fill(begin, end, out_type{}); + } + + void WriteRange(out_type* out, const in_type* values, size_type length) const { + for (size_type x = 0; x < length; x++) { out[x] = dictionary[values[x]]; } } }; +/// Dummy imitation of BitRun that is all set. +struct AllSetBitRun { + static constexpr bool set = true; + int64_t length = 0; +}; + +/// Dummy imitation of BitRunReader that should never be called. +struct UnreachableBitRunReader { + constexpr static AllSetBitRun NextRun() { return {}; } +}; + +} // namespace internal + template -inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_length, - T* values, int batch_size) { - // Per https://github.com/apache/parquet-format/blob/master/Encodings.md, - // the maximum dictionary index width in Parquet is 32 bits. 
- using IndexType = int32_t; - DictionaryConverter converter; - converter.dictionary = dictionary; - converter.dictionary_length = dictionary_length; - - ARROW_DCHECK_GE(bit_width_, 0); - int values_read = 0; - - auto* out = values; - - while (values_read < batch_size) { - int remaining = batch_size - values_read; - - if (repeat_count_ > 0) { - auto idx = static_cast(current_value_); - if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) { - return values_read; - } - T val = dictionary[idx]; +template +auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, + int32_t dictionary_length, V* out, + rle_size_t batch_size) -> rle_size_t { + using ControlFlow = RleBitPackedParser::ControlFlow; + + if (ARROW_PREDICT_FALSE(batch_size <= 0)) { + return 0; + } - int repeat_batch = std::min(remaining, repeat_count_); - std::fill(out, out + repeat_batch, val); + internal::DictionaryConverter converter{dictionary, dictionary_length}; - /* Upkeep counters */ - repeat_count_ -= repeat_batch; - values_read += repeat_batch; - out += repeat_batch; - } else if (literal_count_ > 0) { - constexpr int kBufferSize = 1024; - IndexType indices[kBufferSize]; + // Make lightweight BitRun class to reuse previous methods. 
+ constexpr internal::UnreachableBitRunReader validity_reader{}; + internal::AllSetBitRun validity_run = {batch_size}; - int literal_batch = std::min(remaining, literal_count_); - literal_batch = std::min(literal_batch, kBufferSize); + rle_size_t values_read = 0; + auto batch_values_remaining = [&]() { + ARROW_DCHECK_LE(values_read, batch_size); + return batch_size - values_read; + }; - int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch); - if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) { - return values_read; - } - if (ARROW_PREDICT_FALSE(!converter.IsValid(indices, /*length=*/literal_batch))) { - return values_read; - } - converter.Copy(out, indices, literal_batch); + if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { + const auto read = internal::RunGetSpaced(&converter, out, batch_size, + /* null_count= */ 0, value_bit_width_, + &validity_reader, &validity_run, &decoder_); - /* Upkeep counters */ - literal_count_ -= literal_batch; - values_read += literal_batch; - out += literal_batch; - } else { - if (!NextCounts()) return values_read; + ARROW_DCHECK_EQ(read.null_read, 0); + values_read += read.values_read; + out += read.values_read; + + // Either we fulfilled all the batch values to be read + if (ARROW_PREDICT_FALSE(values_read >= batch_size)) { + // There may be remaining null if they are not greedily filled + return values_read; } + + // We finished the remaining run + ARROW_DCHECK(run_remaining() == 0); } - return values_read; -} + ParseWithCallable([&](auto run) { + using RunDecoder = typename decltype(run)::template DecoderType; -template -inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, - int32_t dictionary_length, T* out, - int batch_size, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset) { - if (null_count == 0) { - return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); - } - arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset, - 
batch_size); - using IndexType = int32_t; - DictionaryConverter converter; - converter.dictionary = dictionary; - converter.dictionary_length = dictionary_length; - - int total_processed = 0; - int processed = 0; - arrow::internal::BitBlockCount block; - do { - block = block_counter.NextFourWords(); - if (block.length == 0) { - break; - } - if (block.AllSet()) { - processed = GetBatchWithDict(dictionary, dictionary_length, out, block.length); - } else if (block.NoneSet()) { - converter.FillZero(out, out + block.length); - processed = block.length; - } else { - processed = GetSpaced>( - converter, block.length, block.length - block.popcount, valid_bits, - valid_bits_offset, out); + RunDecoder decoder(run, value_bit_width_); + + const auto read = internal::RunGetSpaced(&converter, out, batch_values_remaining(), + /* null_count= */ 0, value_bit_width_, + &validity_reader, &validity_run, &decoder); + + ARROW_DCHECK_EQ(read.null_read, 0); + values_read += read.values_read; + out += read.values_read; + + // Stop reading and store remaining decoder + if (ARROW_PREDICT_FALSE(read.values_read == 0 || values_read == batch_size)) { + decoder_ = std::move(decoder); + return ControlFlow::Break; } - total_processed += processed; - out += block.length; - valid_bits_offset += block.length; - } while (processed == block.length); - return total_processed; + + return ControlFlow::Continue; + }); + + return values_read; } template -bool RleDecoder::NextCounts() { - // Read the next run's indicator int, it could be a literal or repeated run. - // The int is encoded as a vlq-encoded value. 
- uint32_t indicator_value = 0; - if (!bit_reader_.GetVlqInt(&indicator_value)) return false; - - // lsb indicates if it is a literal run or repeated run - bool is_literal = indicator_value & 1; - uint32_t count = indicator_value >> 1; - if (is_literal) { - if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast(INT32_MAX) / 8)) { - return false; - } - literal_count_ = count * 8; - } else { - if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast(INT32_MAX))) { - return false; - } - repeat_count_ = count; - T value = {}; - if (!bit_reader_.GetAligned( - static_cast(::arrow::bit_util::CeilDiv(bit_width_, 8)), &value)) { - return false; - } - current_value_ = static_cast(value); +template +auto RleBitPackedDecoder::GetBatchWithDictSpaced( + const V* dictionary, int32_t dictionary_length, V* out, rle_size_t batch_size, + rle_size_t null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) + -> rle_size_t { + if (null_count == 0) { + return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); } - return true; + internal::DictionaryConverter converter{dictionary, dictionary_length}; + + return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count); } +/************************* + * RleBitPackedEncoder * + *************************/ + /// This function buffers input values 8 at a time. After seeing all 8 values, /// it decides whether they should be encoded as a literal or repeated run. 
-inline bool RleEncoder::Put(uint64_t value) { +inline bool RleBitPackedEncoder::Put(uint64_t value) { ARROW_DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_)); if (ARROW_PREDICT_FALSE(buffer_full_)) return false; @@ -702,7 +1321,7 @@ inline bool RleEncoder::Put(uint64_t value) { return true; } -inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { +inline void RleBitPackedEncoder::FlushLiteralRun(bool update_indicator_byte) { if (literal_indicator_byte_ == NULL) { // The literal indicator byte has not been reserved yet, get one now. literal_indicator_byte_ = bit_writer_.GetNextBytePtr(); @@ -732,7 +1351,7 @@ inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { } } -inline void RleEncoder::FlushRepeatedRun() { +inline void RleBitPackedEncoder::FlushRepeatedRun() { ARROW_DCHECK_GT(repeat_count_, 0); bool result = true; // The lsb of 0 indicates this is a repeated run @@ -748,7 +1367,7 @@ inline void RleEncoder::FlushRepeatedRun() { /// Flush the values that have been buffered. At this point we decide whether /// we need to switch between the run types or continue the current one. -inline void RleEncoder::FlushBufferedValues(bool done) { +inline void RleBitPackedEncoder::FlushBufferedValues(bool done) { if (repeat_count_ >= 8) { // Clear the buffered values. They are part of the repeated run now and we // don't want to flush them out as literals. 
@@ -778,7 +1397,7 @@ inline void RleEncoder::FlushBufferedValues(bool done) { repeat_count_ = 0; } -inline int RleEncoder::Flush() { +inline int RleBitPackedEncoder::Flush() { if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0); @@ -805,14 +1424,14 @@ inline int RleEncoder::Flush() { return bit_writer_.bytes_written(); } -inline void RleEncoder::CheckBufferFull() { +inline void RleBitPackedEncoder::CheckBufferFull() { int bytes_written = bit_writer_.bytes_written(); if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) { buffer_full_ = true; } } -inline void RleEncoder::Clear() { +inline void RleBitPackedEncoder::Clear() { buffer_full_ = false; current_value_ = 0; repeat_count_ = 0; @@ -822,5 +1441,4 @@ inline void RleEncoder::Clear() { bit_writer_.Clear(); } -} // namespace util -} // namespace arrow +} // namespace arrow::util diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index 0cc0a276a25..c7f4878b741 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -25,7 +25,10 @@ #include #include "arrow/array.h" -#include "arrow/buffer.h" +#include "arrow/array/concatenate.h" +#include "arrow/array/util.h" +#include "arrow/scalar.h" +#include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/util/bit_stream_utils_internal.h" @@ -33,8 +36,7 @@ #include "arrow/util/io_util.h" #include "arrow/util/rle_encoding_internal.h" -namespace arrow { -namespace util { +namespace arrow::util { const int MAX_WIDTH = 32; @@ -207,12 +209,303 @@ TEST(BitUtil, RoundTripIntValues) { } } +/// A Rle run is a simple class owning some data and a repetition count. +/// It does not know how to read such data. 
+TEST(Rle, RleRun) { + const std::array value = {21, 2, 0, 0}; + + const rle_size_t value_count = 12; + + // 12 times the value 21 fitting over 5 bits + const rle_size_t value_bit_width_5 = 5; + const auto run_5 = RleRun(value.data(), value_count, value_bit_width_5); + EXPECT_EQ(run_5.values_count(), value_count); + EXPECT_EQ(run_5.raw_data_size(value_bit_width_5), 1); // 5 bits fit in one byte + EXPECT_EQ(*run_5.raw_data_ptr(), 21); + + // 12 times the value 21 fitting over 8 bits + const rle_size_t value_bit_width_8 = 8; + const auto run_8 = RleRun(value.data(), value_count, value_bit_width_8); + EXPECT_EQ(run_8.values_count(), value_count); + EXPECT_EQ(run_8.raw_data_size(value_bit_width_8), 1); // 8 bits fit in 1 byte + EXPECT_EQ(*run_8.raw_data_ptr(), 21); + + // 12 times the value 533 (21 + 2 * 2^8) fitting over 10 bits + const rle_size_t value_bit_width_10 = 10; + const auto run_10 = RleRun(value.data(), value_count, value_bit_width_10); + EXPECT_EQ(run_10.values_count(), value_count); + EXPECT_EQ(run_10.raw_data_size(value_bit_width_10), 2); // 10 bits fit in 2 bytes + EXPECT_EQ(*(run_10.raw_data_ptr() + 0), 21); + EXPECT_EQ(*(run_10.raw_data_ptr() + 1), 2); + + // 12 times the value 533 (21 + 2 * 2^8) fitting over 32 bits + const rle_size_t value_bit_width_32 = 32; + const auto run_32 = RleRun(value.data(), value_count, value_bit_width_32); + EXPECT_EQ(run_32.values_count(), value_count); + EXPECT_EQ(run_32.raw_data_size(value_bit_width_32), 4); // 32 bits fit in 4 bytes + EXPECT_EQ(*(run_32.raw_data_ptr() + 0), 21); + EXPECT_EQ(*(run_32.raw_data_ptr() + 1), 2); + EXPECT_EQ(*(run_32.raw_data_ptr() + 2), 0); + EXPECT_EQ(*(run_32.raw_data_ptr() + 3), 0); +} + +/// A BitPacked run is a simple class owning some data and its size. +/// It does not know how to read such data. 
+TEST(BitPacked, BitPackedRun) { + const std::array value = {0b10101010, 0, 0, 0b1111111}; + + // 16 values of 1 bit for a total of 16 bits + const rle_size_t value_count_1 = 16; + const rle_size_t value_bit_width_1 = 1; + const auto run_1 = BitPackedRun(value.data(), value_count_1, value_bit_width_1); + EXPECT_EQ(run_1.values_count(), value_count_1); + EXPECT_EQ(run_1.raw_data_size(value_bit_width_1), 2); // 16 bits fit in 2 bytes + EXPECT_EQ(run_1.raw_data_ptr(), value.data()); + + // 8 values of 3 bits for a total of 24 bits + const rle_size_t value_count_3 = 8; + const rle_size_t value_bit_width_3 = 3; + const auto run_3 = BitPackedRun(value.data(), value_count_3, value_bit_width_3); + EXPECT_EQ(run_3.values_count(), value_count_3); + EXPECT_EQ(run_3.raw_data_size(value_bit_width_3), 3); // 24 bits fit in 3 bytes + EXPECT_EQ(run_3.raw_data_ptr(), value.data()); +} + +template +void TestRleDecoder(std::vector bytes, rle_size_t value_count, + rle_size_t bit_width, T expected_value) { + // Pre-requisite for this test + EXPECT_GT(value_count, 6); + + const auto run = RleRun(bytes.data(), value_count, bit_width); + + auto decoder = RleRunDecoder(run, bit_width); + std::vector vals = {0, 0}; + + EXPECT_EQ(decoder.remaining(), value_count); + + rle_size_t read = 0; + EXPECT_EQ(decoder.Get(vals.data(), bit_width), 1); + read += 1; + EXPECT_EQ(vals.at(0), expected_value); + EXPECT_EQ(decoder.remaining(), value_count - read); + + EXPECT_EQ(decoder.Advance(3, bit_width), 3); + read += 3; + EXPECT_EQ(decoder.remaining(), value_count - read); + + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2, bit_width), vals.size()); + EXPECT_EQ(vals.at(0), expected_value); + EXPECT_EQ(vals.at(1), expected_value); + read += static_cast(vals.size()); + EXPECT_EQ(decoder.remaining(), value_count - read); + + // Exhaust iteration + EXPECT_EQ(decoder.Advance(value_count - read, bit_width), value_count - read); + EXPECT_EQ(decoder.remaining(), 0); + EXPECT_EQ(decoder.Advance(1, 
bit_width), 0); + vals = {0, 0}; + EXPECT_EQ(decoder.Get(vals.data(), bit_width), 0); + EXPECT_EQ(vals.at(0), 0); + + // Reset the decoder + decoder.Reset(run, bit_width); + EXPECT_EQ(decoder.remaining(), value_count); + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2, bit_width), vals.size()); + EXPECT_EQ(vals.at(0), expected_value); + EXPECT_EQ(vals.at(1), expected_value); +} + +TEST(Rle, RleDecoder) { + TestRleDecoder({21, 0, 0}, /* value_count= */ 23, /* bit_width= */ 5, + /* expected_value= */ 21); + TestRleDecoder({1, 0}, /* value_count= */ 13, /* bit_width= */ 1, + /* expected_value= */ 1); + TestRleDecoder({21, 0, 0}, /* value_count= */ 23, /* bit_width= */ 5, + /* expected_value= */ 21); + TestRleDecoder({21, 0, 0}, /* value_count= */ 23, /* bit_width= */ 5, + /* expected_value= */ 21); + TestRleDecoder({21, 2, 0, 1}, /* value_count= */ 20, /* bit_width= */ 30, + /* expected_value= */ 16777749); +} + +template +void TestBitPackedDecoder(std::vector bytes, rle_size_t value_count, + rle_size_t bit_width, std::vector expected) { + // Pre-requisite for this test + EXPECT_GT(value_count, 6); + + const auto run = BitPackedRun(bytes.data(), value_count, bit_width); + + auto decoder = BitPackedRunDecoder(run, bit_width); + std::vector vals = {0, 0}; + + EXPECT_EQ(decoder.remaining(), value_count); + + rle_size_t read = 0; + EXPECT_EQ(decoder.Get(vals.data(), bit_width), 1); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + read += 1; + EXPECT_EQ(decoder.remaining(), value_count - read); + + EXPECT_EQ(decoder.Advance(3, bit_width), 3); + read += 3; + EXPECT_EQ(decoder.remaining(), value_count - read); + + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2, bit_width), vals.size()); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + EXPECT_EQ(vals.at(1), expected.at(1 + read)); + read += static_cast(vals.size()); + EXPECT_EQ(decoder.remaining(), value_count - read); + + // Exhaust iteration + EXPECT_EQ(decoder.Advance(value_count - read, 
bit_width), value_count - read); + EXPECT_EQ(decoder.remaining(), 0); + EXPECT_EQ(decoder.Advance(1, bit_width), 0); + vals = {0, 0}; + EXPECT_EQ(decoder.Get(vals.data(), bit_width), 0); + EXPECT_EQ(vals.at(0), 0); + + // Reset the decoder + decoder.Reset(run, bit_width); + read = 0; + EXPECT_EQ(decoder.remaining(), value_count); + vals = {0, 0}; + EXPECT_EQ(decoder.GetBatch(vals.data(), 2, bit_width), vals.size()); + EXPECT_EQ(vals.at(0), expected.at(0 + read)); + EXPECT_EQ(vals.at(1), expected.at(1 + read)); +} + +TEST(BitPacked, BitPackedDecoder) { + // See parquet encoding for bytes layout + TestBitPackedDecoder( + /* bytes= */ {0x88, 0xc6, 0xfa}, + /* values_count= */ 8, + /* bit_width= */ 3, + /* expected= */ {0, 1, 2, 3, 4, 5, 6, 7}); + TestBitPackedDecoder( + /* bytes= */ {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7}, + /* values_count= */ 8, + /* bit_width= */ 8, + /* expected= */ {0, 1, 2, 3, 4, 5, 6, 7}); + TestBitPackedDecoder( + /* bytes= */ {0x47, 0xc, 0x10, 0x35}, + /* values_count= */ 8, + /* bit_width= */ 4, + /* expected= */ {7, 4, 12, 0, 0, 1, 5, 3}); + TestBitPackedDecoder( + /* bytes= */ {0xe8, 0x7, 0x20, 0xc0, 0x0, 0x4, 0x14, 0x60, 0xc0, 0x1}, + /* values_count= */ 8, + /* bit_width= */ 10, + /* expected= */ {1000, 1, 2, 3, 4, 5, 6, 7}); +} + +template +void TestRleBitPackedParser(std::vector bytes, rle_size_t bit_width, + std::vector expected) { + auto parser = + RleBitPackedParser(bytes.data(), static_cast(bytes.size()), bit_width); + EXPECT_FALSE(parser.exhausted()); + + // Try to decode all data of all runs in the decoded vector + decltype(expected) decoded = {}; + auto rle_decoder = RleRunDecoder(); + auto bit_packed_decoder = BitPackedRunDecoder(); + + struct { + decltype(rle_decoder)* rle_decoder_ptr_; + decltype(bit_packed_decoder)* bit_packed_decoder_ptr_; + decltype(decoded)* decoded_ptr_; + decltype(bit_width) bit_width_; + + auto OnRleRun(RleRun run) { + rle_decoder_ptr_->Reset(run, bit_width_); + + const auto n_decoded = 
decoded_ptr_->size(); + const auto n_to_decode = rle_decoder_ptr_->remaining(); + decoded_ptr_->resize(n_decoded + n_to_decode); + EXPECT_EQ(rle_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, n_to_decode, + bit_width_), + n_to_decode); + EXPECT_EQ(rle_decoder_ptr_->remaining(), 0); + + return RleBitPackedParser::ControlFlow::Continue; + } + + auto OnBitPackedRun(BitPackedRun run) { + bit_packed_decoder_ptr_->Reset(run, bit_width_); + + const auto n_decoded = decoded_ptr_->size(); + const auto n_to_decode = bit_packed_decoder_ptr_->remaining(); + decoded_ptr_->resize(n_decoded + n_to_decode); + EXPECT_EQ(bit_packed_decoder_ptr_->GetBatch(decoded_ptr_->data() + n_decoded, + n_to_decode, bit_width_), + n_to_decode); + EXPECT_EQ(bit_packed_decoder_ptr_->remaining(), 0); + + return RleBitPackedParser::ControlFlow::Continue; + } + } handler{&rle_decoder, &bit_packed_decoder, &decoded, bit_width}; + + // Iterate over all runs + parser.Parse(handler); + + EXPECT_TRUE(parser.exhausted()); + EXPECT_EQ(decoded.size(), expected.size()); + EXPECT_EQ(decoded, expected); +} + +TEST(RleBitPacked, RleBitPackedParser) { + TestRleBitPackedParser( + /* bytes= */ + {/* LEB128 for 8 values bit packed marker */ 0x3, + /* Bitpacked run */ 0x88, 0xc6, 0xfa}, + /* bit_width= */ 3, + /* expected= */ {0, 1, 2, 3, 4, 5, 6, 7}); + + { + std::vector expected = {0, 1, 2, 3, 4, 5, 6, 7}; + expected.resize(expected.size() + 200, 5); + TestRleBitPackedParser( + /* bytes= */ + {/* LEB128 for 8 values bit packed marker */ 0x3, + /* Bitpacked run */ 0x88, 0xc6, 0xfa, + /* LEB128 for 200 RLE marker */ 0x90, 0x3, + /* Value 5 padded to a byte */ 0x5}, + /* bit_width= */ 3, + /* expected= */ expected); + } + + { + std::vector expected = {0, 0, 0, 0, 1, 1, 1, 1}; + expected.resize(expected.size() + 200, 1); + expected.resize(expected.size() + 10, 3); + std::array run2 = {1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2}; + expected.insert(expected.end(), run2.begin(), run2.end()); + 
TestRleBitPackedParser( + /* bytes= */ + {/* LEB128 for 8 values bit packed marker */ 0x3, + /* Bitpacked run */ 0x0, 0x55, + /* LEB128 for 200 RLE marker */ 0x90, 0x3, + /* Value 1 padded to a byte */ 0x1, + /* LEB128 for 10 RLE marker */ 0x14, + /* Value 3 padded to a byte */ 0x3, + /* LEB128 for 16 values bit packed marker */ 0x5, + /* Bitpacked run */ 0x99, 0x99, 0x99, 0x99}, + /* bit_width= */ 2, + /* expected= */ expected); + } +} + // Validates encoding of values by encoding and decoding them. If // expected_encoding != NULL, also validates that the encoded buffer is // exactly 'expected_encoding'. // if expected_len is not -1, it will validate the encoded size is correct. -void ValidateRle(const std::vector& values, int bit_width, - uint8_t* expected_encoding, int expected_len) { +void ValidateRleBitPacked(const std::vector& values, int bit_width, + uint8_t* expected_encoding, int expected_len) { const int len = 64 * 1024; #ifdef __EMSCRIPTEN__ // don't make this on the stack as it is @@ -224,7 +517,7 @@ void ValidateRle(const std::vector& values, int bit_width, #endif EXPECT_LE(expected_len, len); - RleEncoder encoder(buffer, len, bit_width); + RleBitPackedEncoder encoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { bool result = encoder.Put(values[i]); EXPECT_TRUE(result); @@ -240,7 +533,7 @@ void ValidateRle(const std::vector& values, int bit_width, // Verify read { - RleDecoder decoder(buffer, len, bit_width); + RleBitPackedDecoder decoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { uint64_t val; bool result = decoder.Get(&val); @@ -251,7 +544,7 @@ // Verify batch read { - RleDecoder decoder(buffer, len, bit_width); + RleBitPackedDecoder decoder(buffer, len, bit_width); std::vector values_read(values.size()); ASSERT_EQ(values.size(), decoder.GetBatch(values_read.data(), static_cast(values.size()))); @@ -271,7 +564,7 @@ bool 
CheckRoundTrip(const std::vector& values, int bit_width) { #else uint8_t buffer[len]; #endif - RleEncoder encoder(buffer, len, bit_width); + RleBitPackedEncoder encoder(buffer, len, bit_width); for (size_t i = 0; i < values.size(); ++i) { bool result = encoder.Put(values[i]); if (!result) { @@ -282,7 +575,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { int out = 0; { - RleDecoder decoder(buffer, encoded_len, bit_width); + RleBitPackedDecoder decoder(buffer, encoded_len, bit_width); for (size_t i = 0; i < values.size(); ++i) { EXPECT_TRUE(decoder.Get(&out)); if (values[i] != out) { @@ -293,7 +586,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { // Verify batch read { - RleDecoder decoder(buffer, encoded_len, bit_width); + RleBitPackedDecoder decoder(buffer, encoded_len, bit_width); std::vector values_read(values.size()); if (static_cast(values.size()) != decoder.GetBatch(values_read.data(), static_cast(values.size()))) { @@ -308,7 +601,7 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { return true; } -TEST(Rle, SpecificSequences) { +TEST(RleBitPacked, SpecificSequences) { const int len = 1024; uint8_t expected_buffer[len]; std::vector values; @@ -328,12 +621,12 @@ TEST(Rle, SpecificSequences) { expected_buffer[2] = (50 << 1); expected_buffer[3] = 1; for (int width = 1; width <= 8; ++width) { - ValidateRle(values, width, expected_buffer, 4); + ValidateRleBitPacked(values, width, expected_buffer, 4); } for (int width = 9; width <= MAX_WIDTH; ++width) { - ValidateRle(values, width, nullptr, - 2 * (1 + static_cast(bit_util::CeilDiv(width, 8)))); + ValidateRleBitPacked(values, width, nullptr, + 2 * (1 + static_cast(bit_util::CeilDiv(width, 8)))); } // Test 100 0's and 1's alternating @@ -349,11 +642,11 @@ TEST(Rle, SpecificSequences) { expected_buffer[100 / 8 + 1] = 0x0A /* 0b00001010 */; // num_groups and expected_buffer only valid for bit width = 1 - ValidateRle(values, 1, expected_buffer, 1 + num_groups); + 
ValidateRleBitPacked(values, 1, expected_buffer, 1 + num_groups); for (int width = 2; width <= MAX_WIDTH; ++width) { int num_values = static_cast(bit_util::CeilDiv(100, 8)) * 8; - ValidateRle(values, width, nullptr, - 1 + static_cast(bit_util::CeilDiv(width * num_values, 8))); + ValidateRleBitPacked(values, width, nullptr, + 1 + static_cast(bit_util::CeilDiv(width * num_values, 8))); } // Test 16-bit values to confirm encoded values are stored in little endian @@ -371,7 +664,7 @@ TEST(Rle, SpecificSequences) { expected_buffer[4] = 0x55; expected_buffer[5] = 0xaa; - ValidateRle(values, 16, expected_buffer, 6); + ValidateRleBitPacked(values, 16, expected_buffer, 6); // Test 32-bit values to confirm encoded values are stored in little endian values.resize(28); @@ -392,7 +685,7 @@ TEST(Rle, SpecificSequences) { expected_buffer[8] = 0xaa; expected_buffer[9] = 0x5a; - ValidateRle(values, 32, expected_buffer, 10); + ValidateRleBitPacked(values, 32, expected_buffer, 10); } // ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value @@ -403,10 +696,10 @@ void TestRleValues(int bit_width, int num_vals, int value = -1) { for (int v = 0; v < num_vals; ++v) { values.push_back((value != -1) ? 
value : static_cast(v % mod)); } - ValidateRle(values, bit_width, NULL, -1); + ValidateRleBitPacked(values, bit_width, NULL, -1); } -TEST(Rle, TestValues) { +TEST(RleBitPacked, TestValues) { for (int width = 1; width <= MAX_WIDTH; ++width) { TestRleValues(width, 1); TestRleValues(width, 1024); @@ -415,11 +708,11 @@ TEST(Rle, TestValues) { } } -TEST(Rle, BitWidthZeroRepeated) { +TEST(RleBitPacked, BitWidthZeroRepeated) { uint8_t buffer[1]; const int num_values = 15; buffer[0] = num_values << 1; // repeated indicator byte - RleDecoder decoder(buffer, sizeof(buffer), 0); + RleBitPackedDecoder decoder(buffer, sizeof(buffer), 0); uint8_t val; for (int i = 0; i < num_values; ++i) { bool result = decoder.Get(&val); @@ -429,11 +722,11 @@ TEST(Rle, BitWidthZeroRepeated) { EXPECT_FALSE(decoder.Get(&val)); } -TEST(Rle, BitWidthZeroLiteral) { +TEST(RleBitPacked, BitWidthZeroLiteral) { uint8_t buffer[1]; const int num_groups = 4; buffer[0] = num_groups << 1 | 1; // literal indicator byte - RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0); + RleBitPackedDecoder decoder = {buffer, sizeof(buffer), 0}; const int num_values = num_groups * 8; uint8_t val; for (int i = 0; i < num_values; ++i) { @@ -450,13 +743,13 @@ TEST(BitRle, Flush) { std::vector values; for (int i = 0; i < 16; ++i) values.push_back(1); values.push_back(0); - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); values.push_back(1); - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); values.push_back(1); - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); values.push_back(1); - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); } // Test some random sequences. 
@@ -515,17 +808,17 @@ TEST(BitRle, RepeatedPattern) { } } - ValidateRle(values, 1, NULL, -1); + ValidateRleBitPacked(values, 1, NULL, -1); } TEST(BitRle, Overflow) { for (int bit_width = 1; bit_width < 32; bit_width += 3) { - int len = RleEncoder::MinBufferSize(bit_width); + int len = RleBitPackedEncoder::MinBufferSize(bit_width); std::vector buffer(len); int num_added = 0; bool parity = true; - RleEncoder encoder(buffer.data(), len, bit_width); + RleBitPackedEncoder encoder(buffer.data(), len, bit_width); // Insert alternating true/false until there is no space left while (true) { bool result = encoder.Put(parity); @@ -538,7 +831,7 @@ TEST(BitRle, Overflow) { EXPECT_LE(bytes_written, len); EXPECT_GT(num_added, 0); - RleDecoder decoder(buffer.data(), bytes_written, bit_width); + RleBitPackedDecoder decoder(buffer.data(), bytes_written, bit_width); parity = true; uint32_t v; for (int i = 0; i < num_added; ++i) { @@ -553,69 +846,300 @@ TEST(BitRle, Overflow) { } } +/// Check RleBitPacked encoding/decoding round trip. +/// +/// \param spaced If set to false, treat Nulls in the input array as regular data. +/// \param parts The number of parts in which the data will be decoded. +/// For number greater than one, this ensure that the decoder intermediate state +/// is valid. 
template -void CheckRoundTripSpaced(const Array& data, int bit_width) { +void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts, + std::shared_ptr dict = {}) { using ArrayType = typename TypeTraits::ArrayType; - using T = typename Type::c_type; + using value_type = typename Type::c_type; - int num_values = static_cast(data.length()); - int buffer_size = RleEncoder::MaxBufferSize(bit_width, num_values); + const int data_size = static_cast(data.length()); + const int data_values_count = + static_cast(data.length() - spaced * data.null_count()); + const int buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, data_size); + ASSERT_GE(parts, 1); + ASSERT_LE(parts, data_size); - const T* values = static_cast(data).raw_values(); + const value_type* data_values = static_cast(data).raw_values(); + // Encode the data into `buffer` using the encoder. std::vector buffer(buffer_size); - RleEncoder encoder(buffer.data(), buffer_size, bit_width); - for (int i = 0; i < num_values; ++i) { - if (data.IsValid(i)) { - if (!encoder.Put(static_cast(values[i]))) { - FAIL() << "Encoding failed"; - } + RleBitPackedEncoder encoder(buffer.data(), buffer_size, bit_width); + int32_t encoded_values_size = 0; + for (int i = 0; i < data_size; ++i) { + // Depending on `spaced` we treat nulls as regular values. 
+ if (data.IsValid(i) || !spaced) { + bool success = encoder.Put(static_cast(data_values[i])); + ASSERT_TRUE(success) << "Encoding failed in pos " << i; + ++encoded_values_size; } } - int encoded_size = encoder.Flush(); + int encoded_byte_size = encoder.Flush(); + ASSERT_EQ(encoded_values_size, data_values_count) + << "All values input were not encoded successfully by the encoder"; + + // Now we verify batch read + RleBitPackedDecoder decoder(buffer.data(), encoded_byte_size, bit_width); + // We will only use one of them depending on whether this is a dictionary tests + std::vector dict_read; + std::vector values_read; + if (dict) { + dict_read.resize(data_size); + } else { + values_read.resize(data_size); + } - // Verify batch read - RleDecoder decoder(buffer.data(), encoded_size, bit_width); - std::vector values_read(num_values); + // We will read the data in `parts` calls to make sure intermediate states are valid + rle_size_t total_read_count = 0; + while (total_read_count < data_size) { + const auto remaining = data_size - total_read_count; + auto to_read = data_size / parts; + if (remaining / to_read == 1) { + to_read = remaining; + } - if (num_values != decoder.GetBatchSpaced( - num_values, static_cast(data.null_count()), - data.null_bitmap_data(), data.offset(), values_read.data())) { - FAIL(); - } + rle_size_t read = 0; + if (spaced) { + // We need to slice the input array get the proper null count and bitmap + auto data_remaining = data.Slice(total_read_count, to_read); + + if (dict) { + auto* out = dict_read.data() + total_read_count; + read = decoder.GetBatchWithDictSpaced( + dict->raw_values(), static_cast(dict->length()), out, to_read, + static_cast(data_remaining->null_count()), + data_remaining->null_bitmap_data(), data_remaining->offset()); + } else { + auto* out = values_read.data() + total_read_count; + read = decoder.GetBatchSpaced( + to_read, static_cast(data_remaining->null_count()), + data_remaining->null_bitmap_data(), 
data_remaining->offset(), out); + } + } else { + if (dict) { + auto* out = dict_read.data() + total_read_count; + read = decoder.GetBatchWithDict( + dict->raw_values(), static_cast(dict->length()), out, to_read); + } else { + auto* out = values_read.data() + total_read_count; + read = decoder.GetBatch(out, to_read); + } + } + ASSERT_EQ(read, to_read) << "Decoder did not read as many values as requested"; - for (int64_t i = 0; i < num_values; ++i) { - if (data.IsValid(i)) { - if (values_read[i] != values[i]) { - FAIL() << "Index " << i << " read " << values_read[i] << " but should be " - << values[i]; + total_read_count += read; + } + EXPECT_EQ(total_read_count, data_size) << "Total number of values read is off"; + + // Verify the round trip: encoded-decoded values must equal the original one + for (int64_t i = 0; i < data_size; ++i) { + if (data.IsValid(i) || !spaced) { + if (dict) { + EXPECT_EQ(dict_read.at(i), dict->Value(data_values[i])) + << "Encoded then decoded and mapped value at position " << i << " (" + << values_read[i] << ") differs from original value (" << data_values[i] + << " mapped to " << dict->Value(data_values[i]) << ")"; + } else { + EXPECT_EQ(values_read.at(i), data_values[i]) + << "Encoded then decoded value at position " << i << " (" << values_read.at(i) + << ") differs from original value (" << data_values[i] << ")"; } } } } template -struct GetBatchSpacedTestCase { - T max_value; - int64_t size; +struct DataTestRleBitPackedRandomPart { + using value_type = T; + + value_type max; + int32_t size; + double null_probability; +}; + +template +struct DataTestRleBitPackedRepeatPart { + using value_type = T; + + value_type value; + int32_t size; double null_probability; - int bit_width; }; -TEST(RleDecoder, GetBatchSpaced) { - uint32_t kSeed = 1337; - ::arrow::random::RandomArrayGenerator rand(kSeed); +template +struct DataTestRleBitPackedNullPart { + using value_type = T; - std::vector> int32_cases{ - {1, 100000, 0.01, 1}, {1, 100000, 0.1, 1}, {1, 
100000, 0.5, 1}, - {4, 100000, 0.05, 3}, {100, 100000, 0.05, 7}, + int32_t size; +}; + +template +struct DataTestRleBitPacked { + using value_type = T; + using ArrowType = typename arrow::CTypeTraits::ArrowType; + using RandomPart = DataTestRleBitPackedRandomPart; + using RepeatPart = DataTestRleBitPackedRepeatPart; + using NullPart = DataTestRleBitPackedNullPart; + using AnyPart = std::variant; + + std::vector parts; + int32_t bit_width; + + std::shared_ptr<::arrow::Array> MakeArray( + ::arrow::random::RandomArrayGenerator& rand) const { + using Traits = arrow::TypeTraits; + + std::vector> arrays = {}; + + for (const auto& dyn_part : parts) { + if (auto* part = std::get_if(&dyn_part)) { + auto arr = rand.Numeric(part->size, /* min= */ value_type(0), + part->max, part->null_probability); + arrays.push_back(std::move(arr)); + + } else if (auto* part = std::get_if(&dyn_part)) { + auto arr = + rand.Numeric(part->size, /* min= */ part->value, + /* max= */ part->value, part->null_probability); + arrays.push_back(std::move(arr)); + + } else if (auto* part = std::get_if(&dyn_part)) { + EXPECT_OK_AND_ASSIGN( + auto arr, ::arrow::MakeArrayOfNull(Traits::type_singleton(), part->size)); + arrays.push_back(std::move(arr)); + } + } + ARROW_DCHECK_EQ(parts.size(), arrays.size()); + + return ::arrow::Concatenate(arrays).ValueOrDie(); + } +}; + +template +void DoTestGetBatchSpacedRoundtrip() { + using Data = DataTestRleBitPacked; + using ArrowType = typename Data::ArrowType; + using RandomPart = typename Data::RandomPart; + using NullPart = typename Data::NullPart; + using RepeatPart = typename Data::RepeatPart; + + std::vector test_cases = { + { + {RandomPart{/* max=*/1, /* size=*/400, /* null_proba= */ 0.1}}, + /* bit_width= */ 1, + }, + { + { + RandomPart{/* max=*/7, /* size=*/1037, /* null_proba= */ 0.0}, + NullPart{/* size= */ 1153}, + RandomPart{/* max=*/7, /* size=*/800, /* null_proba= */ 0.5}, + }, + /* bit_width= */ 3, + }, + { + { + NullPart{/* size= */ 80}, + 
RandomPart{/* max=*/static_cast(1023), /* size=*/800, + /* null_proba= */ 0.01}, + NullPart{/* size= */ 1023}, + }, + /* bit_width= */ 11, + }, + { + {RepeatPart{/* value=*/13, /* size=*/1024, /* null_proba= */ 0.01}}, + /* bit_width= */ 10, + }, + { + { + NullPart{/* size= */ 1024}, + RepeatPart{/* value=*/static_cast(10000), /* size=*/1025, + /* null_proba= */ 0.1}, + NullPart{/* size= */ 77}, + }, + /* bit_width= */ 23, + }, + { + { + RepeatPart{/* value=*/13, /* size=*/1023, /* null_proba= */ 0.0}, + NullPart{/* size= */ 1153}, + RepeatPart{/* value=*/72, /* size=*/1799, /* null_proba= */ 0.5}, + }, + /* bit_width= */ 10, + }, + { + { + RandomPart{/* max=*/1, /* size=*/1013, /* null_proba= */ 0.01}, + NullPart{/* size=*/8}, + RepeatPart{1, /* size= */ 256, /* null_proba= */ 0.1}, + NullPart{/* size=*/128}, + RepeatPart{0, /* size= */ 256, /* null_proba= */ 0.0}, + NullPart{/* size=*/15}, + RandomPart{/* max=*/1, /* size=*/1024, /* null_proba= */ 0.01}, + }, + /* bit_width= */ 1, + }, }; - for (auto case_ : int32_cases) { - auto arr = rand.Int32(case_.size, /*min=*/0, case_.max_value, case_.null_probability); - CheckRoundTripSpaced(*arr, case_.bit_width); - CheckRoundTripSpaced(*arr->Slice(1), case_.bit_width); + + ::arrow::random::RandomArrayGenerator rand(/* seed= */ 12); + // FRAGILE: we create a dictionary large enough so that any encoded value from the + // previous test cases can be used as an index in the dictionary. + // Its size must be increased accordingly if larger values are encoded in the test + // cases. + auto dict = std::static_pointer_cast(rand.Float32(20000, -1.0, 1.0)); + + // Number of bits available in T to write a positive integer. + constexpr int kBitsAvailable = 8 * sizeof(T) - (std::is_signed_v ? 
1 : 0); + + for (auto case_ : test_cases) { + if (case_.bit_width > kBitsAvailable) { + continue; + } + + auto array = case_.MakeArray(rand); + + // Tests for GetBatch + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ false, + /* parts= */ 1); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ false, + /* parts= */ 3); + + // Tests for GetBatchSpaced + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ true, + /* parts= */ 1); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ true, + /* parts= */ 7); + CheckRoundTrip(*array->Slice(1), case_.bit_width, /* spaced= */ true, + /* parts= */ 1); + + // Cannot test GetBatchWithDict with this method since unknown null values + + // Tests for GetBatchWithDictSpaced + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ true, /* parts= */ 1, + dict); + CheckRoundTrip(*array, case_.bit_width, /* spaced= */ true, /* parts= */ 5, + dict); } } -} // namespace util -} // namespace arrow +TEST(RleBitPacked, GetBatchSpacedRoundtripUint8) { + DoTestGetBatchSpacedRoundtrip(); +} +TEST(RleBitPacked, GetBatchSpacedRoundtripUint16) { + DoTestGetBatchSpacedRoundtrip(); +} +TEST(RleBitPacked, GetBatchSpacedRoundtripInt32) { + DoTestGetBatchSpacedRoundtrip(); +} +TEST(RleBitPacked, GetBatchSpacedRoundtripUInt32) { + DoTestGetBatchSpacedRoundtrip(); +} +TEST(RleBitPacked, GetBatchSpacedRoundtripUint64) { + DoTestGetBatchSpacedRoundtrip(); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/secure_string.cc b/cpp/src/arrow/util/secure_string.cc new file mode 100644 index 00000000000..bd52c55f312 --- /dev/null +++ b/cpp/src/arrow/util/secure_string.cc @@ -0,0 +1,198 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// __STDC_WANT_LIB_EXT1__ and string.h are required by memset_s: +// https://en.cppreference.com/w/c/string/byte/memset +#define __STDC_WANT_LIB_EXT1__ 1 +#include +#include + +#if defined(ARROW_USE_OPENSSL) +# include +# include +#endif + +#include "arrow/util/windows_compatibility.h" +#if defined(_WIN32) +# include +#endif + +#include "arrow/util/logging.h" +#include "arrow/util/secure_string.h" +#include "arrow/util/span.h" + +namespace arrow::util { + +/// Note: +/// A std::string is securely moved into a SecureString in two steps: +/// 1. the std::string is moved via std::move(string) +/// 2. the std::string is securely cleared +/// +/// The std::move has two different effects, depending on the size of the string. +/// A very short string (called local string) stores the string in a local buffer, +/// a long string stores a pointer to allocated memory that stores the string. +/// +/// If the string is a small string, std::move copies the local buffer. +/// If the string is a long string, std::move moves the pointer and then resets the +/// string size to 0 (which turns the string into a local string). +/// +/// In both cases, after a std::move(string), the string uses the local buffer. +/// +/// Thus, after a std::move(string), calling SecureClear(std::string*) only +/// securely clears the **local buffer** of the string. 
Therefore, std::move(string) +/// must move the pointer of long string into SecureString (which later clears the +/// string). Otherwise, the content of the string cannot be securely cleared. +/// +/// This condition is checked by SecureMove. + +namespace { +void SecureMove(std::string& string, std::string& dst) { + auto ptr = string.data(); + dst = std::move(string); + + // We require the buffer address string.data() to remain (not be freed) as is, + // or to be reused by dst. Otherwise, we cannot securely clear string after std::move + ARROW_CHECK(string.data() == ptr || dst.data() == ptr); +} +} // namespace + +void SecureString::SecureClear(std::string* secret) { + // call SecureClear first just in case secret->clear() frees some memory + SecureClear(reinterpret_cast(secret->data()), secret->capacity()); + secret->clear(); +} + +inline void SecureString::SecureClear(uint8_t* data, size_t size) { + // There is various prior art for this: + // https://www.cryptologie.net/article/419/zeroing-memory-compiler-optimizations-and-memset_s/ + // - libb2's `secure_zero_memory` at + // https://github.com/BLAKE2/libb2/blob/30d45a17c59dc7dbf853da3085b71d466275bd0a/src/blake2-impl.h#L140-L160 + // - libsodium's `sodium_memzero` at + // https://github.com/jedisct1/libsodium/blob/be58b2e6664389d9c7993b55291402934b43b3ca/src/libsodium/sodium/utils.c#L78:L101 + // Note: + // https://www.daemonology.net/blog/2014-09-06-zeroing-buffers-is-insufficient.html +#if defined(_WIN32) + // SecureZeroMemory is meant to not be optimized away + SecureZeroMemory(data, size); +#elif defined(__STDC_LIB_EXT1__) + // memset_s is meant to not be optimized away + memset_s(data, size, 0, size); +#elif defined(OPENSSL_VERSION_NUMBER) && OPENSSL_VERSION_NUMBER >= 0x30000000 + // rely on some implementation in OpenSSL cryptographic library + OPENSSL_cleanse(data, size); +#elif defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25)) + // explicit_bzero is meant to not be 
optimized away + explicit_bzero(data, size); +#else + // Volatile pointer to memset function is an attempt to avoid + // that the compiler optimizes away the memset function call. + // pretty much what OPENSSL_cleanse above does + // https://github.com/openssl/openssl/blob/3423c30db3aa044f46e1f0270e2ecd899415bf5f/crypto/mem_clr.c#L22 + static const volatile auto memset_v = &memset; + memset_v(data, 0, size); + +# if defined(__GNUC__) || defined(__clang__) + // __asm__ only supported by GCC and Clang + // not supported by MSVC on the ARM and x64 processors + // https://en.cppreference.com/w/c/language/asm.html + // https://en.cppreference.com/w/cpp/language/asm.html + + // Additional attempt on top of volatile memset_v above + // to avoid that the compiler optimizes away the memset function call. + // Assembler code that tells the compiler 'data' has side effects. + // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html: + // - "volatile": the asm produces side effects + // - "memory": effectively forms a read/write memory barrier for the compiler + __asm__ __volatile__("" /* no actual code */ + : /* no output */ + : "r"(data) /* input */ + : "memory" /* memory side effects beyond input and output */); +# endif +#endif +} + +SecureString::SecureString(SecureString&& other) noexcept { + SecureMove(other.secret_, secret_); + other.Dispose(); +} + +SecureString::SecureString(std::string&& secret) noexcept { + SecureMove(secret, secret_); + SecureClear(&secret); +} + +SecureString::SecureString(size_t n, char c) noexcept : secret_(n, c) {} + +SecureString& SecureString::operator=(SecureString&& other) noexcept { + if (this == &other) { + // self-assignment + return *this; + } + Dispose(); + SecureMove(other.secret_, secret_); + other.Dispose(); + return *this; +} + +SecureString& SecureString::operator=(const SecureString& other) { + if (this == &other) { + // self-assignment + return *this; + } + Dispose(); + secret_ = other.secret_; + return *this; +} + 
+SecureString& SecureString::operator=(std::string&& secret) noexcept { + Dispose(); + SecureMove(secret, secret_); + SecureClear(&secret); + return *this; +} + +bool SecureString::operator==(const SecureString& other) const { + return secret_ == other.secret_; +} + +bool SecureString::operator!=(const SecureString& other) const { + return secret_ != other.secret_; +} + +bool SecureString::empty() const { return secret_.empty(); } + +std::size_t SecureString::size() const { return secret_.size(); } + +std::size_t SecureString::length() const { return secret_.length(); } + +std::size_t SecureString::capacity() const { return secret_.capacity(); } + +span SecureString::as_span() { + return {reinterpret_cast(secret_.data()), secret_.size()}; +} + +span SecureString::as_span() const { + return {reinterpret_cast(secret_.data()), secret_.size()}; +} + +std::string_view SecureString::as_view() const { + return {secret_.data(), secret_.size()}; +} + +void SecureString::Dispose() { SecureClear(&secret_); } + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/secure_string.h b/cpp/src/arrow/util/secure_string.h new file mode 100644 index 00000000000..30088c78d4c --- /dev/null +++ b/cpp/src/arrow/util/secure_string.h @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/span.h" +#include "arrow/util/visibility.h" + +namespace arrow::util { +/** + * A secure string that ensures the wrapped string is cleared from memory on + * deconstruction. This class can only be created from std::string that are securely + * erased after creation. + * + * Note: This class does not provide a constructor / assignment operator that copies a + * std::string because that would allow code to create a SecureString while accidentally + * not noticing the need to securely erasing the argument after invoking the constructor / + * calling the assignment operator. + */ +class ARROW_EXPORT SecureString { + public: + SecureString() = default; + SecureString(SecureString&&) noexcept; + SecureString(const SecureString&) = default; + explicit SecureString(std::string&&) noexcept; + explicit SecureString(size_t, char) noexcept; + + SecureString& operator=(SecureString&&) noexcept; + SecureString& operator=(const SecureString&); + SecureString& operator=(std::string&&) noexcept; + + bool operator==(const SecureString&) const; + bool operator!=(const SecureString&) const; + + ~SecureString() { Dispose(); } + + [[nodiscard]] bool empty() const; + [[nodiscard]] std::size_t size() const; + [[nodiscard]] std::size_t length() const; + [[nodiscard]] std::size_t capacity() const; + + [[nodiscard]] span as_span(); + [[nodiscard]] span as_span() const; + [[nodiscard]] std::string_view as_view() const; + + void Dispose(); + + static void SecureClear(std::string*); + static void SecureClear(uint8_t* data, size_t size); + + private: + std::string secret_; +}; + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/secure_string_test.cc b/cpp/src/arrow/util/secure_string_test.cc new file mode 100644 index 00000000000..213a4b11f20 --- /dev/null +++ b/cpp/src/arrow/util/secure_string_test.cc @@ -0,0 +1,498 @@ 
+// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/util/secure_string.h" + +namespace arrow::util::test { + +#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER) +# define CAN_TEST_DEALLOCATED_AREAS 0 +#else +# define CAN_TEST_DEALLOCATED_AREAS 1 +#endif + +std::string_view StringArea(const std::string& string) { + return {string.data(), string.capacity()}; +} + +// same as GTest ASSERT_PRED_FORMAT2 macro, but without the outer GTEST_ASSERT_ +#define COMPARE(val1, val2) \ + ::testing::internal::EqHelper::Compare(#val1, #val2, val1, val2) + +::testing::AssertionResult IsSecurelyCleared(const std::string_view& area) { + // the entire area is filled with zeros + std::string zeros(area.size(), '\0'); + return COMPARE(area, std::string_view(zeros)); +} + +::testing::AssertionResult IsSecurelyCleared(const std::string& string) { + return IsSecurelyCleared(StringArea(string)); +} + +/** + * Checks the area has been securely cleared after some position. 
+ */ +::testing::AssertionResult IsSecurelyCleared(const std::string_view& area, + const size_t pos) { + // the area after pos is filled with zeros + if (pos < area.size()) { + std::string zeros(area.size() - pos, '\0'); + return COMPARE(area.substr(pos), std::string_view(zeros)); + } + return ::testing::AssertionSuccess(); +} + +/** + * Checks the area has been securely cleared from the secret value. + * Assumes the area has been deallocated, so it might have been reclaimed and changed + * after cleaning. We cannot check for all-zeros, best we can check here is no secret + * character has leaked. If by any chance the modification produced a former key character + * at the right position, this will be false negative / flaky. Therefore, we check for + * three consecutive secret characters before we fail. + */ +::testing::AssertionResult IsSecurelyCleared(const std::string_view& area, + const std::string& secret_value) { +#if !CAN_TEST_DEALLOCATED_AREAS + return testing::AssertionSuccess() << "Not checking deallocated memory"; +#else + // accessing deallocated memory will fail when running with Address Sanitizer enabled + auto leaks = 0; + for (size_t i = 0; i < std::min(area.length(), secret_value.length()); i++) { + if (area[i] == secret_value[i]) { + leaks++; + } else { + if (leaks >= 3) { + break; + } + leaks = 0; + } + } + if (leaks >= 3) { + return ::testing::AssertionFailure() + << leaks << " characters of secret leaked into " << area; + } + return ::testing::AssertionSuccess(); +#endif +} + +#undef COMPARE + +TEST(TestSecureString, AssertSecurelyCleared) { + // This tests AssertSecurelyCleared helper methods is actually able to identify secret + // leakage. It retrieves assertion results and asserts result type and message. 
+ testing::AssertionResult result = testing::AssertionSuccess(); + + // check short string with all zeros + auto short_zeros = std::string(8, '\0'); + short_zeros.resize(short_zeros.capacity(), '\0'); // for string buffers longer than 8 + short_zeros.resize(8); // now the entire string buffer has zeros + // checks the entire string buffer (capacity) + ASSERT_TRUE(IsSecurelyCleared(short_zeros)); + // checks only 10 bytes (length) + ASSERT_TRUE(IsSecurelyCleared(std::string_view(short_zeros))); + + // check long string with all zeros + auto long_zeros = std::string(1000, '\0'); + long_zeros.resize(long_zeros.capacity(), '\0'); // for longer string buffers + long_zeros.resize(1000); // now the entire string buffer has zeros + // checks the entire string buffer (capacity) + ASSERT_TRUE(IsSecurelyCleared(long_zeros)); + // checks only 1000 bytes (length) + ASSERT_TRUE(IsSecurelyCleared(std::string_view(long_zeros))); + + auto no_zeros = std::string("abcdefghijklmnopqrstuvwxyz"); + // string buffer in no_zeros can be larger than no_zeros.length() + // assert only the area that we can control + auto no_zeros_view = std::string_view(no_zeros); + result = IsSecurelyCleared(no_zeros_view); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "Expected equality of these values:\n" + " area\n" + " Which is: \"abcdefghijklmnopqrstuvwxyz\"\n" + " std::string_view(zeros)\n" + " Which is: " + "\"\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\" + "0\\0\\0\\0\\0\""); + + // check short string with zeros and non-zeros after string length + auto stars = std::string(12, '*'); + auto short_some_zeros = stars; + memset(short_some_zeros.data(), '\0', 8); + short_some_zeros.resize(8); + // string buffer in short_some_zeros can be larger than 12 + // assert only the area that we can control + auto short_some_zeros_view = std::string_view(short_some_zeros.data(), 12); + result = IsSecurelyCleared(short_some_zeros_view); + ASSERT_FALSE(result); + 
ASSERT_EQ(std::string(result.message()), + "Expected equality of these values:\n" + " area\n" + " Which is: \"\\0\\0\\0\\0\\0\\0\\0\\0\\0***\"\n" + " std::string_view(zeros)\n" + " Which is: \"\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\""); + + ASSERT_TRUE(IsSecurelyCleared(short_some_zeros, stars)); +#if CAN_TEST_DEALLOCATED_AREAS + result = IsSecurelyCleared(short_some_zeros_view, stars); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "3 characters of secret leaked into " + "\\0\\0\\0\\0\\0\\0\\0\\0\\0***"); +#endif + + // check long string with zeros and non-zeros after string length + stars = std::string(42, '*'); + auto long_some_zeros = stars; + memset(long_some_zeros.data(), '\0', 32); + long_some_zeros.resize(32); + // string buffer in long_some_zeros can be larger than 42 + // assert only the area that we can control + auto long_some_zeros_view = std::string_view(long_some_zeros.data(), 42); + result = IsSecurelyCleared(long_some_zeros_view); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "Expected equality of these values:\n" + " area\n" + " Which is: " + "\"\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\" + "0\\0\\0\\0\\0\\0\\0\\0\\0*********\"\n" + " std::string_view(zeros)\n" + " Which is: " + "\"\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\" + "0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\""); + + ASSERT_TRUE(IsSecurelyCleared(long_some_zeros, stars)); +#if CAN_TEST_DEALLOCATED_AREAS + result = IsSecurelyCleared(long_some_zeros_view, stars); + ASSERT_FALSE(result); + ASSERT_EQ(std::string(result.message()), + "9 characters of secret leaked into " + "\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0\\" + "0\\0\\0\\0\\0\\0\\0\\0\\0*********"); +#endif + + // check string with non-zeros and zeros after string length + auto some_zeros_back = std::string(no_zeros.length() + 3, '\0'); + some_zeros_back = no_zeros; + 
memset(some_zeros_back.data() + no_zeros.length() * sizeof(char), '\0', 3 + 1); + // string buffer in some_zeros_back can be larger than no_zeros.length() + 3 + // assert only the area that we can control + auto some_zeros_back_view = + std::string_view(some_zeros_back.data(), no_zeros.length() + 3); + ASSERT_TRUE(IsSecurelyCleared(some_zeros_back_view, no_zeros.length())); +} + +TEST(TestSecureString, SecureClearString) { + // short string + { + std::string tiny("abc"); + auto old_area = StringArea(tiny); + SecureString::SecureClear(&tiny); + ASSERT_TRUE(IsSecurelyCleared(tiny)); + ASSERT_TRUE(IsSecurelyCleared(old_area)); + } + + // long string + { + std::string large(1024, 'x'); + large.resize(512, 'y'); + auto old_area = StringArea(large); + SecureString::SecureClear(&large); + ASSERT_TRUE(IsSecurelyCleared(large)); + ASSERT_TRUE(IsSecurelyCleared(old_area)); + } + + // empty string + { + // this creates an empty string with some non-zero characters in the string buffer + // we test that all those characters are securely cleared + std::string empty("abcdef"); + empty.resize(0); + auto old_area = StringArea(empty); + SecureString::SecureClear(&empty); + ASSERT_TRUE(IsSecurelyCleared(empty)); + ASSERT_TRUE(IsSecurelyCleared(old_area)); + } +} + +TEST(TestSecureString, Construct) { + // We use a very short and a very long string as memory management of short and long + // strings behaves differently. 
+ std::vector strings = {"short secret", std::string(1024, 'x')}; + + for (const auto& original_string : strings) { + // move-constructing from a string either reuses its buffer or securely clears + // that string + std::string string = original_string; + auto old_string = StringArea(string); + SecureString secret_from_string(std::move(string)); + ASSERT_TRUE(IsSecurelyCleared(string)); + if (secret_from_string.as_view().data() != old_string.data()) { + ASSERT_TRUE(IsSecurelyCleared(old_string)); + } + ASSERT_FALSE(secret_from_string.empty()); + ASSERT_EQ(secret_from_string.as_view(), original_string); + + // move-constructing from a secure string securely clears that secure string + auto old_secret_from_string_view = secret_from_string.as_view(); + auto old_secret_from_string_value = std::string(secret_from_string.as_view()); + SecureString secret_from_move_secret(std::move(secret_from_string)); + ASSERT_TRUE(secret_from_string.empty()); + if (secret_from_move_secret.as_view().data() != old_secret_from_string_view.data()) { + ASSERT_TRUE(IsSecurelyCleared(old_secret_from_string_view)); + } + ASSERT_FALSE(secret_from_move_secret.empty()); + ASSERT_EQ(secret_from_move_secret.as_view(), old_secret_from_string_value); + + // copy-constructing from a secure string does not modify that secure string + SecureString secret_from_secret(secret_from_move_secret); + ASSERT_FALSE(secret_from_move_secret.empty()); + ASSERT_EQ(secret_from_move_secret.as_view(), old_secret_from_string_value); + ASSERT_FALSE(secret_from_secret.empty()); + ASSERT_EQ(secret_from_secret, secret_from_move_secret); + } +} + +TEST(TestSecureString, Assign) { + // We initialize with the first string and iteratively assign the subsequent values. + // The first two values are local (very short strings), the remainder are non-local + // strings. Memory management of short and long strings behaves differently. 
+ std::vector test_strings = {"secret", "another secret", + std::string(128, 'x'), std::string(1024, 'y')}; + for (auto& string : test_strings) { + // string buffer might be longer than string.length with arbitrary bytes + // secure string does not have to protect that garbage bytes + // zeroing here so we get expected results + auto length = string.length(); + string.resize(string.capacity(), '\0'); + string.resize(length); + } + + std::vector reverse_strings = std::vector(test_strings); + std::reverse(reverse_strings.begin(), reverse_strings.end()); + + for (auto vec : {test_strings, reverse_strings}) { + auto init_string = vec[0]; + auto strings = std::vector(vec.begin() + 1, vec.end()); + + { + // an initialized secure string + std::string init_string_copy(init_string); + SecureString secret_from_string(std::move(init_string_copy)); + + // move-assigning from a string securely clears that string + // the earlier value of the secure string is securely cleared + for (const auto& string : strings) { + auto string_copy = std::string(string); + auto old_string_copy_area = StringArea(string_copy); + ASSERT_FALSE(string.empty()); + ASSERT_FALSE(string_copy.empty()); + auto old_secret_from_string_area = secret_from_string.as_view(); + auto old_secret_from_string_value = std::string(secret_from_string.as_view()); + + secret_from_string = std::move(string_copy); + + ASSERT_FALSE(string.empty()); + ASSERT_TRUE(string_copy.empty()); + ASSERT_TRUE(IsSecurelyCleared(string_copy)); + auto secret_from_string_view = secret_from_string.as_view(); + // the secure string can reuse the string_copy's string buffer after assignment + // then, string_copy's string buffer is obviously not cleared + if (secret_from_string_view.data() != old_string_copy_area.data()) { + ASSERT_TRUE(IsSecurelyCleared(old_string_copy_area, string)); + } + ASSERT_FALSE(secret_from_string.empty()); + ASSERT_EQ(secret_from_string.size(), string.size()); + ASSERT_EQ(secret_from_string.length(), 
string.length()); + ASSERT_EQ(secret_from_string_view, string); + if (secret_from_string_view.data() == old_secret_from_string_area.data()) { + // when secure string reuses the buffer, the old value must be cleared + ASSERT_TRUE( + IsSecurelyCleared(old_secret_from_string_area, secret_from_string.size())); + } else { + // when secure string has a new buffer, the old buffer must be cleared + ASSERT_TRUE(IsSecurelyCleared(old_secret_from_string_area, + old_secret_from_string_value)); + } + } + } + + { + // an initialized secure string + std::string init_string_copy(init_string); + SecureString secret_from_move_secret(std::move(init_string_copy)); + + // move-assigning from a secure string securely clears that secure string + // the earlier value of the secure string is securely cleared + for (const auto& string : strings) { + auto string_copy = std::string(string); + SecureString secret_string(std::move(string_copy)); + ASSERT_FALSE(string.empty()); + ASSERT_TRUE(string_copy.empty()); + ASSERT_FALSE(secret_string.empty()); + auto old_secret_string_area = secret_string.as_view(); + auto old_secret_string_value = std::string(secret_string.as_view()); + auto old_secret_from_move_secret_area = secret_from_move_secret.as_view(); + auto old_secret_from_move_secret_value = + std::string(secret_from_move_secret.as_view()); + + secret_from_move_secret = std::move(secret_string); + + ASSERT_TRUE(secret_string.empty()); + auto secret_from_move_secret_view = secret_from_move_secret.as_view(); + // the secure string can reuse the string_copy's string buffer after assignment + // then, string_copy's string buffer is obviously not cleared + if (old_secret_string_area.data() != secret_from_move_secret_view.data()) { + ASSERT_TRUE(IsSecurelyCleared(old_secret_string_area, + old_secret_from_move_secret_value)); + } + ASSERT_FALSE(secret_from_move_secret.empty()); + ASSERT_EQ(secret_from_move_secret.size(), string.size()); + ASSERT_EQ(secret_from_move_secret.length(), string.length()); 
+ ASSERT_EQ(secret_from_move_secret_view, string); + if (old_secret_from_move_secret_area.data() == + secret_from_move_secret_view.data()) { + // when secure string reuses the buffer, the old value must be cleared + ASSERT_TRUE(IsSecurelyCleared(old_secret_from_move_secret_area, + secret_from_move_secret.size())); + } else { + // when secure string has a new buffer, the old buffer must be cleared + ASSERT_TRUE(IsSecurelyCleared(old_secret_from_move_secret_area, + old_secret_from_move_secret_value)); + } + } + } + + { + // an initialized secure string + std::string init_string_copy(init_string); + SecureString secret_from_copy_secret(std::move(init_string_copy)); + + // copy-assigning from a secure string does not modify that secure string + // the earlier value of the secure string is securely cleared + for (const auto& string : strings) { + auto string_copy = std::string(string); + SecureString secret_string(std::move(string_copy)); + ASSERT_FALSE(string.empty()); + ASSERT_TRUE(string_copy.empty()); + ASSERT_FALSE(secret_string.empty()); + auto old_secret_from_copy_secret_area = secret_from_copy_secret.as_view(); + auto old_secret_from_copy_secret_value = + std::string(secret_from_copy_secret.as_view()); + + secret_from_copy_secret = secret_string; + + ASSERT_FALSE(secret_string.empty()); + ASSERT_FALSE(secret_from_copy_secret.empty()); + ASSERT_EQ(secret_from_copy_secret.size(), string.size()); + ASSERT_EQ(secret_from_copy_secret.length(), string.length()); + ASSERT_EQ(secret_from_copy_secret.as_view(), string); + if (old_secret_from_copy_secret_area.data() == + secret_from_copy_secret.as_view().data()) { + // when secure string reuses the buffer, the old value must be cleared + ASSERT_TRUE(IsSecurelyCleared(old_secret_from_copy_secret_area, + secret_from_copy_secret.size())); + } else { + // when secure string has a new buffer, the old buffer must be cleared + ASSERT_TRUE(IsSecurelyCleared(old_secret_from_copy_secret_area, + old_secret_from_copy_secret_value)); 
+ } + } + } + } +} + +TEST(TestSecureString, Deconstruct) { +#if !CAN_TEST_DEALLOCATED_AREAS + GTEST_SKIP() << "Test accesses deallocated memory"; +#else + // We use a very short and a very long string as memory management of short and long + // strings behaves differently. + std::vector strings = {"short secret", std::string(1024, 'x')}; + + for (auto& string : strings) { + auto old_string_value = string; + std::string_view view; + { + // construct secret + auto secret = SecureString(std::move(string)); + // memorize view + view = secret.as_view(); + // deconstruct secret on leaving this context + } + // assert secret memory is cleared on deconstruction + ASSERT_TRUE(IsSecurelyCleared(view, old_string_value)); + // so is the string (tested more thoroughly elsewhere) + ASSERT_TRUE(IsSecurelyCleared(string)); + } +#endif +} + +TEST(TestSecureString, Compare) { + ASSERT_TRUE(SecureString("") == SecureString("")); + ASSERT_FALSE(SecureString("") != SecureString("")); + + ASSERT_TRUE(SecureString("hello world") == SecureString("hello world")); + ASSERT_FALSE(SecureString("hello world") != SecureString("hello world")); + + ASSERT_FALSE(SecureString("hello world") == SecureString("hello worlds")); + ASSERT_TRUE(SecureString("hello world") != SecureString("hello worlds")); +} + +TEST(TestSecureString, Cardinality) { + ASSERT_TRUE(SecureString("").empty()); + ASSERT_EQ(SecureString("").size(), 0); + ASSERT_EQ(SecureString("").length(), 0); + + ASSERT_FALSE(SecureString("hello world").empty()); + ASSERT_EQ(SecureString("hello world").size(), 11); + ASSERT_EQ(SecureString("hello world").length(), 11); +} + +TEST(TestSecureString, AsSpan) { + SecureString secret("hello world"); + const SecureString& const_secret(secret); + auto const_span = const_secret.as_span(); + auto mutable_span = secret.as_span(); + + std::string expected = "hello world"; + span expected_span = {reinterpret_cast(expected.data()), expected.size()}; + ASSERT_EQ(const_span, expected_span); + 
ASSERT_EQ(mutable_span, expected_span); + + // modify secret through mutable span + // the const span shares the same secret, so it is changed as well + mutable_span[0] = 'H'; + expected_span[0] = 'H'; + ASSERT_EQ(const_span, expected_span); + ASSERT_EQ(mutable_span, expected_span); +} + +TEST(TestSecureString, AsView) { + const SecureString secret = SecureString("hello world"); + const std::string_view view = secret.as_view(); + ASSERT_EQ(view, "hello world"); +} + +#undef CAN_TEST_DEALLOCATED_AREAS + +} // namespace arrow::util::test diff --git a/cpp/src/arrow/util/small_vector.h b/cpp/src/arrow/util/small_vector.h index 52e191c4c07..f371e647152 100644 --- a/cpp/src/arrow/util/small_vector.h +++ b/cpp/src/arrow/util/small_vector.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "arrow/util/aligned_storage.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/util/sort.h b/cpp/src/arrow/util/sort_internal.h similarity index 100% rename from cpp/src/arrow/util/sort.h rename to cpp/src/arrow/util/sort_internal.h diff --git a/cpp/src/arrow/util/spaced.h b/cpp/src/arrow/util/spaced.h deleted file mode 100644 index 8265e1d22ae..00000000000 --- a/cpp/src/arrow/util/spaced.h +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied.
See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/util/bit_run_reader.h" - -namespace arrow { -namespace util { -namespace internal { - -/// \brief Compress the buffer to spaced, excluding the null entries. -/// -/// \param[in] src the source buffer -/// \param[in] num_values the size of source buffer -/// \param[in] valid_bits bitmap data indicating position of valid slots -/// \param[in] valid_bits_offset offset into valid_bits -/// \param[out] output the output buffer spaced -/// \return The size of spaced buffer. -template -inline int SpacedCompress(const T* src, int num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset, T* output) { - int num_valid_values = 0; - - arrow::internal::SetBitRunReader reader(valid_bits, valid_bits_offset, num_values); - while (true) { - const auto run = reader.NextRun(); - if (run.length == 0) { - break; - } - std::memcpy(output + num_valid_values, src + run.position, run.length * sizeof(T)); - num_valid_values += static_cast(run.length); - } - - return num_valid_values; -} - -/// \brief Relocate values in buffer into positions of non-null values as indicated by -/// a validity bitmap. -/// -/// \param[in, out] buffer the in-place buffer -/// \param[in] num_values total size of buffer including null slots -/// \param[in] null_count number of null slots -/// \param[in] valid_bits bitmap data indicating position of valid slots -/// \param[in] valid_bits_offset offset into valid_bits -/// \return The number of values expanded, including nulls. -template -inline int SpacedExpand(T* buffer, int num_values, int null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset) { - // Point to end as we add the spacing from the back. 
- int idx_decode = num_values - null_count; - - // Depending on the number of nulls, some of the value slots in buffer may - // be uninitialized, and this will cause valgrind warnings / potentially UB - std::memset(static_cast(buffer + idx_decode), 0, null_count * sizeof(T)); - if (idx_decode == 0) { - // All nulls, nothing more to do - return num_values; - } - - arrow::internal::ReverseSetBitRunReader reader(valid_bits, valid_bits_offset, - num_values); - while (true) { - const auto run = reader.NextRun(); - if (run.length == 0) { - break; - } - idx_decode -= static_cast(run.length); - assert(idx_decode >= 0); - std::memmove(buffer + run.position, buffer + idx_decode, run.length * sizeof(T)); - } - - // Otherwise caller gave an incorrect null_count - assert(idx_decode == 0); - return num_values; -} - -} // namespace internal -} // namespace util -} // namespace arrow diff --git a/cpp/src/arrow/util/spaced_internal.h b/cpp/src/arrow/util/spaced_internal.h new file mode 100644 index 00000000000..b43cbdd751e --- /dev/null +++ b/cpp/src/arrow/util/spaced_internal.h @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/util/bit_run_reader.h" + +namespace arrow::util::internal { + +/// \brief Compress the buffer to spaced, excluding the null entries. +/// +/// \param[in] src the source buffer +/// \param[in] num_values the size of source buffer +/// \param[in] valid_bits bitmap data indicating position of valid slots +/// \param[in] valid_bits_offset offset into valid_bits +/// \param[out] output the output buffer spaced +/// \return The size of spaced buffer. +template +inline int SpacedCompress(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset, T* output) { + int num_valid_values = 0; + + arrow::internal::SetBitRunReader reader(valid_bits, valid_bits_offset, num_values); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + std::memcpy(output + num_valid_values, src + run.position, run.length * sizeof(T)); + num_valid_values += static_cast(run.length); + } + + return num_valid_values; +} + +/// \brief Relocate values according to a validity bitmap, to the right +/// +/// Non-null values should initially be densely packed at the left of the buffer. +/// This method spreads the values out according to the given validity bitmap. +/// Null entries are zero-initialized. +/// +/// \param[in, out] buffer the in-place buffer +/// \param[in] num_values total size of buffer including null slots +/// \param[in] null_count number of null slots +/// \param[in] valid_bits bitmap data indicating position of valid slots +/// \param[in] valid_bits_offset offset into valid_bits +template +inline void SpacedExpandRightward(T* buffer, int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + // Point to end as we add the spacing from the back. 
+ int idx_decode = num_values - null_count; + + // Depending on the number of nulls, some of the value slots in buffer may + // be uninitialized, and this will cause valgrind warnings / potentially UB + memset(static_cast(buffer + idx_decode), 0, null_count * sizeof(T)); + if (idx_decode == 0) { + // All nulls, nothing more to do + return; + } + + arrow::internal::ReverseSetBitRunReader reader(valid_bits, valid_bits_offset, + num_values); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + idx_decode -= static_cast(run.length); + assert(idx_decode >= 0); + if (idx_decode == run.position) { + // We have come to the point where no more expansion is required: the remaining + // values are already in their final position. + return; + } + // Source and destination may overlap if run.length > 1 + memmove(buffer + run.position, buffer + idx_decode, run.length * sizeof(T)); + } + + // Otherwise caller gave an incorrect null_count + assert(idx_decode == 0); +} + +/// \brief Relocate values according to a validity bitmap, to the left +/// +/// Non-null values should initially be densely packed at the right of the buffer. +/// This method spreads the values out according to the given validity bitmap. +/// Null entries are zero-initialized. +/// +/// \param[in, out] buffer the in-place buffer +/// \param[in] byte_width the byte width of values +/// \param[in] length total length of buffer including null slots +/// \param[in] null_count number of null slots +/// \param[in] valid_bits bitmap data indicating position of valid slots +/// \param[in] valid_bits_offset offset into valid_bits +inline void SpacedExpandLeftward(uint8_t* buffer, int byte_width, int64_t length, + int64_t null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset) { + // Point to start of values. 
+ int64_t idx_decode = byte_width * null_count; + + // Depending on the number of nulls, some of the value slots in buffer may + // be uninitialized, and this will cause valgrind warnings / potentially UB + memset(buffer, 0, idx_decode); + + arrow::internal::SetBitRunReader reader(valid_bits, valid_bits_offset, length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + if (idx_decode == run.position * byte_width) { + // We have come to the point where no more expansion is required: the remaining + // values are already in their final position. + return; + } + // Source and destination may overlap if run.length > 1 + memmove(buffer + run.position * byte_width, buffer + idx_decode, + run.length * byte_width); + idx_decode += run.length * byte_width; + } + + // Otherwise caller gave an incorrect null_count + assert(idx_decode == length * byte_width); +} + +} // namespace arrow::util::internal diff --git a/cpp/src/arrow/util/span.h b/cpp/src/arrow/util/span.h index 8a84d028b2a..1e57ee8c8d1 100644 --- a/cpp/src/arrow/util/span.h +++ b/cpp/src/arrow/util/span.h @@ -90,7 +90,7 @@ writing code which would break when it is replaced by std::span.)"); return out; } - constexpr bool operator==(span const& other) const { + constexpr bool operator==(const span& other) const { if (size_ != other.size_) return false; if constexpr (std::is_integral_v) { @@ -106,7 +106,7 @@ writing code which would break when it is replaced by std::span.)"); return true; } } - constexpr bool operator!=(span const& other) const { return !(*this == other); } + constexpr bool operator!=(const span& other) const { return !(*this == other); } private: T* data_{}; @@ -121,7 +121,7 @@ span(T*, size_t) -> span; template constexpr span as_bytes(span s) { - return {reinterpret_cast(s.data()), s.size_bytes()}; + return {reinterpret_cast(s.data()), s.size_bytes()}; } template diff --git a/cpp/src/arrow/util/stl_util_test.cc b/cpp/src/arrow/util/stl_util_test.cc index 
3f16051f1df..836469bc065 100644 --- a/cpp/src/arrow/util/stl_util_test.cc +++ b/cpp/src/arrow/util/stl_util_test.cc @@ -22,7 +22,7 @@ #include #include "arrow/testing/gtest_util.h" -#include "arrow/util/sort.h" +#include "arrow/util/sort_internal.h" #include "arrow/util/string.h" #include "arrow/util/vector.h" diff --git a/cpp/src/arrow/util/stopwatch.h b/cpp/src/arrow/util/stopwatch_internal.h similarity index 100% rename from cpp/src/arrow/util/stopwatch.h rename to cpp/src/arrow/util/stopwatch_internal.h diff --git a/cpp/src/arrow/util/string_builder.cc b/cpp/src/arrow/util/string_builder.cc deleted file mode 100644 index ae526494141..00000000000 --- a/cpp/src/arrow/util/string_builder.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/util/string_builder.h" - -#include -#include - -namespace arrow { - -namespace util { -namespace detail { - -StringStreamWrapper::StringStreamWrapper() - : sstream_(std::make_unique()), ostream_(*sstream_) {} - -StringStreamWrapper::~StringStreamWrapper() {} - -std::string StringStreamWrapper::str() { return sstream_->str(); } - -} // namespace detail -} // namespace util -} // namespace arrow diff --git a/cpp/src/arrow/util/string_builder.h b/cpp/src/arrow/util/string_builder.h deleted file mode 100644 index 448fb57d7a7..00000000000 --- a/cpp/src/arrow/util/string_builder.h +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
template - -#pragma once - -#include -#include -#include -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -namespace detail { - -class ARROW_EXPORT StringStreamWrapper { - public: - StringStreamWrapper(); - ~StringStreamWrapper(); - - std::ostream& stream() { return ostream_; } - std::string str(); - - protected: - std::unique_ptr sstream_; - std::ostream& ostream_; -}; - -} // namespace detail - -template -void StringBuilderRecursive(std::ostream& stream, Head&& head) { - if constexpr (std::is_floating_point_v>) { - // Avoid losing precision when printing floating point numbers - stream << std::to_string(head); - } else { - stream << head; - } -} - -template -void StringBuilderRecursive(std::ostream& stream, Head&& head, Tail&&... tail) { - StringBuilderRecursive(stream, std::forward(head)); - StringBuilderRecursive(stream, std::forward(tail)...); -} - -template -std::string StringBuilder(Args&&... args) { - detail::StringStreamWrapper ss; - StringBuilderRecursive(ss.stream(), std::forward(args)...); - return ss.str(); -} - -/// CRTP helper for declaring string representation. Defines operator<< -template -class ToStringOstreamable { - public: - ~ToStringOstreamable() { - static_assert( - std::is_same().ToString()), std::string>::value, - "ToStringOstreamable depends on the method T::ToString() const"); - } - - private: - const T& cast() const { return static_cast(*this); } - - friend inline std::ostream& operator<<(std::ostream& os, const ToStringOstreamable& t) { - return os << t.cast().ToString(); - } -}; - -} // namespace util -} // namespace arrow diff --git a/cpp/src/arrow/util/string_util.cc b/cpp/src/arrow/util/string_util.cc new file mode 100644 index 00000000000..6889b890972 --- /dev/null +++ b/cpp/src/arrow/util/string_util.cc @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/string_util.h" + +#include +#include + +namespace arrow { +namespace internal { + +StringStreamWrapper::StringStreamWrapper() + : sstream_(std::make_unique()), ostream_(*sstream_) {} + +StringStreamWrapper::~StringStreamWrapper() {} + +std::string StringStreamWrapper::str() { return sstream_->str(); } + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/string_util.h b/cpp/src/arrow/util/string_util.h new file mode 100644 index 00000000000..3f7803dc92d --- /dev/null +++ b/cpp/src/arrow/util/string_util.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { + +class ARROW_EXPORT StringStreamWrapper { + public: + StringStreamWrapper(); + ~StringStreamWrapper(); + + std::ostream& stream() { return ostream_; } + std::string str(); + + protected: + std::unique_ptr sstream_; + std::ostream& ostream_; +}; + +template +std::string JoinToString(Args&&... args) { + StringStreamWrapper ss; + ( + [&ss](auto&& arg) { + // Avoid losing precision when printing floating point numbers + if constexpr (std::is_floating_point_v>) { + ss.stream() << std::to_string(arg); + } else { + ss.stream() << arg; + } + }(std::forward(args)), + ...); + return ss.str(); +} +} // namespace internal + +namespace util { +/// CRTP helper for declaring string representation. Defines operator<< +template +class ToStringOstreamable { + public: + ~ToStringOstreamable() { + static_assert( + std::is_same().ToString()), std::string>::value, + "ToStringOstreamable depends on the method T::ToString() const"); + } + + private: + const T& cast() const { return static_cast(*this); } + + friend inline std::ostream& operator<<(std::ostream& os, const ToStringOstreamable& t) { + return os << t.cast().ToString(); + } +}; + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/tdigest.cc b/cpp/src/arrow/util/tdigest.cc index ec92fabed8e..36a83fb336d 100644 --- a/cpp/src/arrow/util/tdigest.cc +++ b/cpp/src/arrow/util/tdigest.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License.
-#include "arrow/util/tdigest.h" +#include "arrow/util/tdigest_internal.h" #include #include diff --git a/cpp/src/arrow/util/tdigest_benchmark.cc b/cpp/src/arrow/util/tdigest_benchmark.cc index d9cd632c394..f0bf4234125 100644 --- a/cpp/src/arrow/util/tdigest_benchmark.cc +++ b/cpp/src/arrow/util/tdigest_benchmark.cc @@ -19,7 +19,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" -#include "arrow/util/tdigest.h" +#include "arrow/util/tdigest_internal.h" namespace arrow { namespace util { diff --git a/cpp/src/arrow/util/tdigest.h b/cpp/src/arrow/util/tdigest_internal.h similarity index 100% rename from cpp/src/arrow/util/tdigest.h rename to cpp/src/arrow/util/tdigest_internal.h diff --git a/cpp/src/arrow/util/tdigest_test.cc b/cpp/src/arrow/util/tdigest_test.cc index 63395b676a6..04742ec46d8 100644 --- a/cpp/src/arrow/util/tdigest_test.cc +++ b/cpp/src/arrow/util/tdigest_test.cc @@ -33,7 +33,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" -#include "arrow/util/tdigest.h" +#include "arrow/util/tdigest_internal.h" namespace arrow { namespace internal { diff --git a/cpp/src/arrow/util/thread_pool.cc b/cpp/src/arrow/util/thread_pool.cc index 33531b384d0..bf107006f8b 100644 --- a/cpp/src/arrow/util/thread_pool.cc +++ b/cpp/src/arrow/util/thread_pool.cc @@ -171,9 +171,10 @@ Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce task, "Attempt to schedule a task on a serial executor that has already finished or " "been abandoned"); } - state->task_queue.push(QueuedTask{std::move(task), std::move(stop_token), - std::move(stop_callback), hints.priority, - state_->spawned_tasks_count_++}); + state->task_queue.push( + QueuedTask{{std::move(task), std::move(stop_token), std::move(stop_callback)}, + hints.priority, + state_->spawned_tasks_count_++}); } state->wait_for_tasks.notify_one(); return Status::OK(); @@ -208,9 +209,10 @@ Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce task, 
"been abandoned"); } - state_->task_queue.push(QueuedTask{std::move(task), std::move(stop_token), - std::move(stop_callback), hints.priority, - state_->spawned_tasks_count_++}); + state_->task_queue.push( + QueuedTask{{std::move(task), std::move(stop_token), std::move(stop_callback)}, + hints.priority, + state_->spawned_tasks_count_++}); return Status::OK(); } @@ -730,19 +732,23 @@ static int ParseOMPEnvVar(const char* name) { } int ThreadPool::DefaultCapacity() { - int capacity, limit; - capacity = ParseOMPEnvVar("OMP_NUM_THREADS"); - if (capacity == 0) { - capacity = std::thread::hardware_concurrency(); + int capacity = ParseOMPEnvVar("OMP_NUM_THREADS"); + if (capacity <= 0) { + capacity = static_cast(GetNumAffinityCores().ValueOr(0)); } - limit = ParseOMPEnvVar("OMP_THREAD_LIMIT"); - if (limit > 0) { - capacity = std::min(limit, capacity); + if (capacity <= 0) { + capacity = static_cast(std::thread::hardware_concurrency()); } - if (capacity == 0) { - ARROW_LOG(WARNING) << "Failed to determine the number of available threads, " - "using a hardcoded arbitrary value"; + if (capacity <= 0) { capacity = 4; + ARROW_LOG(WARNING) << "Failed to determine the number of available threads, " + "using a hardcoded arbitrary value of " + << capacity; + } + + const int limit = ParseOMPEnvVar("OMP_THREAD_LIMIT"); + if (limit > 0) { + capacity = std::min(limit, capacity); } return capacity; } diff --git a/cpp/src/arrow/util/thread_pool.h b/cpp/src/arrow/util/thread_pool.h index cd32781aed7..201b8cef790 100644 --- a/cpp/src/arrow/util/thread_pool.h +++ b/cpp/src/arrow/util/thread_pool.h @@ -475,6 +475,7 @@ class ARROW_EXPORT ThreadPool : public Executor { // Heuristic for the default capacity of a thread pool for CPU-bound tasks. // This is exposed as a static method to help with testing. + // The number returned is guaranteed to be greater or equal to one. static int DefaultCapacity(); // Shutdown the pool. 
Once the pool starts shutting down, new tasks @@ -592,9 +593,11 @@ typename Fut::SyncType RunSynchronously(FnOnce get_future, } /// \brief Potentially iterate an async generator serially (if use_threads is false) +/// using a potentially custom Executor /// \see IterateGenerator /// -/// If `use_threads` is true, the global CPU executor will be used. Each call to +/// If `use_threads` is true, the custom executor or, if null, +/// the global CPU executor will be used. Each call to /// the iterator will simply wait until the next item is available. Tasks may run in /// the background between calls. /// @@ -604,9 +607,11 @@ typename Fut::SyncType RunSynchronously(FnOnce get_future, /// calls. template Iterator IterateSynchronously( - FnOnce()>>(Executor*)> get_gen, bool use_threads) { + FnOnce()>>(Executor*)> get_gen, bool use_threads, + Executor* executor) { if (use_threads) { - auto maybe_gen = std::move(get_gen)(GetCpuThreadPool()); + auto used_executor = executor != NULLPTR ? executor : GetCpuThreadPool(); + auto maybe_gen = std::move(get_gen)(used_executor); if (!maybe_gen.ok()) { return MakeErrorIterator(maybe_gen.status()); } @@ -616,5 +621,23 @@ Iterator IterateSynchronously( } } +/// \brief Potentially iterate an async generator serially (if use_threads is false) +/// using the default CPU thread pool +/// \see IterateGenerator +/// +/// If `use_threads` is true, the global CPU executor will be used. Each call to +/// the iterator will simply wait until the next item is available. Tasks may run in +/// the background between calls. +/// +/// If `use_threads` is false, the calling thread only will be used. Each call to +/// the iterator will use the calling thread to do enough work to generate one item. +/// Tasks will be left in a queue until the next call and no work will be done between +/// calls. 
+template +Iterator IterateSynchronously( + FnOnce()>>(Executor*)> get_gen, bool use_threads) { + return IterateSynchronously(std::move(get_gen), use_threads, NULLPTR); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/thread_pool_test.cc b/cpp/src/arrow/util/thread_pool_test.cc index 2c831460302..45441fa3216 100644 --- a/cpp/src/arrow/util/thread_pool_test.cc +++ b/cpp/src/arrow/util/thread_pool_test.cc @@ -1039,35 +1039,46 @@ TEST(TestGlobalThreadPool, Capacity) { // Exercise default capacity heuristic ASSERT_OK(DelEnvVar("OMP_NUM_THREADS")); ASSERT_OK(DelEnvVar("OMP_THREAD_LIMIT")); + int hw_capacity = std::thread::hardware_concurrency(); - ASSERT_EQ(ThreadPool::DefaultCapacity(), hw_capacity); + ASSERT_LE(ThreadPool::DefaultCapacity(), hw_capacity); + ASSERT_GE(ThreadPool::DefaultCapacity(), 1); + ASSERT_OK(SetEnvVar("OMP_NUM_THREADS", "13")); ASSERT_EQ(ThreadPool::DefaultCapacity(), 13); + ASSERT_OK(SetEnvVar("OMP_NUM_THREADS", "7,5,13")); ASSERT_EQ(ThreadPool::DefaultCapacity(), 7); ASSERT_OK(DelEnvVar("OMP_NUM_THREADS")); ASSERT_OK(SetEnvVar("OMP_THREAD_LIMIT", "1")); ASSERT_EQ(ThreadPool::DefaultCapacity(), 1); + ASSERT_OK(SetEnvVar("OMP_THREAD_LIMIT", "999")); - if (hw_capacity <= 999) { - ASSERT_EQ(ThreadPool::DefaultCapacity(), hw_capacity); - } + ASSERT_LE(ThreadPool::DefaultCapacity(), std::min(999, hw_capacity)); + ASSERT_GE(ThreadPool::DefaultCapacity(), 1); + ASSERT_OK(SetEnvVar("OMP_NUM_THREADS", "6,5,13")); ASSERT_EQ(ThreadPool::DefaultCapacity(), 6); + ASSERT_OK(SetEnvVar("OMP_THREAD_LIMIT", "2")); ASSERT_EQ(ThreadPool::DefaultCapacity(), 2); // Invalid env values ASSERT_OK(SetEnvVar("OMP_NUM_THREADS", "0")); ASSERT_OK(SetEnvVar("OMP_THREAD_LIMIT", "0")); - ASSERT_EQ(ThreadPool::DefaultCapacity(), hw_capacity); + ASSERT_LE(ThreadPool::DefaultCapacity(), hw_capacity); + ASSERT_GE(ThreadPool::DefaultCapacity(), 1); + ASSERT_OK(SetEnvVar("OMP_NUM_THREADS", "zzz")); ASSERT_OK(SetEnvVar("OMP_THREAD_LIMIT", "x")); - 
ASSERT_EQ(ThreadPool::DefaultCapacity(), hw_capacity); + ASSERT_LE(ThreadPool::DefaultCapacity(), hw_capacity); + ASSERT_GE(ThreadPool::DefaultCapacity(), 1); + ASSERT_OK(SetEnvVar("OMP_THREAD_LIMIT", "-1")); ASSERT_OK(SetEnvVar("OMP_NUM_THREADS", "99999999999999999999999999")); - ASSERT_EQ(ThreadPool::DefaultCapacity(), hw_capacity); + ASSERT_LE(ThreadPool::DefaultCapacity(), hw_capacity); + ASSERT_GE(ThreadPool::DefaultCapacity(), 1); ASSERT_OK(DelEnvVar("OMP_NUM_THREADS")); ASSERT_OK(DelEnvVar("OMP_THREAD_LIMIT")); diff --git a/cpp/src/arrow/util/tracing.cc b/cpp/src/arrow/util/tracing.cc index f4c18f1236e..18257eced72 100644 --- a/cpp/src/arrow/util/tracing.cc +++ b/cpp/src/arrow/util/tracing.cc @@ -37,7 +37,9 @@ bool Span::valid() const { return static_cast<::arrow::internal::tracing::SpanImpl*>(details.get())->valid(); } -void Span::reset() { details.reset(); } +void Span::reset() { + static_cast<::arrow::internal::tracing::SpanImpl*>(details.get())->reset(); +} #else diff --git a/cpp/src/arrow/util/tracing_internal.cc b/cpp/src/arrow/util/tracing_internal.cc index e47acf42bcc..5e28bdb461d 100644 --- a/cpp/src/arrow/util/tracing_internal.cc +++ b/cpp/src/arrow/util/tracing_internal.cc @@ -97,6 +97,11 @@ class OtlpOStreamExporter final : public sdktrace::SpanExporter { std::chrono::microseconds(0)) noexcept override { return exporter_.Shutdown(timeout); } + // XXX: OTel 1.19 silent breaking change: this must be overridden + bool ForceFlush(std::chrono::microseconds /*timeout*/) noexcept override { + (*out_).flush(); + return true; + } private: std::basic_ostream* out_; diff --git a/cpp/src/arrow/util/tracing_internal.h b/cpp/src/arrow/util/tracing_internal.h index 6ed731599a9..8e20e657095 100644 --- a/cpp/src/arrow/util/tracing_internal.h +++ b/cpp/src/arrow/util/tracing_internal.h @@ -110,6 +110,7 @@ class SpanImpl : public ::arrow::util::tracing::SpanDetails { public: ~SpanImpl() override = default; bool valid() const { return ot_span != nullptr; } + void 
reset() { ot_span = nullptr; } opentelemetry::nostd::shared_ptr ot_span; }; @@ -149,14 +150,13 @@ opentelemetry::trace::StartSpanOptions SpanOptionsWithParent( target_span.details.get(), \ ::arrow::internal::tracing::GetTracer()->StartSpan(__VA_ARGS__)))) -# define START_SCOPED_SPAN_SV(target_span, name, ...) \ - ::arrow::internal::tracing::Scope( \ - ::arrow::internal::tracing::GetTracer()->WithActiveSpan( \ - ::arrow::internal::tracing::RewrapSpan( \ - target_span.details.get(), \ - ::arrow::internal::tracing::GetTracer()->StartSpan( \ - ::opentelemetry::nostd::string_view(name.data(), name.size()), \ - ##__VA_ARGS__)))) +# define START_SCOPED_SPAN_SV(target_span, name) \ + ::arrow::internal::tracing::Scope( \ + ::arrow::internal::tracing::GetTracer()->WithActiveSpan( \ + ::arrow::internal::tracing::RewrapSpan( \ + target_span.details.get(), \ + ::arrow::internal::tracing::GetTracer()->StartSpan( \ + ::opentelemetry::nostd::string_view(name.data(), name.size()))))) # define START_SCOPED_SPAN_WITH_PARENT_SV(target_span, parent_span, name, ...) \ ::arrow::internal::tracing::Scope( \ @@ -226,7 +226,7 @@ struct Scope { # define START_SPAN(target_span, ...) # define START_SCOPED_SPAN(target_span, ...) ::arrow::internal::tracing::Scope() -# define START_SCOPED_SPAN_SV(target_span, name, ...) ::arrow::internal::tracing::Scope() +# define START_SCOPED_SPAN_SV(target_span, name) ::arrow::internal::tracing::Scope() # define START_COMPUTE_SPAN(target_span, ...) # define ACTIVATE_SPAN(target_span) ::arrow::internal::tracing::Scope() # define MARK_SPAN(target_span, status) diff --git a/cpp/src/arrow/util/tracing_test.cc b/cpp/src/arrow/util/tracing_test.cc index 08d737ddfd5..b4f67b42d34 100644 --- a/cpp/src/arrow/util/tracing_test.cc +++ b/cpp/src/arrow/util/tracing_test.cc @@ -46,6 +46,33 @@ TEST(Tracing, OtLifetime) { })); } +// This test checks that the Span valid invariant is maintained: +// 1. Span is invalid before START_SPAN +// 2. Span is valid after START_SPAN +// 3. 
Span is invalid after reset +// 4. Span can be restarted after reset +TEST(Tracing, ValidInvariant) { + Span span; + + EXPECT_FALSE(span.valid()); + + START_SPAN(span, "TestSpan"); + + EXPECT_TRUE(span.valid()); + + span.reset(); + + EXPECT_FALSE(span.valid()); + + span.reset(); + + EXPECT_FALSE(span.valid()); + { + START_SPAN(span, "TestSpan2"); + EXPECT_TRUE(span.valid()); + } +} + #endif } // namespace tracing diff --git a/cpp/src/arrow/util/trie.cc b/cpp/src/arrow/util/trie.cc index 7862c86f38d..2b2a60154e6 100644 --- a/cpp/src/arrow/util/trie.cc +++ b/cpp/src/arrow/util/trie.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/util/trie.h" +#include "arrow/util/trie_internal.h" #include #include diff --git a/cpp/src/arrow/util/trie_benchmark.cc b/cpp/src/arrow/util/trie_benchmark.cc index b938f87d8d1..27fbffac4d0 100644 --- a/cpp/src/arrow/util/trie_benchmark.cc +++ b/cpp/src/arrow/util/trie_benchmark.cc @@ -23,7 +23,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" -#include "arrow/util/trie.h" +#include "arrow/util/trie_internal.h" namespace arrow { namespace internal { diff --git a/cpp/src/arrow/util/trie.h b/cpp/src/arrow/util/trie_internal.h similarity index 100% rename from cpp/src/arrow/util/trie.h rename to cpp/src/arrow/util/trie_internal.h diff --git a/cpp/src/arrow/util/trie_test.cc b/cpp/src/arrow/util/trie_test.cc index 9c6b7678a46..86d274178d6 100644 --- a/cpp/src/arrow/util/trie_test.cc +++ b/cpp/src/arrow/util/trie_test.cc @@ -26,7 +26,7 @@ #include #include "arrow/testing/gtest_util.h" -#include "arrow/util/trie.h" +#include "arrow/util/trie_internal.h" namespace arrow { namespace internal { diff --git a/cpp/src/arrow/util/type_traits.h b/cpp/src/arrow/util/type_traits.h index c1906152423..9c3b388dab2 100644 --- a/cpp/src/arrow/util/type_traits.h +++ b/cpp/src/arrow/util/type_traits.h @@ -42,5 +42,32 @@ template struct is_null_pointer : std::is_same::type> { 
}; +template +struct SizedIntImpl; + +template <> +struct SizedIntImpl<1> { + using type = int8_t; +}; + +template <> +struct SizedIntImpl<2> { + using type = int16_t; +}; + +template <> +struct SizedIntImpl<4> { + using type = int32_t; +}; + +template <> +struct SizedIntImpl<8> { + using type = int64_t; +}; + +// Map a number of bytes to a type +template +using SizedInt = typename SizedIntImpl::type; + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc index 8cecc6365a3..1a8e8066d70 100644 --- a/cpp/src/arrow/util/value_parsing.cc +++ b/cpp/src/arrow/util/value_parsing.cc @@ -47,7 +47,7 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, double* out } // Half float -bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* out) { +bool StringToFloat(const char* s, size_t length, char decimal_point, Float16* out) { ::arrow_vendored::fast_float::parse_options options{ ::arrow_vendored::fast_float::chars_format::general, decimal_point}; float temp_out; @@ -55,7 +55,7 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* o ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, temp_out, options); const bool ok = res.ec == std::errc() && res.ptr == s + length; if (ok) { - *out = Float16::FromFloat(temp_out).bits(); + *out = Float16::FromFloat(temp_out); } return ok; } diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index 609906052cd..d20c0d22b95 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -32,6 +32,7 @@ #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/config.h" +#include "arrow/util/float16.h" #include "arrow/util/macros.h" #include "arrow/util/time.h" #include "arrow/util/visibility.h" @@ -136,7 +137,8 @@ ARROW_EXPORT bool StringToFloat(const char* s, size_t length, char decimal_point, 
double* out); ARROW_EXPORT -bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* out); +bool StringToFloat(const char* s, size_t length, char decimal_point, + ::arrow::util::Float16* out); template <> struct StringConverter { @@ -168,7 +170,7 @@ struct StringConverter { template <> struct StringConverter { - using value_type = uint16_t; + using value_type = ::arrow::util::Float16; explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {} @@ -822,7 +824,7 @@ static inline bool ParseTimestampStrptime(const char* buf, size_t length, if (!ignore_time_in_day) { secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) + std::chrono::seconds(result.tm_sec)); -#ifndef _WIN32 +#if !defined(_WIN32) && !defined(_AIX) secs -= std::chrono::seconds(result.tm_gmtoff); #endif } diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index a833d266a85..a3d53ea6089 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -33,6 +33,22 @@ using util::Float16; namespace internal { +template +struct ConversionValueTrait; + +template +struct ConversionValueTrait> { + using Type = typename T::c_type; +}; + +template <> +struct ConversionValueTrait { + using Type = Float16; +}; + +template +using ConversionValueType = typename ConversionValueTrait::Type; + template void AssertValueEquals(T a, T b) { ASSERT_EQ(a, b); @@ -52,30 +68,31 @@ void AssertValueEquals(double a, double b) { template void AssertConversion(StringConverter* converter, const T& type, const std::string& s, - typename T::c_type expected) { + ConversionValueType expected) { ARROW_SCOPED_TRACE("When converting: '", s, "', expecting: ", expected); - typename T::c_type out{}; + ConversionValueType out{}; ASSERT_TRUE(converter->Convert(type, s.data(), s.length(), &out)); AssertValueEquals(out, expected); } template void AssertConversion(StringConverter* converter, 
const std::string& s, - typename T::c_type expected) { + ConversionValueType expected) { auto type = checked_pointer_cast(TypeTraits::type_singleton()); AssertConversion(converter, *type, s, expected); } template -void AssertConversion(const T& type, const std::string& s, typename T::c_type expected) { +void AssertConversion(const T& type, const std::string& s, + ConversionValueType expected) { ARROW_SCOPED_TRACE("When converting: '", s, "', expecting: ", expected); - typename T::c_type out{}; + ConversionValueType out{}; ASSERT_TRUE(ParseValue(type, s.data(), s.length(), &out)); AssertValueEquals(out, expected); } template -void AssertConversion(const std::string& s, typename T::c_type expected) { +void AssertConversion(const std::string& s, ConversionValueType expected) { auto type = checked_pointer_cast(TypeTraits::type_singleton()); AssertConversion(*type, s, expected); } @@ -83,7 +100,7 @@ void AssertConversion(const std::string& s, typename T::c_type expected) { template void AssertConversionFails(StringConverter* converter, const T& type, const std::string& s) { - typename T::c_type out{}; + ConversionValueType out{}; ASSERT_FALSE(converter->Convert(type, s.data(), s.length(), &out)) << "Conversion should have failed for '" << s << "' (returned " << out << ")"; } @@ -96,7 +113,7 @@ void AssertConversionFails(StringConverter* converter, const std::string& s) template void AssertConversionFails(const T& type, const std::string& s) { - typename T::c_type out{}; + ConversionValueType out{}; ASSERT_FALSE(ParseValue(type, s.data(), s.length(), &out)) << "Conversion should have failed for '" << s << "' (returned " << out << ")"; } @@ -157,21 +174,21 @@ TEST(StringConversion, ToDouble) { } TEST(StringConversion, ToHalfFloat) { - AssertConversion("1.5", Float16(1.5f).bits()); - AssertConversion("0", Float16(0.0f).bits()); - AssertConversion("-0.0", Float16(-0.0f).bits()); - AssertConversion("-1e15", Float16(-1e15).bits()); - AssertConversion("+Infinity", 0x7c00); - 
AssertConversion("-Infinity", 0xfc00); - AssertConversion("Infinity", 0x7c00); + AssertConversion("1.5", Float16(1.5f)); + AssertConversion("0", Float16(0.0f)); + AssertConversion("-0.0", Float16(-0.0f)); + AssertConversion("-1e15", Float16(-1e15)); + AssertConversion("+Infinity", Float16::FromBits(0x7c00)); + AssertConversion("-Infinity", Float16::FromBits(0xfc00)); + AssertConversion("Infinity", Float16::FromBits(0x7c00)); AssertConversionFails(""); AssertConversionFails("e"); AssertConversionFails("1,5"); StringConverter converter(/*decimal_point=*/','); - AssertConversion(&converter, "1,5", Float16(1.5f).bits()); - AssertConversion(&converter, "0", Float16(0.0f).bits()); + AssertConversion(&converter, "1,5", Float16(1.5f)); + AssertConversion(&converter, "0", Float16(0.0f)); AssertConversionFails(&converter, "1.5"); } @@ -207,11 +224,11 @@ TEST(StringConversion, ToHalfFloatLocale) { // French locale uses the comma as decimal point LocaleGuard locale_guard("fr_FR.UTF-8"); - AssertConversion("1.5", Float16(1.5).bits()); + AssertConversion("1.5", Float16(1.5)); AssertConversionFails("1,5"); StringConverter converter(/*decimal_point=*/'#'); - AssertConversion(&converter, "1#5", Float16(1.5).bits()); + AssertConversion(&converter, "1#5", Float16(1.5)); AssertConversionFails(&converter, "1.5"); AssertConversionFails(&converter, "1,5"); } diff --git a/cpp/src/arrow/util/visibility.h b/cpp/src/arrow/util/visibility.h index 9a53cdbdeff..67988071c9d 100644 --- a/cpp/src/arrow/util/visibility.h +++ b/cpp/src/arrow/util/visibility.h @@ -36,7 +36,7 @@ # define ARROW_DLLIMPORT __declspec(dllimport) # endif -// _declspec(dllexport) even when the #included by a non-arrow source +// _declspec(dllexport) even when #included by a non-arrow source # define ARROW_FORCE_EXPORT ARROW_DLLEXPORT # ifdef ARROW_STATIC @@ -67,6 +67,15 @@ # ifndef ARROW_NO_EXPORT # define ARROW_NO_EXPORT [[gnu::visibility("hidden")]] # endif +// The C++ language does not have clear rules for how to export 
explicit template +// instantiations, and clang/gcc have differing syntax. See +// https://github.com/llvm/llvm-project/issues/29464 and +// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html +# if defined(__clang__) +# define ARROW_TEMPLATE_EXPORT +# else +# define ARROW_TEMPLATE_EXPORT ARROW_EXPORT +# endif # else // Not C++, or not gcc/clang # ifndef ARROW_EXPORT @@ -75,10 +84,10 @@ # ifndef ARROW_NO_EXPORT # define ARROW_NO_EXPORT # endif +# define ARROW_TEMPLATE_EXPORT # endif # define ARROW_FRIEND_EXPORT -# define ARROW_TEMPLATE_EXPORT // [[gnu::visibility("default")]] even when #included by a non-arrow source # define ARROW_FORCE_EXPORT [[gnu::visibility("default")]] diff --git a/cpp/src/arrow/vendored/datetime/meson.build b/cpp/src/arrow/vendored/datetime/meson.build new file mode 100644 index 00000000000..9ea9278ed41 --- /dev/null +++ b/cpp/src/arrow/vendored/datetime/meson.build @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +install_headers( + ['date.h', 'ios.h', 'tz.h', 'tz_private.h', 'visibility.h'], + subdir: 'arrow/vendored/datetime', +) diff --git a/cpp/src/arrow/vendored/double-conversion/meson.build b/cpp/src/arrow/vendored/double-conversion/meson.build new file mode 100644 index 00000000000..182a7d6a2d4 --- /dev/null +++ b/cpp/src/arrow/vendored/double-conversion/meson.build @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers( + [ + 'bignum-dtoa.h', + 'bignum.h', + 'cached-powers.h', + 'diy-fp.h', + 'double-conversion.h', + 'double-to-string.h', + 'fast-dtoa.h', + 'fixed-dtoa.h', + 'ieee.h', + 'string-to-double.h', + 'strtod.h', + 'utils.h', + ], + subdir: 'arrow/vendored/double-conversion', +) diff --git a/cpp/src/arrow/vendored/meson.build b/cpp/src/arrow/vendored/meson.build new file mode 100644 index 00000000000..bd23b534c70 --- /dev/null +++ b/cpp/src/arrow/vendored/meson.build @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers( + ['datetime.h', 'ProducerConsumerQueue.h', 'strptime.h', 'xxhash.h'], + subdir: 'arrow/vendored', +) + +subdir('datetime') +subdir('double-conversion') +subdir('pcg') +subdir('portable-snippets') +subdir('xxhash') diff --git a/cpp/src/arrow/vendored/pcg/meson.build b/cpp/src/arrow/vendored/pcg/meson.build new file mode 100644 index 00000000000..9db808f9d18 --- /dev/null +++ b/cpp/src/arrow/vendored/pcg/meson.build @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +install_headers( + ['pcg_extras.hpp', 'pcg_random.hpp', 'pcg_uint128.hpp'], + subdir: 'arrow/vendored/pcg', +) diff --git a/cpp/src/arrow/vendored/portable-snippets/meson.build b/cpp/src/arrow/vendored/portable-snippets/meson.build new file mode 100644 index 00000000000..5107f66cd7f --- /dev/null +++ b/cpp/src/arrow/vendored/portable-snippets/meson.build @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers( + ['debug-trap.h', 'safe-math.h'], + subdir: 'arrow/vendored/portable-snippets', +) diff --git a/cpp/src/arrow/vendored/whereami/whereami.cc b/cpp/src/arrow/vendored/whereami/whereami.cc new file mode 100644 index 00000000000..945226193f9 --- /dev/null +++ b/cpp/src/arrow/vendored/whereami/whereami.cc @@ -0,0 +1,674 @@ +// (‑●‑●)> dual licensed under the WTFPL v2 and MIT licenses +// without any warranty. 
+// by Gregory Pakosz (@gpakosz) +// https://github.com/gpakosz/whereami +// Copyright 2024 Gregory Pakosz + +// in case you want to #include "whereami.c" in a larger compilation unit +#if !defined(WHEREAMI_H) +# include "whereami.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__linux__) || defined(__CYGWIN__) +# undef _DEFAULT_SOURCE +# define _DEFAULT_SOURCE +#elif defined(__APPLE__) +# undef _DARWIN_C_SOURCE +# define _DARWIN_C_SOURCE +# define _DARWIN_BETTER_REALPATH +#endif + +#if !defined(WAI_MALLOC) || !defined(WAI_FREE) || !defined(WAI_REALLOC) +# include +#endif + +#if !defined(WAI_MALLOC) +# define WAI_MALLOC(size) malloc(size) +#endif + +#if !defined(WAI_FREE) +# define WAI_FREE(p) free(p) +#endif + +#if !defined(WAI_REALLOC) +# define WAI_REALLOC(p, size) realloc(p, size) +#endif + +#ifndef WAI_NOINLINE +# if defined(_MSC_VER) +# define WAI_NOINLINE __declspec(noinline) +# elif defined(__GNUC__) +# define WAI_NOINLINE __attribute__((noinline)) +# else +# error unsupported compiler +# endif +#endif + +#if defined(_MSC_VER) +# define WAI_RETURN_ADDRESS() _ReturnAddress() +#elif defined(__GNUC__) +# define WAI_RETURN_ADDRESS() __builtin_extract_return_addr(__builtin_return_address(0)) +#else +# error unsupported compiler +#endif + +#if defined(_WIN32) + +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# if defined(_MSC_VER) +# pragma warning(push, 3) +# endif +# include +# include +# if defined(_MSC_VER) +# pragma warning(pop) +# endif +# include + +static int WAI_PREFIX(getModulePath_)(HMODULE module, char* out, int capacity, + int* dirname_length) { + wchar_t buffer1[MAX_PATH]; + wchar_t buffer2[MAX_PATH]; + wchar_t* path = NULL; + int length = -1; + bool ok; + + for (ok = false; !ok; ok = true) { + DWORD size; + int length_, length__; + + size = GetModuleFileNameW(module, buffer1, sizeof(buffer1) / sizeof(buffer1[0])); + + if (size == 0) { + break; + } else if (size == (DWORD)(sizeof(buffer1) / 
sizeof(buffer1[0]))) { + DWORD size_ = size; + do { + wchar_t* path_; + + path_ = (wchar_t*)WAI_REALLOC(path, sizeof(wchar_t) * size_ * 2); + if (!path_) break; + size_ *= 2; + path = path_; + size = GetModuleFileNameW(module, path, size_); + } while (size == size_); + + if (size == size_) break; + } else { + path = buffer1; + } + + if (!_wfullpath(buffer2, path, MAX_PATH)) break; + length_ = (int)wcslen(buffer2); + length__ = + WideCharToMultiByte(CP_UTF8, 0, buffer2, length_, out, capacity, NULL, NULL); + + if (length__ == 0) + length__ = WideCharToMultiByte(CP_UTF8, 0, buffer2, length_, NULL, 0, NULL, NULL); + if (length__ == 0) break; + + if (length__ <= capacity && dirname_length) { + int i; + + for (i = length__ - 1; i >= 0; --i) { + if (out[i] == '\\') { + *dirname_length = i; + break; + } + } + } + + length = length__; + } + + if (path != buffer1) WAI_FREE(path); + + return ok ? length : -1; +} + +WAI_NOINLINE WAI_FUNCSPEC int WAI_PREFIX(getExecutablePath)(char* out, int capacity, + int* dirname_length) { + return WAI_PREFIX(getModulePath_)(NULL, out, capacity, dirname_length); +} + +WAI_NOINLINE WAI_FUNCSPEC int WAI_PREFIX(getModulePath)(char* out, int capacity, + int* dirname_length) { + HMODULE module; + int length = -1; + +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4054) +# endif + if (GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | + GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + (LPCTSTR)WAI_RETURN_ADDRESS(), &module)) +# if defined(_MSC_VER) +# pragma warning(pop) +# endif + { + length = WAI_PREFIX(getModulePath_)(module, out, capacity, dirname_length); + } + + return length; +} + +#elif defined(__linux__) || defined(__CYGWIN__) || defined(__sun) || \ + defined(WAI_USE_PROC_SELF_EXE) + +# include +# include +# if defined(__linux__) +# include +# else +# include +# endif +# ifndef __STDC_FORMAT_MACROS +# define __STDC_FORMAT_MACROS +# endif +# include + +# if !defined(WAI_PROC_SELF_EXE) +# if defined(__sun) 
+# define WAI_PROC_SELF_EXE "/proc/self/path/a.out" +# else +# define WAI_PROC_SELF_EXE "/proc/self/exe" +# endif +# endif + +WAI_FUNCSPEC +int WAI_PREFIX(getExecutablePath)(char* out, int capacity, int* dirname_length) { + char buffer[PATH_MAX]; + char* resolved = NULL; + int length = -1; + bool ok; + + for (ok = false; !ok; ok = true) { + resolved = realpath(WAI_PROC_SELF_EXE, buffer); + if (!resolved) break; + + length = (int)strlen(resolved); + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) { + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + } + + return ok ? length : -1; +} + +# if !defined(WAI_PROC_SELF_MAPS_RETRY) +# define WAI_PROC_SELF_MAPS_RETRY 5 +# endif + +# if !defined(WAI_PROC_SELF_MAPS) +# if defined(__sun) +# define WAI_PROC_SELF_MAPS "/proc/self/map" +# else +# define WAI_PROC_SELF_MAPS "/proc/self/maps" +# endif +# endif + +# if defined(__ANDROID__) || defined(ANDROID) +# include +# include +# include +# endif + +WAI_NOINLINE WAI_FUNCSPEC int WAI_PREFIX(getModulePath)(char* out, int capacity, + int* dirname_length) { + int length = -1; + FILE* maps = NULL; + + for (int r = 0; r < WAI_PROC_SELF_MAPS_RETRY; ++r) { + maps = fopen(WAI_PROC_SELF_MAPS, "r"); + if (!maps) break; + + for (;;) { + char buffer[PATH_MAX < 1024 ? 
1024 : PATH_MAX]; + uint64_t low, high; + char perms[5]; + uint64_t offset; + uint32_t major, minor; + char path[PATH_MAX]; + uint32_t inode; + + if (!fgets(buffer, sizeof(buffer), maps)) break; + + if (sscanf(buffer, "%" PRIx64 "-%" PRIx64 " %s %" PRIx64 " %x:%x %u %s\n", &low, + &high, perms, &offset, &major, &minor, &inode, path) == 8) { + uint64_t addr = (uintptr_t)WAI_RETURN_ADDRESS(); + if (low <= addr && addr <= high) { + char* resolved; + + resolved = realpath(path, buffer); + if (!resolved) break; + + length = (int)strlen(resolved); +# if defined(__ANDROID__) || defined(ANDROID) + if (length > 4 && buffer[length - 1] == 'k' && buffer[length - 2] == 'p' && + buffer[length - 3] == 'a' && buffer[length - 4] == '.') { + int fd = open(path, O_RDONLY); + if (fd == -1) { + length = -1; // retry + break; + } + + char* begin = (char*)mmap(0, offset, PROT_READ, MAP_SHARED, fd, 0); + if (begin == MAP_FAILED) { + close(fd); + length = -1; // retry + break; + } + + char* p = begin + offset - 30; // minimum size of local file header + while (p >= begin) { // scan backwards + if (*((uint32_t*)p) == 0x04034b50UL) { // local file header signature found + uint16_t length_ = *((uint16_t*)(p + 26)); + + if (length + 2 + length_ < (int)sizeof(buffer)) { + memcpy(&buffer[length], "!/", 2); + memcpy(&buffer[length + 2], p + 30, length_); + length += 2 + length_; + } + + break; + } + + --p; + } + + munmap(begin, offset); + close(fd); + } +# endif + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) { + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + + break; + } + } + } + + fclose(maps); + maps = NULL; + + if (length != -1) break; + } + + return length; +} + +#elif defined(__APPLE__) + +# include +# include + +WAI_FUNCSPEC +int WAI_PREFIX(getExecutablePath)(char* out, int capacity, int* dirname_length) { + char buffer1[PATH_MAX]; + char buffer2[PATH_MAX]; + char* path = 
buffer1; + char* resolved = NULL; + int length = -1; + bool ok; + + for (ok = false; !ok; ok = true) { + uint32_t size = (uint32_t)sizeof(buffer1); + if (_NSGetExecutablePath(path, &size) == -1) { + path = (char*)WAI_MALLOC(size); + if (!_NSGetExecutablePath(path, &size)) break; + } + + resolved = realpath(path, buffer2); + if (!resolved) break; + + length = (int)strlen(resolved); + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) { + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + } + + if (path != buffer1) WAI_FREE(path); + + return ok ? length : -1; +} + +WAI_NOINLINE WAI_FUNCSPEC int WAI_PREFIX(getModulePath)(char* out, int capacity, + int* dirname_length) { + char buffer[PATH_MAX]; + char* resolved = NULL; + int length = -1; + + for (;;) { + Dl_info info; + + if (dladdr(WAI_RETURN_ADDRESS(), &info)) { + resolved = realpath(info.dli_fname, buffer); + if (!resolved) break; + + length = (int)strlen(resolved); + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) { + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + } + + break; + } + + return length; +} + +#elif defined(__QNXNTO__) + +# include + +# if !defined(WAI_PROC_SELF_EXE) +# define WAI_PROC_SELF_EXE "/proc/self/exefile" +# endif + +WAI_FUNCSPEC +int WAI_PREFIX(getExecutablePath)(char* out, int capacity, int* dirname_length) { + char buffer1[PATH_MAX]; + char buffer2[PATH_MAX]; + char* resolved = NULL; + FILE* self_exe = NULL; + int length = -1; + bool ok; + + for (ok = false; !ok; ok = true) { + self_exe = fopen(WAI_PROC_SELF_EXE, "r"); + if (!self_exe) break; + + if (!fgets(buffer1, sizeof(buffer1), self_exe)) break; + + resolved = realpath(buffer1, buffer2); + if (!resolved) break; + + length = (int)strlen(resolved); + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) 
{ + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + } + + fclose(self_exe); + + return ok ? length : -1; +} + +WAI_FUNCSPEC +int WAI_PREFIX(getModulePath)(char* out, int capacity, int* dirname_length) { + char buffer[PATH_MAX]; + char* resolved = NULL; + int length = -1; + + for (;;) { + Dl_info info; + + if (dladdr(WAI_RETURN_ADDRESS(), &info)) { + resolved = realpath(info.dli_fname, buffer); + if (!resolved) break; + + length = (int)strlen(resolved); + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) { + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + } + + break; + } + + return length; +} + +#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || \ + defined(__NetBSD__) || defined(__OpenBSD__) + +# include +# include + +# if defined(__OpenBSD__) + +WAI_FUNCSPEC +int WAI_PREFIX(getExecutablePath)(char* out, int capacity, int* dirname_length) { + char buffer1[4096]; + char buffer2[PATH_MAX]; + char buffer3[PATH_MAX]; + char** argv = (char**)buffer1; + char* resolved = NULL; + int length = -1; + bool ok; + + for (ok = false; !ok; ok = true) { + int mib[4] = {CTL_KERN, KERN_PROC_ARGS, getpid(), KERN_PROC_ARGV}; + size_t size; + + if (sysctl(mib, 4, NULL, &size, NULL, 0) != 0) break; + + if (size > sizeof(buffer1)) { + argv = (char**)WAI_MALLOC(size); + if (!argv) break; + } + + if (sysctl(mib, 4, argv, &size, NULL, 0) != 0) break; + + if (strchr(argv[0], '/')) { + resolved = realpath(argv[0], buffer2); + if (!resolved) break; + } else { + const char* PATH = getenv("PATH"); + if (!PATH) break; + + size_t argv0_length = strlen(argv[0]); + + const char* begin = PATH; + while (1) { + const char* separator = strchr(begin, ':'); + const char* end = separator ? 
separator : begin + strlen(begin); + + if (end - begin > 0) { + if (*(end - 1) == '/') --end; + + if (((end - begin) + 1 + argv0_length + 1) <= sizeof(buffer2)) { + memcpy(buffer2, begin, end - begin); + buffer2[end - begin] = '/'; + memcpy(buffer2 + (end - begin) + 1, argv[0], argv0_length + 1); + + resolved = realpath(buffer2, buffer3); + if (resolved) break; + } + } + + if (!separator) break; + + begin = ++separator; + } + + if (!resolved) break; + } + + length = (int)strlen(resolved); + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) { + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + } + + if (argv != (char**)buffer1) WAI_FREE(argv); + + return ok ? length : -1; +} + +# else + +WAI_FUNCSPEC +int WAI_PREFIX(getExecutablePath)(char* out, int capacity, int* dirname_length) { + char buffer1[PATH_MAX]; + char buffer2[PATH_MAX]; + char* path = buffer1; + char* resolved = NULL; + int length = -1; + bool ok; + + for (ok = false; !ok; ok = true) { +# if defined(__NetBSD__) + int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME}; +# else + int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; +# endif + size_t size = sizeof(buffer1); + + if (sysctl(mib, 4, path, &size, NULL, 0) != 0) break; + + resolved = realpath(path, buffer2); + if (!resolved) break; + + length = (int)strlen(resolved); + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) { + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + } + + return ok ? 
length : -1; +} + +# endif + +WAI_NOINLINE WAI_FUNCSPEC int WAI_PREFIX(getModulePath)(char* out, int capacity, + int* dirname_length) { + char buffer[PATH_MAX]; + char* resolved = NULL; + int length = -1; + + for (;;) { + Dl_info info; + + if (dladdr(WAI_RETURN_ADDRESS(), &info)) { + resolved = realpath(info.dli_fname, buffer); + if (!resolved) break; + + length = (int)strlen(resolved); + if (length <= capacity) { + memcpy(out, resolved, length); + + if (dirname_length) { + int i; + + for (i = length - 1; i >= 0; --i) { + if (out[i] == '/') { + *dirname_length = i; + break; + } + } + } + } + } + + break; + } + + return length; +} + +#else + +# error unsupported platform + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/cpp/src/arrow/vendored/whereami/whereami.h b/cpp/src/arrow/vendored/whereami/whereami.h new file mode 100644 index 00000000000..abb137bbefa --- /dev/null +++ b/cpp/src/arrow/vendored/whereami/whereami.h @@ -0,0 +1,68 @@ +// (‑●‑●)> dual licensed under the WTFPL v2 and MIT licenses +// without any warranty. +// by Gregory Pakosz (@gpakosz) +// https://github.com/gpakosz/whereami +// Copyright 2024 Gregory Pakosz + +#ifndef WHEREAMI_H +#define WHEREAMI_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef WAI_FUNCSPEC +# define WAI_FUNCSPEC +#endif +#ifndef WAI_PREFIX +# define WAI_PREFIX(function) wai_##function +#endif + +/** + * Returns the path to the current executable. + * + * Usage: + * - first call `int length = wai_getExecutablePath(NULL, 0, NULL);` to + * retrieve the length of the path + * - allocate the destination buffer with `path = (char*)malloc(length + 1);` + * - call `wai_getExecutablePath(path, length, NULL)` again to retrieve the + * path + * - add a terminal NUL character with `path[length] = '\0';` + * + * @param out destination buffer, optional + * @param capacity destination buffer capacity + * @param dirname_length optional recipient for the length of the dirname part + * of the path. 
+ * + * @return the length of the executable path on success (without a terminal NUL + * character), otherwise `-1` + */ +WAI_FUNCSPEC +int WAI_PREFIX(getExecutablePath)(char* out, int capacity, int* dirname_length); + +/** + * Returns the path to the current module + * + * Usage: + * - first call `int length = wai_getModulePath(NULL, 0, NULL);` to retrieve + * the length of the path + * - allocate the destination buffer with `path = (char*)malloc(length + 1);` + * - call `wai_getModulePath(path, length, NULL)` again to retrieve the path + * - add a terminal NUL character with `path[length] = '\0';` + * + * @param out destination buffer, optional + * @param capacity destination buffer capacity + * @param dirname_length optional recipient for the length of the dirname part + * of the path. + * + * @return the length of the module path on success (without a terminal NUL + * character), otherwise `-1` + */ +WAI_FUNCSPEC +int WAI_PREFIX(getModulePath)(char* out, int capacity, int* dirname_length); + +#ifdef __cplusplus +} +#endif + +#endif // #ifndef WHEREAMI_H diff --git a/cpp/src/arrow/vendored/xxhash/README.md b/cpp/src/arrow/vendored/xxhash/README.md index 6872c6652ed..464c638016f 100644 --- a/cpp/src/arrow/vendored/xxhash/README.md +++ b/cpp/src/arrow/vendored/xxhash/README.md @@ -17,5 +17,5 @@ under the License. --> -The files in this directory are vendored from xxHash git tag v0.8.2 +The files in this directory are vendored from xxHash git tag v0.8.3 (https://github.com/Cyan4973/xxHash). diff --git a/cpp/src/arrow/vendored/xxhash/meson.build b/cpp/src/arrow/vendored/xxhash/meson.build new file mode 100644 index 00000000000..5463bee747a --- /dev/null +++ b/cpp/src/arrow/vendored/xxhash/meson.build @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers(['xxhash.h'], subdir: 'arrow/vendored/xxhash') diff --git a/cpp/src/arrow/vendored/xxhash/xxhash.c b/cpp/src/arrow/vendored/xxhash/xxhash.c index 083b039d70d..e60cc37f13c 100644 --- a/cpp/src/arrow/vendored/xxhash/xxhash.c +++ b/cpp/src/arrow/vendored/xxhash/xxhash.c @@ -1,6 +1,6 @@ /* * xxHash - Extremely Fast Hash algorithm - * Copyright (C) 2012-2021 Yann Collet + * Copyright (C) 2012-2023 Yann Collet * * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) * @@ -32,12 +32,11 @@ * - xxHash source repository: https://github.com/Cyan4973/xxHash */ - /* * xxhash.c instantiates functions defined in xxhash.h */ -#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ -#define XXH_IMPLEMENTATION /* access definitions */ +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ #include "xxhash.h" diff --git a/cpp/src/arrow/vendored/xxhash/xxhash.h b/cpp/src/arrow/vendored/xxhash/xxhash.h index a18e8c762da..9c819f93b79 100644 --- a/cpp/src/arrow/vendored/xxhash/xxhash.h +++ b/cpp/src/arrow/vendored/xxhash/xxhash.h @@ -1,7 +1,7 @@ /* * xxHash - Extremely Fast Hash algorithm * Header File - * Copyright (C) 2012-2021 Yann Collet + * Copyright (C) 2012-2023 Yann Collet * * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) * @@ -130,6 +130,7 @@ * } * @endcode * + * * 
@anchor streaming_example * **Streaming** * @@ -165,11 +166,82 @@ * } * @endcode * + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * + * @anchor canonical_representation_example + * **Canonical Representation** + * + * The default return values from XXH functions are unsigned 32, 64 and 128 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. 
+ * + * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(), + * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(), + * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(), + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which prints XXH32_hash_t in human readable format + * void printXxh32(XXH32_hash_t hash) + * { + * XXH32_canonical_t cano; + * XXH32_canonicalFromHash(&cano, hash); + * size_t i; + * for(i = 0; i < sizeof(cano.digest); ++i) { + * printf("%02x", cano.digest[i]); + * } + * printf("\n"); + * } + * + * // Example for a function which converts XXH32_canonical_t to XXH32_hash_t + * XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano) + * { + * XXH32_hash_t hash = XXH32_hashFromCanonical(&cano); + * return hash; + * } + * @endcode + * + * * @file xxhash.h * xxHash prototypes and implementation */ -#if defined (__cplusplus) +#if defined(__cplusplus) && !defined(XXH_NO_EXTERNC_GUARD) extern "C" { #endif @@ -261,7 +333,7 @@ extern "C" { /* make all functions private */ # undef XXH_PUBLIC_API # if defined(__GNUC__) -# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# define XXH_PUBLIC_API static __inline __attribute__((__unused__)) # elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) # define XXH_PUBLIC_API static inline # elif defined(_MSC_VER) @@ -373,7 +445,7 @@ extern "C" { /*! @brief Marks a global symbol. 
*/ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) -# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) # ifdef XXH_EXPORT # define XXH_PUBLIC_API __declspec(dllexport) # elif XXH_IMPORT @@ -449,7 +521,7 @@ extern "C" { /* specific declaration modes for Windows */ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) -# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) # ifdef XXH_EXPORT # define XXH_PUBLIC_API __declspec(dllexport) # elif XXH_IMPORT @@ -461,9 +533,9 @@ extern "C" { #endif #if defined (__GNUC__) -# define XXH_CONSTF __attribute__((const)) -# define XXH_PUREF __attribute__((pure)) -# define XXH_MALLOCF __attribute__((malloc)) +# define XXH_CONSTF __attribute__((__const__)) +# define XXH_PUREF __attribute__((__pure__)) +# define XXH_MALLOCF __attribute__((__malloc__)) #else # define XXH_CONSTF /* disable */ # define XXH_PUREF @@ -475,7 +547,7 @@ extern "C" { ***************************************/ #define XXH_VERSION_MAJOR 0 #define XXH_VERSION_MINOR 8 -#define XXH_VERSION_RELEASE 2 +#define XXH_VERSION_RELEASE 3 /*! @brief Version number, encoded as two digits each */ #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) @@ -517,7 +589,11 @@ typedef uint32_t XXH32_hash_t; #elif !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include +# ifdef _AIX +# include +# else +# include +# endif typedef uint32_t XXH32_hash_t; #else @@ -551,10 +627,6 @@ typedef uint32_t XXH32_hash_t; /*! * @brief Calculates the 32-bit hash of @p input using xxHash32. 
* - * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s - * - * See @ref single_shot_example "Single Shot Example" for an example. - * * @param input The block of data to be hashed, at least @p length bytes in size. * @param length The length of @p input, in bytes. * @param seed The 32-bit seed to alter the hash's output predictably. @@ -564,63 +636,44 @@ typedef uint32_t XXH32_hash_t; * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. * - * @return The calculated 32-bit hash value. + * @return The calculated 32-bit xxHash32 value. * - * @see - * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): - * Direct equivalents for the other variants of xxHash. - * @see - * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. + * @see @ref single_shot_example "Single Shot Example" for an example. */ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); #ifndef XXH_NO_STREAM -/*! - * Streaming functions generate the xxHash value from an incremental input. - * This method is slower than single-call functions, due to state management. - * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. - * - * An XXH state must first be allocated using `XXH*_createState()`. - * - * Start a new hash by initializing the state with a seed using `XXH*_reset()`. - * - * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. - * - * The function returns an error code, with 0 meaning OK, and any other value - * meaning there is an error. - * - * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. - * This function returns the nn-bits hash as an int or long long. - * - * It's still possible to continue inserting input into the hash state after a - * digest, and generate new hash values later on by invoking `XXH*_digest()`. 
- * - * When done, release the state using `XXH*_freeState()`. - * - * @see streaming_example at the top of @ref xxhash.h for an example. - */ - /*! * @typedef struct XXH32_state_s XXH32_state_t * @brief The opaque state struct for the XXH32 streaming API. * * @see XXH32_state_s for details. + * @see @ref streaming_example "Streaming Example" */ typedef struct XXH32_state_s XXH32_state_t; /*! * @brief Allocates an @ref XXH32_state_t. * - * Must be freed with XXH32_freeState(). - * @return An allocated XXH32_state_t on success, `NULL` on failure. + * @return An allocated pointer of @ref XXH32_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH32_freeState(). + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); /*! * @brief Frees an @ref XXH32_state_t. * - * Must be allocated with XXH32_createState(). * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). - * @return XXH_OK. + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH32_createState(). + * + * @see @ref streaming_example "Streaming Example" + * */ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); /*! @@ -636,23 +689,24 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_ /*! * @brief Resets an @ref XXH32_state_t to begin a new hash. * - * This function resets and seeds a state. Call it before @ref XXH32_update(). - * * @param statePtr The state struct to reset. * @param seed The 32-bit seed to alter the hash result predictably. * * @pre * @p statePtr must not be `NULL`. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH32_update(). 
+ * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); /*! * @brief Consumes a block of @p input to an @ref XXH32_state_t. * - * Call this to incrementally consume blocks of data. - * * @param statePtr The state struct to update. * @param input The block of data to be hashed, at least @p length bytes in size. * @param length The length of @p input, in bytes. @@ -664,48 +718,36 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); /*! * @brief Returns the calculated hash value from an @ref XXH32_state_t. * - * @note - * Calling XXH32_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * * @param statePtr The state struct to calculate the hash from. * * @pre * @p statePtr must not be `NULL`. * - * @return The calculated xxHash32 value from that state. + * @return The calculated 32-bit xxHash32 value from that state. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); #endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ -/* - * The default return values from XXH functions are unsigned 32 and 64 bit - * integers. - * This the simplest and fastest format for further post-processing. 
- * - * However, this leaves open the question of what is the order on the byte level, - * since little and big endian conventions will store the same number differently. - * - * The canonical representation settles this issue by mandating big-endian - * convention, the same convention as human-readable numbers (large digits first). - * - * When writing hash values to storage, sending them over a network, or printing - * them, it's highly recommended to use the canonical representation to ensure - * portability across a wider range of systems, present and future. - * - * The following functions allow transformation of hash values to and from - * canonical format. - */ - /*! * @brief Canonical (big endian) representation of @ref XXH32_hash_t. */ @@ -716,11 +758,13 @@ typedef struct { /*! * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. * - * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param dst The @ref XXH32_canonical_t pointer to be stored to. * @param hash The @ref XXH32_hash_t to be converted. * * @pre * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" */ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); @@ -733,6 +777,8 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t * @p src must not be `NULL`. * * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" */ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); @@ -745,18 +791,9 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni #endif /*! @endcond */ -/*! @cond Doxygen ignores this part */ -/* - * C23 __STDC_VERSION__ number hasn't been specified yet. For now - * leave as `201711L` (C17 + 1). - * TODO: Update to correct value when its been specified. - */ -#define XXH_C23_VN 201711L -/*! 
@endcond */ - /*! @cond Doxygen ignores this part */ /* C-language Attributes are added in C23. */ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L) && defined(__has_c_attribute) # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) #else # define XXH_HAS_C_ATTRIBUTE(x) 0 @@ -794,7 +831,7 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni * As of writing this, only supported by clang. */ #if XXH_HAS_ATTRIBUTE(noescape) -# define XXH_NOESCAPE __attribute__((noescape)) +# define XXH_NOESCAPE __attribute__((__noescape__)) #else # define XXH_NOESCAPE #endif @@ -821,7 +858,11 @@ typedef uint64_t XXH64_hash_t; #elif !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include +# ifdef _AIX +# include +# else +# include +# endif typedef uint64_t XXH64_hash_t; #else # include @@ -851,9 +892,6 @@ typedef uint64_t XXH64_hash_t; /*! * @brief Calculates the 64-bit hash of @p input using xxHash64. * - * This function usually runs faster on 64-bit systems, but slower on 32-bit - * systems (see benchmark). - * * @param input The block of data to be hashed, at least @p length bytes in size. * @param length The length of @p input, in bytes. * @param seed The 64-bit seed to alter the hash's output predictably. @@ -863,13 +901,9 @@ typedef uint64_t XXH64_hash_t; * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. * - * @return The calculated 64-bit hash. + * @return The calculated 64-bit xxHash64 value. * - * @see - * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): - * Direct equivalents for the other variants of xxHash. - * @see - * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. 
+ * @see @ref single_shot_example "Single Shot Example" for an example. */ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); @@ -879,23 +913,32 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size * @brief The opaque state struct for the XXH64 streaming API. * * @see XXH64_state_s for details. + * @see @ref streaming_example "Streaming Example" */ typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ /*! * @brief Allocates an @ref XXH64_state_t. * - * Must be freed with XXH64_freeState(). - * @return An allocated XXH64_state_t on success, `NULL` on failure. + * @return An allocated pointer of @ref XXH64_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH64_freeState(). + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); /*! * @brief Frees an @ref XXH64_state_t. * - * Must be allocated with XXH64_createState(). * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). - * @return XXH_OK. + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH64_createState(). + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); @@ -912,23 +955,24 @@ XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const /*! * @brief Resets an @ref XXH64_state_t to begin a new hash. * - * This function resets and seeds a state. Call it before @ref XXH64_update(). - * * @param statePtr The state struct to reset. * @param seed The 64-bit seed to alter the hash result predictably. * * @pre * @p statePtr must not be `NULL`. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. 
Call it before @ref XXH64_update(). + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); /*! * @brief Consumes a block of @p input to an @ref XXH64_state_t. * - * Call this to incrementally consume blocks of data. - * * @param statePtr The state struct to update. * @param input The block of data to be hashed, at least @p length bytes in size. * @param length The length of @p input, in bytes. @@ -940,23 +984,30 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); /*! * @brief Returns the calculated hash value from an @ref XXH64_state_t. * - * @note - * Calling XXH64_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * * @param statePtr The state struct to calculate the hash from. * * @pre * @p statePtr must not be `NULL`. * - * @return The calculated xxHash64 value from that state. + * @return The calculated 64-bit xxHash64 value from that state. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. 
+ * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); #endif /* !XXH_NO_STREAM */ @@ -975,6 +1026,8 @@ typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t * * @pre * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" */ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); @@ -987,6 +1040,8 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, * @p src must not be `NULL`. * * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" */ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); @@ -1046,40 +1101,75 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const * * The API supports one-shot hashing, streaming mode, and custom secrets. */ + +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Unless set explicitly, determined automatically. + */ +# define XXH_SCALAR 0 /*!< Portable scalar version */ +# define XXH_SSE2 1 /*!< SSE2 for Pentium 4, Opteron, all x86_64. 
*/ +# define XXH_AVX2 2 /*!< AVX2 for Haswell and Bulldozer */ +# define XXH_AVX512 3 /*!< AVX512 for Skylake and Icelake */ +# define XXH_NEON 4 /*!< NEON for most ARMv7-A, all AArch64, and WASM SIMD128 */ +# define XXH_VSX 5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */ +# define XXH_SVE 6 /*!< SVE for some ARMv8-A and ARMv9-A */ +# define XXH_LSX 7 /*!< LSX (128-bit SIMD) for LoongArch64 */ +# define XXH_LASX 8 /*!< LASX (256-bit SIMD) for LoongArch64 */ +# define XXH_RVV 9 /*!< RVV (RISC-V Vector) for RISC-V */ + /*-********************************************************************** * XXH3 64-bit variant ************************************************************************/ /*! - * @brief 64-bit unseeded variant of XXH3. + * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input. * - * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however - * it may have slightly better performance due to constant propagation of the - * defaults. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. * - * @see - * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms * @see * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants - * @see - * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version. + * @see @ref single_shot_example "Single Shot Example" for an example. 
*/ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); /*! - * @brief 64-bit seeded variant of XXH3 + * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input. * - * This variant generates a custom secret on the fly based on default secret - * altered using the `seed` value. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. * - * While this operation is decently fast, note that it's not completely free. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. * * @note * seed == 0 produces the same results as @ref XXH3_64bits(). * - * @param input The data to hash - * @param length The length - * @param seed The 64-bit seed to alter the state. + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see @ref single_shot_example "Single Shot Example" for an example. */ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); @@ -1093,22 +1183,36 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const vo #define XXH3_SECRET_SIZE_MIN 136 /*! - * @brief 64-bit variant of XXH3 with a custom "secret". + * @brief Calculates 64-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. 
+ * + * @return The calculated 64-bit XXH3 hash value. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. * * It's possible to provide any blob of bytes as a "secret" to generate the hash. * This makes it more difficult for an external actor to prepare an intentional collision. - * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). * However, the quality of the secret impacts the dispersion of the hash algorithm. * Therefore, the secret _must_ look like a bunch of random bytes. * Avoid "trivial" or structured data such as repeated sequences or a text document. * Whenever in doubt about the "randomness" of the blob of bytes, - * consider employing "XXH3_generateSecret()" instead (see below). + * consider employing @ref XXH3_generateSecret() instead (see below). * It will generate a proper high entropy secret derived from the blob of bytes. * Another advantage of using XXH3_generateSecret() is that * it guarantees that all bits within the initial blob of bytes * will impact every bit of the output. * This is not necessarily the case when using the blob of bytes directly * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. */ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); @@ -1123,9 +1227,10 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const */ /*! - * @brief The state struct for the XXH3 streaming API. + * @brief The opaque state struct for the XXH3 streaming API. * * @see XXH3_state_s for details. 
+ * @see @ref streaming_example "Streaming Example" */ typedef struct XXH3_state_s XXH3_state_t; XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); @@ -1144,15 +1249,20 @@ XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOE /*! * @brief Resets an @ref XXH3_state_t to begin a new hash. * - * This function resets `statePtr` and generate a secret with default parameters. Call it before @ref XXH3_64bits_update(). - * Digest will be equivalent to `XXH3_64bits()`. - * * @param statePtr The state struct to reset. * * @pre * @p statePtr must not be `NULL`. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits()`. + * + * @see @ref streaming_example "Streaming Example" * */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); @@ -1160,36 +1270,54 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* stateP /*! * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. * - * This function resets `statePtr` and generate a secret from `seed`. Call it before @ref XXH3_64bits_update(). - * Digest will be equivalent to `XXH3_64bits_withSeed()`. - * * @param statePtr The state struct to reset. - * @param seed The 64-bit seed to alter the state. + * @param seed The 64-bit seed to alter the hash result predictably. * * @pre * @p statePtr must not be `NULL`. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call this function before @ref XXH3_64bits_update(). 
+ * - Digest will be equivalent to `XXH3_64bits_withSeed()`. + * + * @see @ref streaming_example "Streaming Example" * */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); /*! - * XXH3_64bits_reset_withSecret(): - * `secret` is referenced, it _must outlive_ the hash streaming session. - * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * `secret` is referenced, it _must outlive_ the hash streaming session. + * + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, * and the quality of produced hash values depends on secret's entropy * (secret's content should look like a bunch of random bytes). * When in doubt about the randomness of a candidate `secret`, * consider employing `XXH3_generateSecret()` instead (see below). + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); /*! * @brief Consumes a block of @p input to an @ref XXH3_state_t. * - * Call this to incrementally consume blocks of data. - * * @param statePtr The state struct to update. * @param input The block of data to be hashed, at least @p length bytes in size. * @param length The length of @p input, in bytes. @@ -1201,25 +1329,32 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_stat * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. 
* - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); /*! * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. * - * @note - * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * * @param statePtr The state struct to calculate the hash from. * * @pre * @p statePtr must not be `NULL`. * * @return The calculated XXH3 64-bit hash value from that state. + * + * @note + * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); #endif /* !XXH_NO_STREAM */ /* note : canonical representation of XXH3 is the same as XXH64 @@ -1242,26 +1377,71 @@ typedef struct { } XXH128_hash_t; /*! - * @brief Unseeded 128-bit variant of XXH3 + * @brief Calculates 128-bit unseeded variant of XXH3 of @p data. + * + * @param data The block of data to be hashed, at least @p length bytes in size. + * @param len The length of @p data, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. * * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead * for shorter inputs. * - * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however * it may have slightly better performance due to constant propagation of the * defaults. 
 *
- * @see
- * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms
- * @see
- * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
- * @see
- * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
+ * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
-/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
+/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * @note
+ * seed == 0 produces the same results as @ref XXH3_128bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
-/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
+/*!
+ * @brief Calculates 128-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len The length of @p data, in bytes.
+ * @param secret The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); /******* Streaming *******/ @@ -1281,36 +1461,65 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE cons /*! * @brief Resets an @ref XXH3_state_t to begin a new hash. * - * This function resets `statePtr` and generate a secret with default parameters. Call it before @ref XXH3_128bits_update(). - * Digest will be equivalent to `XXH3_128bits()`. - * * @param statePtr The state struct to reset. * * @pre * @p statePtr must not be `NULL`. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. 
* + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits()`. + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); /*! * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. * - * This function resets `statePtr` and generate a secret from `seed`. Call it before @ref XXH3_128bits_update(). - * Digest will be equivalent to `XXH3_128bits_withSeed()`. - * * @param statePtr The state struct to reset. - * @param seed The 64-bit seed to alter the state. + * @param seed The 64-bit seed to alter the hash result predictably. * * @pre * @p statePtr must not be `NULL`. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits_withSeed()`. + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); -/*! @brief Custom secret 128-bit variant of XXH3. @see XXH_64bits_reset_withSecret(). */ +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * `secret` is referenced, it _must outlive_ the hash streaming session. 
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + * + * @see @ref streaming_example "Streaming Example" + */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); /*! @@ -1324,28 +1533,32 @@ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_sta * * @pre * @p statePtr must not be `NULL`. - * @pre + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note * The memory between @p input and @p input + @p length must be valid, * readable, contiguous memory. However, if @p length is `0`, @p input may be * `NULL`. In C++, this also must be *TriviallyCopyable*. * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); /*! * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. * - * @note - * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * * @param statePtr The state struct to calculate the hash from. * * @pre * @p statePtr must not be `NULL`. * * @return The calculated XXH3 128-bit hash value from that state. + * + * @note + * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. 
+ * */ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); #endif /* !XXH_NO_STREAM */ @@ -1355,18 +1568,27 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const X * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ /*! - * XXH128_isEqual(): - * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + * @brief Check equality of two XXH128_hash_t values + * + * @param h1 The 128-bit hash value. + * @param h2 Another 128-bit hash value. + * + * @return `1` if `h1` and `h2` are equal. + * @return `0` if they are not. */ XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); /*! * @brief Compares two @ref XXH128_hash_t + * * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. * - * @return: >0 if *h128_1 > *h128_2 - * =0 if *h128_1 == *h128_2 - * <0 if *h128_1 < *h128_2 + * @param h128_1 Left-hand side value + * @param h128_2 Right-hand side value + * + * @return >0 if @p h128_1 > @p h128_2 + * @return =0 if @p h128_1 == @p h128_2 + * @return <0 if @p h128_1 < @p h128_2 */ XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); @@ -1378,11 +1600,12 @@ typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical /*! * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. * - * @param dst The @ref XXH128_canonical_t pointer to be stored to. + * @param dst The @ref XXH128_canonical_t pointer to be stored to. * @param hash The @ref XXH128_hash_t to be converted. * * @pre * @p dst must not be `NULL`. + * @see @ref canonical_representation_example "Canonical Representation Example" */ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); @@ -1395,6 +1618,7 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* ds * @p src must not be `NULL`. 
 *
 * @return The converted hash.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
@@ -1440,9 +1664,9 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE con
struct XXH32_state_s {
 XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
 XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
- XXH32_hash_t v[4]; /*!< Accumulator lanes */
- XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
- XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */
+ XXH32_hash_t acc[4]; /*!< Accumulator lanes */
+ unsigned char buffer[16]; /*!< Internal buffer for partial reads. */
+ XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */
 XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */
}; /* typedef'd to XXH32_state_t */
@@ -1463,9 +1687,9 @@ struct XXH32_state_s {
 */
struct XXH64_state_s {
 XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */
- XXH64_hash_t v[4]; /*!< Accumulator lanes */
- XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
- XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */
+ XXH64_hash_t acc[4]; /*!< Accumulator lanes */
+ unsigned char buffer[32]; /*!< Internal buffer for partial reads. */
+ XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */
 XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/
 XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it.
*/ }; /* typedef'd to XXH64_state_t */ @@ -1473,8 +1697,7 @@ struct XXH64_state_s { #ifndef XXH_NO_XXH3 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ -# include -# define XXH_ALIGN(n) alignas(n) +# define XXH_ALIGN(n) _Alignas(n) #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ /* In C++ alignas() is a keyword */ # define XXH_ALIGN(n) alignas(n) @@ -1496,6 +1719,7 @@ struct XXH64_state_s { #endif /*! + * @internal * @brief The size of the internal XXH3 buffer. * * This is the optimal update size for incremental hashing. @@ -1505,10 +1729,11 @@ struct XXH64_state_s { #define XXH3_INTERNALBUFFER_SIZE 256 /*! - * @internal - * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * @def XXH3_SECRET_DEFAULT_SIZE + * @brief Default Secret's size * - * This is the size used in @ref XXH3_kSecret and the seeded functions. + * This is the size of internal XXH3_kSecret + * and is needed by XXH3_generateSecret_fromSeed(). * * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. */ @@ -1538,7 +1763,7 @@ struct XXH64_state_s { */ struct XXH3_state_s { XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); - /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */ + /*!< The 8 accumulators. See @ref XXH32_state_s::acc and @ref XXH64_state_s::acc */ XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); /*!< Used to store a custom secret generated from a seed. */ XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); @@ -1587,7 +1812,20 @@ struct XXH3_state_s { /*! - * simple alias to pre-selected XXH3_128bits variant + * @brief Calculates the 128-bit hash of @p data using XXH3. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. 
+ * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p len is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 128-bit XXH3 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. */ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); @@ -1596,9 +1834,16 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, siz /* Symbols defined below must be considered tied to a specific library version. */ /*! - * XXH3_generateSecret(): + * @brief Derive a high-entropy secret from any user-defined content, named customSeed. + * + * @param secretBuffer A writable buffer for derived high-entropy secret data. + * @param secretSize Size of secretBuffer, in bytes. Must be >= XXH3_SECRET_SIZE_MIN. + * @param customSeed A user-defined content. + * @param customSeedSize Size of customSeed, in bytes. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. * - * Derive a high-entropy secret from any user-defined content, named customSeed. * The generated secret can be used in combination with `*_withSecret()` functions. * The `_withSecret()` variants are useful to provide a higher level of protection * than 64-bit seed, as it becomes much more difficult for an external actor to @@ -1651,6 +1896,9 @@ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer /*! * @brief Generate the same secret as the _withSeed() variants. * + * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes + * @param seed The 64-bit seed to alter the hash result predictably. + * * The generated secret can be used in combination with *`*_withSecret()` and `_withSecretandSeed()` variants. 
* @@ -1670,7 +1918,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer * }; * // Fast, caches the seeded secret for future uses. * class HashFast { - * unsigned char secret[XXH3_SECRET_SIZE_MIN]; + * unsigned char secret[XXH3_SECRET_DEFAULT_SIZE]; * public: * HashFast(XXH64_hash_t s) { * XXH3_generateSecret_fromSeed(secret, seed); @@ -1682,15 +1930,26 @@ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer * } * }; * @endcode - * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes - * @param seed The seed to seed the state. */ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed); /*! - * These variants generate hash values using either - * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes) - * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX). + * @brief Maximum size of "short" key in bytes. + */ +#define XXH3_MIDSIZE_MAX 240 + +/*! + * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * These variants generate hash values using either: + * - @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes) + * - @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX). * * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. * `_withSeed()` has to generate the secret on the fly for "large" keys. @@ -1717,22 +1976,71 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed); -/*! @copydoc XXH3_64bits_withSecretandSeed() */ + +/*! 
+ * @brief Calculates 128-bit seeded variant of XXH3 hash of @p input.
+ *
+ * @param input The memory segment to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param secret The secret used to alter hash result predictably.
+ * @param secretSize The length of @p secret, in bytes (must be >= XXH3_SECRET_SIZE_MIN)
+ * @param seed64 The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * @see XXH3_64bits_withSecretandSeed(): contract is the same.
+ */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64);
+
#ifndef XXH_NO_STREAM
-/*! @copydoc XXH3_64bits_withSecretandSeed() */
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64 The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
+ */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64);
-/*! @copydoc XXH3_64bits_withSecretandSeed() */
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64 The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ * + * @see XXH3_64bits_withSecretandSeed(). Contract is identical. + * + * Note: there was a bug in an earlier version of this function (<= v0.8.2) + * that would make it generate an incorrect hash value + * when @p seed == 0 and @p length < XXH3_MIDSIZE_MAX + * and @p secret is different from XXH3_generateSecret_fromSeed(). + * As stated in the contract, the correct hash result must be + * the same as XXH3_128bits_withSeed() when @p length <= XXH3_MIDSIZE_MAX. + * Results generated by this older version are wrong, hence not comparable. + */ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64); + #endif /* !XXH_NO_STREAM */ #endif /* !XXH_NO_XXH3 */ @@ -2077,16 +2385,35 @@ static void XXH_free(void* p) { free(p); } #endif /* XXH_NO_STDLIB */ -#include +#ifndef XXH_memcpy +/*! + * @internal + * @brief XXH_memcpy() macro can be redirected at compile time + */ +# include +# define XXH_memcpy memcpy +#endif +#ifndef XXH_memset /*! * @internal - * @brief Modify this function to use a different routine than memcpy(). + * @brief XXH_memset() macro can be redirected at compile time */ -static void* XXH_memcpy(void* dest, const void* src, size_t size) -{ - return memcpy(dest,src,size); -} +# include +# define XXH_memset memset +#endif + +#ifndef XXH_memcmp +/*! + * @internal + * @brief XXH_memcmp() macro can be redirected at compile time + * Note: only needed by XXH128. 
+ */ +# include +# define XXH_memcmp memcmp +#endif + + #include /* ULLONG_MAX */ @@ -2100,15 +2427,15 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) #if XXH_NO_INLINE_HINTS /* disable inlining hints */ # if defined(__GNUC__) || defined(__clang__) -# define XXH_FORCE_INLINE static __attribute__((unused)) +# define XXH_FORCE_INLINE static __attribute__((__unused__)) # else # define XXH_FORCE_INLINE static # endif # define XXH_NO_INLINE static /* enable inlining hints */ #elif defined(__GNUC__) || defined(__clang__) -# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) -# define XXH_NO_INLINE static __attribute__((noinline)) +# define XXH_FORCE_INLINE static __inline__ __attribute__((__always_inline__, __unused__)) +# define XXH_NO_INLINE static __attribute__((__noinline__)) #elif defined(_MSC_VER) /* Visual Studio */ # define XXH_FORCE_INLINE static __forceinline # define XXH_NO_INLINE static __declspec(noinline) @@ -2121,12 +2448,34 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) # define XXH_NO_INLINE static #endif +#if defined(XXH_INLINE_ALL) +# define XXH_STATIC XXH_FORCE_INLINE +#else +# define XXH_STATIC static +#endif + #if XXH3_INLINE_SECRET # define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE #else # define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE #endif +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. 
+ */ +# define XXH_RESTRICT __restrict +#else +# define XXH_RESTRICT /* disable */ +#endif /* ************************************* * Debug @@ -2206,10 +2555,14 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) #if !defined (__VMS) \ && (defined (__cplusplus) \ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - typedef uint8_t xxh_u8; +# ifdef _AIX +# include +# else +# include +# endif + typedef uint8_t xxh_u8; #else - typedef unsigned char xxh_u8; + typedef unsigned char xxh_u8; #endif typedef XXH32_hash_t xxh_u32; @@ -2295,11 +2648,11 @@ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; * https://gcc.godbolt.org/z/xYez1j67Y. */ #ifdef XXH_OLD_NAMES -typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +typedef union { xxh_u32 u32; } __attribute__((__packed__)) unalign; #endif static xxh_u32 XXH_read32(const void* ptr) { - typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; + typedef __attribute__((__aligned__(1))) __attribute__((__may_alias__)) xxh_u32 xxh_unalign32; return *((const xxh_unalign32*)ptr); } @@ -2391,7 +2744,7 @@ static int XXH_isLittleEndian(void) * additional case: * * ``` - * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) + * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L) * # include * # ifdef unreachable * # define XXH_UNREACHABLE() unreachable() @@ -2445,6 +2798,9 @@ static int XXH_isLittleEndian(void) && XXH_HAS_BUILTIN(__builtin_rotateleft64) # define XXH_rotl32 __builtin_rotateleft32 # define XXH_rotl64 __builtin_rotateleft64 +#elif XXH_HAS_BUILTIN(__builtin_stdc_rotate_left) +# define XXH_rotl32 __builtin_stdc_rotate_left +# define XXH_rotl64 __builtin_stdc_rotate_left /* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ #elif defined(_MSC_VER) # define XXH_rotl32(x,r) _rotl(x,r) @@ -2590,7 +2946,7 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) #if 
(defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) /* * UGLY HACK: - * A compiler fence is the only thing that prevents GCC and Clang from + * A compiler fence is used to prevent GCC and Clang from * autovectorizing the XXH32 loop (pragmas and attributes don't work for some * reason) without globally disabling SSE4.1. * @@ -2649,8 +3005,63 @@ static xxh_u32 XXH32_avalanche(xxh_u32 hash) return hash; } -#define XXH_get32bits(p) XXH_readLE32_align(p, align) - +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Sets up the initial accumulator state for XXH32(). + */ +XXH_FORCE_INLINE void +XXH32_initAccs(xxh_u32 *acc, xxh_u32 seed) +{ + XXH_ASSERT(acc != NULL); + acc[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + acc[1] = seed + XXH_PRIME32_2; + acc[2] = seed + 0; + acc[3] = seed - XXH_PRIME32_1; +} + +/*! + * @internal + * @brief Consumes a block of data for XXH32(). + * + * @return the end input pointer. + */ +XXH_FORCE_INLINE const xxh_u8 * +XXH32_consumeLong( + xxh_u32 *XXH_RESTRICT acc, + xxh_u8 const *XXH_RESTRICT input, + size_t len, + XXH_alignment align +) +{ + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + XXH_ASSERT(acc != NULL); + XXH_ASSERT(input != NULL); + XXH_ASSERT(len >= 16); + do { + acc[0] = XXH32_round(acc[0], XXH_get32bits(input)); input += 4; + acc[1] = XXH32_round(acc[1], XXH_get32bits(input)); input += 4; + acc[2] = XXH32_round(acc[2], XXH_get32bits(input)); input += 4; + acc[3] = XXH32_round(acc[3], XXH_get32bits(input)); input += 4; + } while (input < limit); + + return input; +} + +/*! + * @internal + * @brief Merges the accumulator lanes together for XXH32() + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_mergeAccs(const xxh_u32 *acc) +{ + XXH_ASSERT(acc != NULL); + return XXH_rotl32(acc[0], 1) + XXH_rotl32(acc[1], 7) + + XXH_rotl32(acc[2], 12) + XXH_rotl32(acc[3], 18); +} + /*! 
* @internal * @brief Processes the last 0-15 bytes of @p ptr. @@ -2763,22 +3174,12 @@ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment if (input==NULL) XXH_ASSERT(len == 0); if (len>=16) { - const xxh_u8* const bEnd = input + len; - const xxh_u8* const limit = bEnd - 15; - xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; - xxh_u32 v2 = seed + XXH_PRIME32_2; - xxh_u32 v3 = seed + 0; - xxh_u32 v4 = seed - XXH_PRIME32_1; + xxh_u32 acc[4]; + XXH32_initAccs(acc, seed); - do { - v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; - v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; - v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; - v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; - } while (input < limit); - - h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) - + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + input = XXH32_consumeLong(acc, input, len, align); + + h32 = XXH32_mergeAccs(acc); } else { h32 = seed + XXH_PRIME32_5; } @@ -2833,11 +3234,8 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) { XXH_ASSERT(statePtr != NULL); - memset(statePtr, 0, sizeof(*statePtr)); - statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; - statePtr->v[1] = seed + XXH_PRIME32_2; - statePtr->v[2] = seed + 0; - statePtr->v[3] = seed - XXH_PRIME32_1; + XXH_memset(statePtr, 0, sizeof(*statePtr)); + XXH32_initAccs(statePtr->acc, seed); return XXH_OK; } @@ -2851,45 +3249,37 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len) return XXH_OK; } - { const xxh_u8* p = (const xxh_u8*)input; - const xxh_u8* const bEnd = p + len; + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); - state->total_len_32 += (XXH32_hash_t)len; - state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + XXH_ASSERT(state->bufferedSize < 
sizeof(state->buffer)); + if (len < sizeof(state->buffer) - state->bufferedSize) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } - if (state->memsize + len < 16) { /* fill in tmp buffer */ - XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); - state->memsize += (XXH32_hash_t)len; - return XXH_OK; - } + { const xxh_u8* xinput = (const xxh_u8*)input; + const xxh_u8* const bEnd = xinput + len; - if (state->memsize) { /* some data left from previous update */ - XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); - { const xxh_u32* p32 = state->mem32; - state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; - state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; - state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; - state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); - } - p += 16-state->memsize; - state->memsize = 0; + if (state->bufferedSize) { /* non-empty buffer: complete first */ + XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize); + xinput += sizeof(state->buffer) - state->bufferedSize; + /* then process one round */ + (void)XXH32_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned); + state->bufferedSize = 0; } - if (p <= bEnd-16) { - const xxh_u8* const limit = bEnd - 16; - - do { - state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; - state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; - state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; - state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; - } while (p<=limit); - + XXH_ASSERT(xinput <= bEnd); + if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) { + /* Process the remaining data */ + xinput = XXH32_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned); } - if (p < bEnd) { - 
XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); + if (xinput < bEnd) { + /* Copy the leftover to the tmp buffer */ + XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput)); + state->bufferedSize = (unsigned)(bEnd-xinput); } } @@ -2903,36 +3293,20 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) xxh_u32 h32; if (state->large_len) { - h32 = XXH_rotl32(state->v[0], 1) - + XXH_rotl32(state->v[1], 7) - + XXH_rotl32(state->v[2], 12) - + XXH_rotl32(state->v[3], 18); + h32 = XXH32_mergeAccs(state->acc); } else { - h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; + h32 = state->acc[2] /* == seed */ + XXH_PRIME32_5; } h32 += state->total_len_32; - return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); + return XXH32_finalize(h32, state->buffer, state->bufferedSize, XXH_aligned); } #endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ -/*! - * @ingroup XXH32_family - * The default return values from XXH functions are unsigned 32 and 64 bit - * integers. - * - * The canonical representation uses big endian convention, the same convention - * as human-readable numbers (large digits first). - * - * This way, hash values can be written into a file or buffer, remaining - * comparable across different systems. - * - * The following functions allow transformation of hash values to and from their - * canonical format. - */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) { XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); @@ -2987,11 +3361,11 @@ static xxh_u64 XXH_read64(const void* memPtr) * https://gcc.godbolt.org/z/xYez1j67Y. 
*/ #ifdef XXH_OLD_NAMES -typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((__packed__)) unalign64; #endif static xxh_u64 XXH_read64(const void* ptr) { - typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; + typedef __attribute__((__aligned__(1))) __attribute__((__may_alias__)) xxh_u64 xxh_unalign64; return *((const xxh_unalign64*)ptr); } @@ -3110,6 +3484,23 @@ static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) acc += input * XXH_PRIME64_2; acc = XXH_rotl64(acc, 31); acc *= XXH_PRIME64_1; +#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * DISABLE AUTOVECTORIZATION: + * A compiler fence is used to prevent GCC and Clang from + * autovectorizing the XXH64 loop (pragmas and attributes don't work for some + * reason) without globally disabling AVX512. + * + * Autovectorization of XXH64 tends to be detrimental, + * though the exact outcome may change depending on exact cpu and compiler version. + * For information, it has been reported as detrimental for Skylake-X, + * but possibly beneficial for Zen4. + * + * The default is to disable auto-vectorization, + * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable. + */ + XXH_COMPILER_GUARD(acc); +#endif return acc; } @@ -3135,6 +3526,85 @@ static xxh_u64 XXH64_avalanche(xxh_u64 hash) #define XXH_get64bits(p) XXH_readLE64_align(p, align) +/*! + * @internal + * @brief Sets up the initial accumulator state for XXH64(). + */ +XXH_FORCE_INLINE void +XXH64_initAccs(xxh_u64 *acc, xxh_u64 seed) +{ + XXH_ASSERT(acc != NULL); + acc[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + acc[1] = seed + XXH_PRIME64_2; + acc[2] = seed + 0; + acc[3] = seed - XXH_PRIME64_1; +} + +/*! + * @internal + * @brief Consumes a block of data for XXH64(). + * + * @return the end input pointer. 
+ */ +XXH_FORCE_INLINE const xxh_u8 * +XXH64_consumeLong( + xxh_u64 *XXH_RESTRICT acc, + xxh_u8 const *XXH_RESTRICT input, + size_t len, + XXH_alignment align +) +{ + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + XXH_ASSERT(acc != NULL); + XXH_ASSERT(input != NULL); + XXH_ASSERT(len >= 32); + do { + /* reroll on 32-bit */ + if (sizeof(void *) < sizeof(xxh_u64)) { + size_t i; + for (i = 0; i < 4; i++) { + acc[i] = XXH64_round(acc[i], XXH_get64bits(input)); + input += 8; + } + } else { + acc[0] = XXH64_round(acc[0], XXH_get64bits(input)); input += 8; + acc[1] = XXH64_round(acc[1], XXH_get64bits(input)); input += 8; + acc[2] = XXH64_round(acc[2], XXH_get64bits(input)); input += 8; + acc[3] = XXH64_round(acc[3], XXH_get64bits(input)); input += 8; + } + } while (input < limit); + + return input; +} + +/*! + * @internal + * @brief Merges the accumulator lanes together for XXH64() + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_mergeAccs(const xxh_u64 *acc) +{ + XXH_ASSERT(acc != NULL); + { + xxh_u64 h64 = XXH_rotl64(acc[0], 1) + XXH_rotl64(acc[1], 7) + + XXH_rotl64(acc[2], 12) + XXH_rotl64(acc[3], 18); + /* reroll on 32-bit */ + if (sizeof(void *) < sizeof(xxh_u64)) { + size_t i; + for (i = 0; i < 4; i++) { + h64 = XXH64_mergeRound(h64, acc[i]); + } + } else { + h64 = XXH64_mergeRound(h64, acc[0]); + h64 = XXH64_mergeRound(h64, acc[1]); + h64 = XXH64_mergeRound(h64, acc[2]); + h64 = XXH64_mergeRound(h64, acc[3]); + } + return h64; + } +} + /*! * @internal * @brief Processes the last 0-31 bytes of @p ptr. @@ -3150,7 +3620,7 @@ static xxh_u64 XXH64_avalanche(xxh_u64 hash) * @return The finalized hash * @see XXH32_finalize(). 
*/ -static XXH_PUREF xxh_u64 +XXH_STATIC XXH_PUREF xxh_u64 XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) { if (ptr==NULL) XXH_ASSERT(len == 0); @@ -3200,27 +3670,13 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment xxh_u64 h64; if (input==NULL) XXH_ASSERT(len == 0); - if (len>=32) { - const xxh_u8* const bEnd = input + len; - const xxh_u8* const limit = bEnd - 31; - xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; - xxh_u64 v2 = seed + XXH_PRIME64_2; - xxh_u64 v3 = seed + 0; - xxh_u64 v4 = seed - XXH_PRIME64_1; + if (len>=32) { /* Process a large block of data */ + xxh_u64 acc[4]; + XXH64_initAccs(acc, seed); - do { - v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; - v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; - v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; - v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; - } while (inputv[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; - statePtr->v[1] = seed + XXH_PRIME64_2; - statePtr->v[2] = seed + 0; - statePtr->v[3] = seed - XXH_PRIME64_1; + XXH_memset(statePtr, 0, sizeof(*statePtr)); + XXH64_initAccs(statePtr->acc, seed); return XXH_OK; } @@ -3292,42 +3745,36 @@ XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, return XXH_OK; } - { const xxh_u8* p = (const xxh_u8*)input; - const xxh_u8* const bEnd = p + len; + state->total_len += len; - state->total_len += len; + XXH_ASSERT(state->bufferedSize <= sizeof(state->buffer)); + if (len < sizeof(state->buffer) - state->bufferedSize) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } - if (state->memsize + len < 32) { /* fill in tmp buffer */ - XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); - state->memsize += (xxh_u32)len; - return XXH_OK; - } + { const xxh_u8* xinput = (const xxh_u8*)input; + const xxh_u8* const bEnd = xinput + len; - 
if (state->memsize) { /* tmp buffer is full */ - XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); - state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); - state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); - state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); - state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); - p += 32 - state->memsize; - state->memsize = 0; + if (state->bufferedSize) { /* non-empty buffer => complete first */ + XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize); + xinput += sizeof(state->buffer) - state->bufferedSize; + /* and process one round */ + (void)XXH64_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned); + state->bufferedSize = 0; } - if (p+32 <= bEnd) { - const xxh_u8* const limit = bEnd - 32; - - do { - state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; - state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; - state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; - state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; - } while (p<=limit); - + XXH_ASSERT(xinput <= bEnd); + if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) { + /* Process the remaining data */ + xinput = XXH64_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned); } - if (p < bEnd) { - XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); + if (xinput < bEnd) { + /* Copy the leftover to the tmp buffer */ + XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput)); + state->bufferedSize = (unsigned)(bEnd-xinput); } } @@ -3341,18 +3788,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state xxh_u64 h64; if (state->total_len >= 32) { - h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); - h64 = 
XXH64_mergeRound(h64, state->v[0]); - h64 = XXH64_mergeRound(h64, state->v[1]); - h64 = XXH64_mergeRound(h64, state->v[2]); - h64 = XXH64_mergeRound(h64, state->v[3]); + h64 = XXH64_mergeAccs(state->acc); } else { - h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; + h64 = state->acc[2] /*seed*/ + XXH_PRIME64_5; } h64 += (xxh_u64) state->total_len; - return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); + return XXH64_finalize(h64, state->buffer, (size_t)state->total_len, XXH_aligned); } #endif /* !XXH_NO_STREAM */ @@ -3387,22 +3830,6 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can /* === Compiler specifics === */ -#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ -# define XXH_RESTRICT /* disable */ -#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ -# define XXH_RESTRICT restrict -#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ - || (defined (__clang__)) \ - || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ - || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) -/* - * There are a LOT more compilers that recognize __restrict but this - * covers the major ones. 
- */ -# define XXH_RESTRICT __restrict -#else -# define XXH_RESTRICT /* disable */ -#endif #if (defined(__GNUC__) && (__GNUC__ >= 3)) \ || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ @@ -3416,7 +3843,11 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can #ifndef XXH_HAS_INCLUDE # ifdef __has_include -# define XXH_HAS_INCLUDE(x) __has_include(x) +/* + * Not defined as XXH_HAS_INCLUDE(x) (function-like) because + * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion) + */ +# define XXH_HAS_INCLUDE __has_include # else # define XXH_HAS_INCLUDE(x) 0 # endif @@ -3437,6 +3868,13 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can # include # elif defined(__SSE2__) # include +# elif defined(__loongarch_asx) +# include +# include +# elif defined(__loongarch_sx) +# include +# elif defined(__riscv_vector) +# include # endif #endif @@ -3526,40 +3964,13 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can * @ingroup tuning * @brief Overrides the vectorization implementation chosen for XXH3. * - * Can be defined to 0 to disable SIMD or any of the values mentioned in - * @ref XXH_VECTOR_TYPE. + * Can be defined to 0 to disable SIMD, + * or any other authorized value of @ref XXH_VECTOR. * * If this is not defined, it uses predefined macros to determine the best * implementation. */ # define XXH_VECTOR XXH_SCALAR -/*! - * @ingroup tuning - * @brief Possible values for @ref XXH_VECTOR. - * - * Note that these are actually implemented as macros. - * - * If this is not defined, it is detected automatically. - * internal macro XXH_X86DISPATCH overrides this. - */ -enum XXH_VECTOR_TYPE /* fake enum */ { - XXH_SCALAR = 0, /*!< Portable scalar version */ - XXH_SSE2 = 1, /*!< - * SSE2 for Pentium 4, Opteron, all x86_64. - * - * @note SSE2 is also guaranteed on Windows 10, macOS, and - * Android x86. 
- */ - XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ - XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ - XXH_NEON = 4, /*!< - * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 - * via the SIMDeverywhere polyfill provided with the - * Emscripten SDK. - */ - XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ - XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ -}; /*! * @ingroup tuning * @brief Selects the minimum alignment for XXH3's accumulators. @@ -3574,19 +3985,10 @@ enum XXH_VECTOR_TYPE /* fake enum */ { /* Actual definition */ #ifndef XXH_DOXYGEN -# define XXH_SCALAR 0 -# define XXH_SSE2 1 -# define XXH_AVX2 2 -# define XXH_AVX512 3 -# define XXH_NEON 4 -# define XXH_VSX 5 -# define XXH_SVE 6 #endif #ifndef XXH_VECTOR /* can be defined on command line */ -# if defined(__ARM_FEATURE_SVE) -# define XXH_VECTOR XXH_SVE -# elif ( \ +# if ( \ defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* wasm simd128 via SIMDe */ \ @@ -3595,16 +3997,24 @@ enum XXH_VECTOR_TYPE /* fake enum */ { || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ ) # define XXH_VECTOR XXH_NEON +# elif defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE # elif defined(__AVX512F__) # define XXH_VECTOR XXH_AVX512 # elif defined(__AVX2__) # define XXH_VECTOR XXH_AVX2 -# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) # define XXH_VECTOR XXH_SSE2 # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ || (defined(__s390x__) && defined(__VEC__)) \ && defined(__GNUC__) /* TODO: IBM XL */ # define XXH_VECTOR XXH_VSX +# elif defined(__loongarch_asx) +# define XXH_VECTOR XXH_LASX +# elif defined(__loongarch_sx) +# define XXH_VECTOR XXH_LSX +# elif 
defined(__riscv_vector) +# define XXH_VECTOR XXH_RVV # else # define XXH_VECTOR XXH_SCALAR # endif @@ -3642,6 +4052,12 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # define XXH_ACC_ALIGN 64 # elif XXH_VECTOR == XXH_SVE /* sve */ # define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_LASX /* lasx */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_LSX /* lsx */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_RVV /* rvv */ +# define XXH_ACC_ALIGN 64 # endif #endif @@ -3650,12 +4066,14 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # define XXH_SEC_ALIGN XXH_ACC_ALIGN #elif XXH_VECTOR == XXH_SVE # define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_RVV +# define XXH_SEC_ALIGN XXH_ACC_ALIGN #else # define XXH_SEC_ALIGN 8 #endif #if defined(__GNUC__) || defined(__clang__) -# define XXH_ALIASING __attribute__((may_alias)) +# define XXH_ALIASING __attribute__((__may_alias__)) #else # define XXH_ALIASING /* nothing */ #endif @@ -3971,7 +4389,10 @@ do { \ # error "default keyset is not large enough" #endif -/*! Pseudorandom secret taken directly from FARSH. */ +/*! + * @internal + * @def XXH3_kSecret + * @brief Pseudorandom secret taken directly from FARSH. 
*/ XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, @@ -4408,8 +4829,6 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, } } -#define XXH3_MIDSIZE_MAX 240 - XXH_NO_INLINE XXH_PUREF XXH64_hash_t XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, @@ -4853,10 +5272,18 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR (void)(&XXH_writeLE64); { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); -# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ - XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; - __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER <= 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 + * and some specific variants of 2015 may also lack it */ + /* Cast to unsigned 64-bit first to avoid signed arithmetic issues */ + xxh_u64 const seed64_unsigned = (xxh_u64)seed64; + xxh_u64 const neg_seed64 = (xxh_u64)(0ULL - seed64_unsigned); + __m128i const seed = _mm_set_epi32( + (int)(neg_seed64 >> 32), /* high 32 bits of negated seed */ + (int)(neg_seed64), /* low 32 bits of negated seed */ + (int)(seed64_unsigned >> 32), /* high 32 bits of original seed */ + (int)(seed64_unsigned) /* low 32 bits of original seed */ + ); # else __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); # endif @@ -5281,6 +5708,260 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, #endif +#if (XXH_VECTOR == XXH_LSX) +#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +XXH_FORCE_INLINE 
void +XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + __m128i* const xacc = (__m128i *) acc; + const __m128i* const xinput = (const __m128i *) input; + const __m128i* const xsecret = (const __m128i *) secret; + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = __lsx_vld(xinput + i, 0); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = __lsx_vld(xsecret + i, 0); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); + // __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2)); + __m128i const sum = __lsx_vadd_d(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = __lsx_vadd_d(product, sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + __m128i* const xacc = (__m128i*) acc; + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = __lsx_vreplgr2vr_d(XXH_PRIME32_1); + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = __lsx_vsrli_d(acc_vec, 47); + __m128i const data_vec = __lsx_vxor_v(acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = __lsx_vld(xsecret + i, 0); + __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); + + /* xacc[i] *= 
XXH_PRIME32_1; */ + xacc[i] = __lsx_vmul_d(data_key, prime32); + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_LASX) +#define _LASX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_lasx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { + __m256i* const xacc = (__m256i *) acc; + const __m256i* const xinput = (const __m256i *) input; + const __m256i* const xsecret = (const __m256i *) secret; + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = __lasx_xvld(xinput + i, 0); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = __lasx_xvld(xsecret + i, 0); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32); + // __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = __lasx_xvmulwev_d_wu(data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = __lasx_xvshuf4i_w(data_vec, _LASX_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = __lasx_xvadd_d(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = __lasx_xvadd_d(product, sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lasx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_lasx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { + __m256i* const xacc = (__m256i*) acc; + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = __lasx_xvreplgr2vr_d(XXH_PRIME32_1); + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = 
__lasx_xvsrli_d(acc_vec, 47); + __m256i const data_vec = __lasx_xvxor_v(acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m256i const key_vec = __lasx_xvld(xsecret + i, 0); + __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + xacc[i] = __lasx_xvmul_d(data_key, prime32); + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_RVV) +#if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \ + (defined(__clang__) && __clang_major__ < 16)) + #define RVV_OP(op) op +#else + #define concat2(X, Y) X ## Y + #define concat(X, Y) concat2(X, Y) + #define RVV_OP(op) concat(__riscv_, op) +#endif +XXH_FORCE_INLINE void +XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + { + // Try to set vector lenght to 512 bits. + // If this length is unavailable, then maximum available will be used + size_t vl = RVV_OP(vsetvl_e64m2)(8); + + uint64_t* const xacc = (uint64_t*) acc; + const uint64_t* const xinput = (const uint64_t*) input; + const uint64_t* const xsecret = (const uint64_t*) secret; + uint64_t swap_mask[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + vuint64m2_t xswap_mask = RVV_OP(vle64_v_u64m2)(swap_mask, vl); + + // vuint64m1_t is sizeless. 
+ // But we can assume that vl can be only 4(vlen=128) or 8(vlen=256,512) + for(size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++){ + /* data_vec = input[i]; */ + vuint64m2_t data_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xinput + vl * i), vl * 8)); + /* key_vec = secret[i]; */ + vuint64m2_t key_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xsecret + vl * i), vl * 8)); + /* data_key = data_vec ^ key_vec; */ + vuint64m2_t data_key = RVV_OP(vxor_vv_u64m2)(data_vec, key_vec, vl); + /* data_key_lo = data_key >> 32; */ + vuint64m2_t data_key_lo = RVV_OP(vsrl_vx_u64m2)(data_key, 32, vl); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + vuint64m2_t product = RVV_OP(vmul_vv_u64m2)(RVV_OP(vand_vx_u64m2)(data_key, 0xffffffff, vl), RVV_OP(vand_vx_u64m2)(data_key_lo, 0xffffffff, vl), vl); + /* acc_vec = xacc[i]; */ + vuint64m2_t acc_vec = RVV_OP(vle64_v_u64m2)(xacc + vl * i, vl); + acc_vec = RVV_OP(vadd_vv_u64m2)(acc_vec, product, vl); + { + /* swap high and low halves */ + vuint64m2_t data_swap = RVV_OP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl); + acc_vec = RVV_OP(vadd_vv_u64m2)(acc_vec, data_swap, vl); + } + RVV_OP(vse64_v_u64m2)(xacc + vl * i, acc_vec, vl); + } + } +} + +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + { + // Try to set vector lenght to 512 bits. 
+ // If this length is unavailable, then maximum available will be used + size_t vl = RVV_OP(vsetvl_e64m2)(8); + uint64_t* const xacc = (uint64_t*) acc; + const uint64_t* const xsecret = (const uint64_t*) secret; + + uint64_t prime[16] = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,\ + XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1}; + vuint64m2_t vprime = RVV_OP(vle64_v_u64m2)(prime, vl); + + // vuint64m2_t is sizeless. + // But we can assume that vl can be only 4(vlen=128) or 8(vlen=256,512) + for(size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++){ + /* xacc[i] ^= (xacc[i] >> 47) */ + vuint64m2_t acc_vec = RVV_OP(vle64_v_u64m2)(xacc + vl * i, vl); + vuint64m2_t shifted = RVV_OP(vsrl_vx_u64m2)(acc_vec, 47, vl); + vuint64m2_t data_vec = RVV_OP(vxor_vv_u64m2)(acc_vec, shifted, vl); + /* xacc[i] ^= xsecret[i]; */ + vuint64m2_t key_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xsecret + vl * i), vl * 8)); + vuint64m2_t data_key = RVV_OP(vxor_vv_u64m2)(data_vec, key_vec, vl); + + /* xacc[i] *= XXH_PRIME32_1; */ + vuint64m2_t prod_even = RVV_OP(vmul_vv_u64m2)(RVV_OP(vand_vx_u64m2)(data_key, 0xffffffff, vl), vprime, vl); + vuint64m2_t prod_odd = RVV_OP(vmul_vv_u64m2)(RVV_OP(vsrl_vx_u64m2)(data_key, 32, vl), vprime, vl); + vuint64m2_t prod = RVV_OP(vadd_vv_u64m2)(prod_even, RVV_OP(vsll_vx_u64m2)(prod_odd, 32, vl), vl); + RVV_OP(vse64_v_u64m2)(xacc + vl * i, prod, vl); + } + } +} + +XXH_FORCE_INLINE void +XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + { + uint64_t* const xcustomSecret = (uint64_t*)customSecret; + + (void)(&XXH_writeLE64); + { + // Calculate the number of 64-bit elements in the `XXH3_kSecret` secret + size_t 
XXH3_kSecret_64b_len = XXH_SECRET_DEFAULT_SIZE / 8; + // Create an array of repeated seed values, alternating between seed64 and -seed64. + uint64_t seed_pos[16] = {seed64, (uint64_t)(-(int64_t)seed64), \ + seed64, (uint64_t)(-(int64_t)seed64), \ + seed64, (uint64_t)(-(int64_t)seed64), \ + seed64, (uint64_t)(-(int64_t)seed64), \ + seed64, (uint64_t)(-(int64_t)seed64), \ + seed64, (uint64_t)(-(int64_t)seed64), \ + seed64, (uint64_t)(-(int64_t)seed64), \ + seed64, (uint64_t)(-(int64_t)seed64)}; + // Cast the default secret to a signed 64-bit pointer for vectorized access + const int64_t* const xXXH3_kSecret = (const int64_t*)((const void*)XXH3_kSecret); + size_t vl = 0; + for (size_t i=0; i < XXH3_kSecret_64b_len; i += vl) { + + vl = RVV_OP(vsetvl_e64m2)(XXH3_kSecret_64b_len - i); + { + vint64m2_t seed = RVV_OP(vle64_v_i64m2)((int64_t*)seed_pos, vl); + vint64m2_t src = RVV_OP(vle64_v_i64m2)((const int64_t*)&xXXH3_kSecret[i], vl); + vint64m2_t res = RVV_OP(vadd_vv_i64m2)(src, seed, vl); + RVV_OP(vse64_v_i64m2)((int64_t*)&xcustomSecret[i], res, vl); + } + } + } + } +} +#endif + + /* scalar variants - universal */ #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) @@ -5440,7 +6121,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) * SUB STR * STR * - * See XXH3_NEON_LANES for details on the pipsline. + * See XXH3_NEON_LANES for details on the pipeline. 
* * XXH3_64bits_withSeed, len == 256, Snapdragon 835 * without hack: 2654.4 MB/s @@ -5511,6 +6192,24 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#elif (XXH_VECTOR == XXH_LASX) +#define XXH3_accumulate_512 XXH3_accumulate_512_lasx +#define XXH3_accumulate XXH3_accumulate_lasx +#define XXH3_scrambleAcc XXH3_scrambleAcc_lasx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_LSX) +#define XXH3_accumulate_512 XXH3_accumulate_512_lsx +#define XXH3_accumulate XXH3_accumulate_lsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_lsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_RVV) +#define XXH3_accumulate_512 XXH3_accumulate_512_rvv +#define XXH3_accumulate XXH3_accumulate_rvv +#define XXH3_scrambleAcc XXH3_scrambleAcc_rvv +#define XXH3_initCustomSecret XXH3_initCustomSecret_rvv + #else /* scalar */ #define XXH3_accumulate_512 XXH3_accumulate_512_scalar @@ -5566,7 +6265,7 @@ XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret acc[1] ^ XXH_readLE64(secret+8) ); } -static XXH64_hash_t +static XXH_PUREF XXH64_hash_t XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) { xxh_u64 result64 = start; @@ -5593,6 +6292,15 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre return XXH3_avalanche(result64); } +/* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + +static XXH_PUREF XXH64_hash_t +XXH3_finalizeLong_64b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 len) +{ + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, len * XXH_PRIME64_1); +} + #define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ XXH_PRIME64_4, XXH_PRIME32_2, 
XXH_PRIME64_5, XXH_PRIME32_1 } @@ -5608,10 +6316,8 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); - /* do not align on 8, so that the secret is different from the accumulator */ -#define XXH_SECRET_MERGEACCS_START 11 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); + return XXH3_finalizeLong_64b(acc, (const xxh_u8*)secret, (xxh_u64)len); } /* @@ -5747,7 +6453,7 @@ XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH /* === XXH3 streaming === */ #ifndef XXH_NO_STREAM /* - * Malloc's a pointer that is always aligned to align. + * Malloc's a pointer that is always aligned to @align. * * This must be freed with `XXH_alignedFree()`. * @@ -5815,8 +6521,12 @@ static void XXH_alignedFree(void* p) /*! * @brief Allocate an @ref XXH3_state_t. * - * Must be freed with XXH3_freeState(). - * @return An allocated XXH3_state_t on success, `NULL` on failure. + * @return An allocated pointer of @ref XXH3_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH3_freeState(). + * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) { @@ -5830,9 +6540,13 @@ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) /*! * @brief Frees an @ref XXH3_state_t. * - * Must be allocated with XXH3_createState(). * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). - * @return XXH_OK. + * + * @return @ref XXH_OK. + * + * @note Must be allocated with XXH3_createState(). 
+ * + * @see @ref streaming_example "Streaming Example" */ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) { @@ -5857,7 +6571,7 @@ XXH3_reset_internal(XXH3_state_t* statePtr, XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); XXH_ASSERT(statePtr != NULL); /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ - memset((char*)statePtr + initStart, 0, initLength); + XXH_memset((char*)statePtr + initStart, 0, initLength); statePtr->acc[0] = XXH_PRIME32_3; statePtr->acc[1] = XXH_PRIME64_1; statePtr->acc[2] = XXH_PRIME64_2; @@ -5976,8 +6690,9 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, # define XXH3_STREAM_USE_STACK 1 # endif #endif -/* - * Both XXH3_64bits_update and XXH3_128bits_update use this routine. +/* This function accepts f_acc and f_scramble as function pointers, + * making it possible to implement multiple variants with different acc & scramble stages. + * This is notably useful to implement multiple vector variants with different intrinsics. */ XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t* XXH_RESTRICT const state, @@ -5991,6 +6706,16 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, } XXH_ASSERT(state != NULL); + state->totalLen += len; + + /* small input : just fill in tmp buffer */ + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + { const xxh_u8* const bEnd = input + len; const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret; #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 @@ -6003,15 +6728,6 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, #else xxh_u64* XXH_RESTRICT const acc = state->acc; #endif - state->totalLen += len; - XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); - - /* small input : just fill in tmp buffer */ - if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { - XXH_memcpy(state->buffer + state->bufferedSize, input, len); - state->bufferedSize += (XXH32_hash_t)len; - return XXH_OK; - } /* total input is now > XXH3_INTERNALBUFFER_SIZE */ #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) @@ -6058,12 +6774,21 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state, return XXH_OK; } +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_NO_INLINE XXH_errorcode +XXH3_update_regular(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + /*! 
@ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) { - return XXH3_update(state, (const xxh_u8*)input, len, - XXH3_accumulate, XXH3_scrambleAcc); + return XXH3_update_regular(state, input, len); } @@ -6111,9 +6836,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* if (state->totalLen > XXH3_MIDSIZE_MAX) { XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; XXH3_digest_long(acc, state, secret); - return XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)state->totalLen * XXH_PRIME64_1); + return XXH3_finalizeLong_64b(acc, secret, (xxh_u64)state->totalLen); } /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ if (state->useSeed) @@ -6405,6 +7128,17 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, } } +static XXH_PUREF XXH128_hash_t +XXH3_finalizeLong_128b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, xxh_u64 len) +{ + XXH128_hash_t h128; + h128.low64 = XXH3_finalizeLong_64b(acc, secret, len); + h128.high64 = XXH3_mergeAccs(acc, secret + secretSize + - XXH_STRIPE_LEN - XXH_SECRET_MERGEACCS_START, + ~(len * XXH_PRIME64_2)); + return h128; +} + XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, @@ -6418,16 +7152,7 @@ XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - { XXH128_hash_t h128; - h128.low64 = XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)len * XXH_PRIME64_1); - h128.high64 = XXH3_mergeAccs(acc, - secret + secretSize - - sizeof(acc) - XXH_SECRET_MERGEACCS_START, - ~((xxh_u64)len * XXH_PRIME64_2)); - return h128; - } + return 
XXH3_finalizeLong_128b(acc, secret, secretSize, (xxh_u64)len); } /* @@ -6599,7 +7324,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NO XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) { - return XXH3_64bits_update(state, input, len); + return XXH3_update_regular(state, input, len); } /*! @ingroup XXH3_family */ @@ -6610,19 +7335,10 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_ XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; XXH3_digest_long(acc, state, secret); XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - { XXH128_hash_t h128; - h128.low64 = XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)state->totalLen * XXH_PRIME64_1); - h128.high64 = XXH3_mergeAccs(acc, - secret + state->secretLimit + XXH_STRIPE_LEN - - sizeof(acc) - XXH_SECRET_MERGEACCS_START, - ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); - return h128; - } + return XXH3_finalizeLong_128b(acc, secret, state->secretLimit + XXH_STRIPE_LEN, (xxh_u64)state->totalLen); } /* len <= XXH3_MIDSIZE_MAX : short code */ - if (state->seed) + if (state->useSeed) return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), secret, state->secretLimit + XXH_STRIPE_LEN); @@ -6630,14 +7346,12 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_ #endif /* !XXH_NO_STREAM */ /* 128-bit utility functions */ -#include /* memcmp, memcpy */ - /* return : 1 is equal, 0 if different */ /*! 
@ingroup XXH3_family */ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) { /* note : XXH128_hash_t is compact, it has no padding byte */ - return !(memcmp(&h1, &h2, sizeof(h1))); + return !(XXH_memcmp(&h1, &h2, sizeof(h1))); } /* This prototype is compatible with stdlib's qsort(). @@ -6721,7 +7435,7 @@ XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOES { size_t pos = 0; while (pos < secretSize) { size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); - memcpy((char*)secretBuffer + pos, customSeed, toCopy); + XXH_memcpy((char*)secretBuffer + pos, customSeed, toCopy); pos += toCopy; } } @@ -6746,7 +7460,7 @@ XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed) XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; XXH3_initCustomSecret(secret, seed); XXH_ASSERT(secretBuffer != NULL); - memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE); + XXH_memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE); } @@ -6768,6 +7482,6 @@ XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed) #endif /* XXH_IMPLEMENTATION */ -#if defined (__cplusplus) +#if defined (__cplusplus) && !defined(XXH_NO_EXTERNC_GUARD) } /* extern "C" */ #endif diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index a7f5f9dacff..e5760243b39 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -153,8 +153,6 @@ add_arrow_lib(gandiva gandiva SOURCES ${SRC_FILES} - PRECOMPILED_HEADERS - "$<$:gandiva/pch.h>" OUTPUTS GANDIVA_LIBRARIES DEPENDENCIES @@ -197,8 +195,8 @@ if(WIN32) list(APPEND GANDIVA_STATIC_TEST_LINK_LIBS ${GANDIVA_OPENSSL_LIBS}) list(APPEND GANDIVA_SHARED_TEST_LINK_LIBS ${GANDIVA_OPENSSL_LIBS}) endif() -list(APPEND GANDIVA_STATIC_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK} ${ARROW_GTEST_GTEST_MAIN}) -list(APPEND GANDIVA_SHARED_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK} ${ARROW_GTEST_GTEST_MAIN}) +list(APPEND 
GANDIVA_STATIC_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK_MAIN}) +list(APPEND GANDIVA_SHARED_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK_MAIN}) function(ADD_GANDIVA_TEST REL_TEST_NAME) set(options USE_STATIC_LINKING) diff --git a/cpp/src/gandiva/GandivaConfig.cmake.in b/cpp/src/gandiva/GandivaConfig.cmake.in index 68579debd18..e99b3006c58 100644 --- a/cpp/src/gandiva/GandivaConfig.cmake.in +++ b/cpp/src/gandiva/GandivaConfig.cmake.in @@ -30,7 +30,7 @@ set(ARROW_LLVM_VERSIONS "@ARROW_LLVM_VERSIONS@") set(ARROW_ZSTD_SOURCE "@zstd_SOURCE@") include(CMakeFindDependencyMacro) -find_dependency(Arrow) +find_dependency(Arrow CONFIG) if(DEFINED CMAKE_MODULE_PATH) set(GANDIVA_CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH}) else() diff --git a/cpp/src/gandiva/arrow.h b/cpp/src/gandiva/arrow.h index 96303029678..af2c9561b2f 100644 --- a/cpp/src/gandiva/arrow.h +++ b/cpp/src/gandiva/arrow.h @@ -55,7 +55,7 @@ static inline bool is_decimal_128(DataTypePtr type) { } } -static inline bool IsArrowStringLiteral(arrow::Type::type type) { +constexpr bool IsArrowStringLiteral(arrow::Type::type type) { return type == arrow::Type::STRING || type == arrow::Type::BINARY; } diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index 96cf4a12e58..d371db59dfc 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -35,11 +35,13 @@ class TestCacheKey { }; TEST(TestCache, TestGetPut) { - Cache cache(2); - cache.PutObjectCode(TestCacheKey(1), "hello"); - cache.PutObjectCode(TestCacheKey(2), "world"); - ASSERT_EQ(cache.GetObjectCode(TestCacheKey(1)), "hello"); - ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); + Cache> cache(2); + auto hello = std::make_shared("hello"); + cache.PutObjectCode(TestCacheKey(1), hello); + auto world = std::make_shared("world"); + cache.PutObjectCode(TestCacheKey(2), world); + ASSERT_EQ(cache.GetObjectCode(TestCacheKey(1)), hello); + ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), world); } namespace { diff --git 
a/cpp/src/gandiva/context_helper.cc b/cpp/src/gandiva/context_helper.cc index 2a3efc8348b..752fbea4c33 100644 --- a/cpp/src/gandiva/context_helper.cc +++ b/cpp/src/gandiva/context_helper.cc @@ -60,7 +60,7 @@ arrow::Status ExportedContextFunctions::AddMappings(Engine* engine) const { extern "C" { -void gdv_fn_context_set_error_msg(int64_t context_ptr, char const* err_msg) { +void gdv_fn_context_set_error_msg(int64_t context_ptr, const char* err_msg) { auto context = reinterpret_cast(context_ptr); context->set_error_msg(err_msg); } diff --git a/cpp/src/gandiva/decimal_scalar.h b/cpp/src/gandiva/decimal_scalar.h index 4e07480da45..a32bc054c06 100644 --- a/cpp/src/gandiva/decimal_scalar.h +++ b/cpp/src/gandiva/decimal_scalar.h @@ -61,7 +61,7 @@ class DecimalScalar128 : public BasicDecimalScalar128 { namespace std { template <> struct hash { - std::size_t operator()(gandiva::DecimalScalar128 const& s) const noexcept { + std::size_t operator()(const gandiva::DecimalScalar128& s) const noexcept { arrow::BasicDecimal128 dvalue(s.value()); static const int kSeedValue = 4; diff --git a/cpp/src/gandiva/encrypt_utils.cc b/cpp/src/gandiva/encrypt_utils.cc index 16c195d4944..c39cf3cf0b3 100644 --- a/cpp/src/gandiva/encrypt_utils.cc +++ b/cpp/src/gandiva/encrypt_utils.cc @@ -109,16 +109,4 @@ int32_t aes_decrypt(const char* ciphertext, int32_t ciphertext_len, const char* return plaintext_len; } -const EVP_CIPHER* get_cipher_algo(int32_t key_length) { - switch (key_length) { - case 16: - return EVP_aes_128_ecb(); - case 24: - return EVP_aes_192_ecb(); - case 32: - return EVP_aes_256_ecb(); - default: - throw std::runtime_error("unsupported key length"); - } -} } // namespace gandiva diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 95f7ed70c24..a55421b1b48 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -115,10 +115,12 @@ namespace gandiva { extern const unsigned char kPrecompiledBitcode[]; extern const size_t 
kPrecompiledBitcodeSize; +namespace { + std::once_flag llvm_init_once_flag; -static bool llvm_init = false; -static llvm::StringRef cpu_name; -static std::vector cpu_attrs; +bool llvm_init = false; +llvm::StringRef cpu_name; +std::vector cpu_attrs; std::once_flag register_exported_funcs_flag; template @@ -143,7 +145,7 @@ Result MakeTargetMachineBuilder( #else using CodeGenOptLevel = llvm::CodeGenOpt::Level; #endif - auto const opt_level = + const auto opt_level = conf.optimize() ? CodeGenOptLevel::Aggressive : CodeGenOptLevel::None; jtmb.setCodeGenOptLevel(opt_level); return jtmb; @@ -200,10 +202,16 @@ Status UseJITLinkIfEnabled(llvm::orc::LLJITBuilder& jit_builder) { static auto maybe_use_jit_link = ::arrow::internal::GetEnvVar("GANDIVA_USE_JIT_LINK"); if (maybe_use_jit_link.ok()) { ARROW_ASSIGN_OR_RAISE(static auto memory_manager, CreateMemmoryManager()); +# if LLVM_VERSION_MAJOR >= 21 + jit_builder.setObjectLinkingLayerCreator([&](llvm::orc::ExecutionSession& ES) { + return std::make_unique(ES, *memory_manager); + }); +# else jit_builder.setObjectLinkingLayerCreator( [&](llvm::orc::ExecutionSession& ES, const llvm::Triple& TT) { return std::make_unique(ES, *memory_manager); }); +# endif } return Status::OK(); } @@ -241,6 +249,29 @@ Result> BuildJIT( return jit; } +arrow::Status VerifyAndLinkModule( + llvm::Module& dest_module, + llvm::Expected> src_module_or_error) { + ARROW_ASSIGN_OR_RAISE( + auto src_ir_module, + AsArrowResult(src_module_or_error, "Failed to verify and link module: ")); + + src_ir_module->setDataLayout(dest_module.getDataLayout()); + + std::string error_info; + llvm::raw_string_ostream error_stream(error_info); + ARROW_RETURN_IF( + llvm::verifyModule(*src_ir_module, &error_stream), + Status::CodeGenError("verify of IR Module failed: " + error_stream.str())); + + ARROW_RETURN_IF(llvm::Linker::linkModules(dest_module, std::move(src_ir_module)), + Status::CodeGenError("failed to link IR Modules")); + + return Status::OK(); +} + +} // namespace + 
Status Engine::SetLLVMObjectCache(GandivaObjectCache& object_cache) { auto cached_buffer = object_cache.getObject(nullptr); if (cached_buffer) { @@ -348,27 +379,6 @@ Result> Engine::Make( return engine; } -static arrow::Status VerifyAndLinkModule( - llvm::Module& dest_module, - llvm::Expected> src_module_or_error) { - ARROW_ASSIGN_OR_RAISE( - auto src_ir_module, - AsArrowResult(src_module_or_error, "Failed to verify and link module: ")); - - src_ir_module->setDataLayout(dest_module.getDataLayout()); - - std::string error_info; - llvm::raw_string_ostream error_stream(error_info); - ARROW_RETURN_IF( - llvm::verifyModule(*src_ir_module, &error_stream), - Status::CodeGenError("verify of IR Module failed: " + error_stream.str())); - - ARROW_RETURN_IF(llvm::Linker::linkModules(dest_module, std::move(src_ir_module)), - Status::CodeGenError("failed to link IR Modules")); - - return Status::OK(); -} - llvm::Module* Engine::module() { DCHECK(!module_finalized_) << "module cannot be accessed after finalized"; return module_.get(); @@ -376,7 +386,7 @@ llvm::Module* Engine::module() { // Handling for pre-compiled IR libraries. Status Engine::LoadPreCompiledIR() { - auto const bitcode = llvm::StringRef(reinterpret_cast(kPrecompiledBitcode), + const auto bitcode = llvm::StringRef(reinterpret_cast(kPrecompiledBitcode), kPrecompiledBitcodeSize); /// Read from file into memory buffer. 
@@ -399,14 +409,14 @@ Status Engine::LoadPreCompiledIR() { } static llvm::MemoryBufferRef AsLLVMMemoryBuffer(const arrow::Buffer& arrow_buffer) { - auto const data = reinterpret_cast(arrow_buffer.data()); - auto const size = arrow_buffer.size(); + const auto data = reinterpret_cast(arrow_buffer.data()); + const auto size = arrow_buffer.size(); return {llvm::StringRef(data, size), "external_bitcode"}; } Status Engine::LoadExternalPreCompiledIR() { - auto const& buffers = function_registry_->GetBitcodeBuffers(); - for (auto const& buffer : buffers) { + const auto& buffers = function_registry_->GetBitcodeBuffers(); + for (const auto& buffer : buffers) { auto llvm_memory_buffer_ref = AsLLVMMemoryBuffer(*buffer); auto module_or_error = llvm::parseBitcodeFile(llvm_memory_buffer_ref, *context()); ARROW_RETURN_NOT_OK(VerifyAndLinkModule(*module_, std::move(module_or_error))); @@ -570,7 +580,7 @@ Result Engine::CompiledFunction(const std::string& function) { void Engine::AddGlobalMappingForFunc(const std::string& name, llvm::Type* ret_type, const std::vector& args, void* func) { - auto const prototype = llvm::FunctionType::get(ret_type, args, /*is_var_arg*/ false); + const auto prototype = llvm::FunctionType::get(ret_type, args, /*is_var_arg*/ false); llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage, name, module()); AddAbsoluteSymbol(*lljit_, name, func); } diff --git a/cpp/src/gandiva/exported_funcs_registry_test.cc b/cpp/src/gandiva/exported_funcs_registry_test.cc index 6941201e912..25660a37c8f 100644 --- a/cpp/src/gandiva/exported_funcs_registry_test.cc +++ b/cpp/src/gandiva/exported_funcs_registry_test.cc @@ -22,7 +22,7 @@ namespace gandiva { TEST(ExportedFuncsRegistry, RegistrationOnlyOnce) { gandiva::RegisterExportedFuncs(); - auto const& registered_list = ExportedFuncsRegistry::Registered(); + const auto& registered_list = ExportedFuncsRegistry::Registered(); EXPECT_EQ(registered_list.size(), 6); } } // namespace gandiva diff --git 
a/cpp/src/gandiva/external_c_functions.cc b/cpp/src/gandiva/external_c_functions.cc index fcba00aed35..7944cd018fd 100644 --- a/cpp/src/gandiva/external_c_functions.cc +++ b/cpp/src/gandiva/external_c_functions.cc @@ -27,7 +27,7 @@ size_t GetNumArgs(const gandiva::FunctionSignature& sig, auto num_args = 0; num_args += func.NeedsContext() ? 1 : 0; num_args += func.NeedsFunctionHolder() ? 1 : 0; - for (auto const& arg : sig.param_types()) { + for (const auto& arg : sig.param_types()) { num_args += arg->id() == arrow::Type::STRING ? 2 : 1; } num_args += sig.ret_type()->id() == arrow::Type::STRING ? 1 : 0; @@ -47,7 +47,7 @@ arrow::Result, llvm::Type*>> MapToLLVMSignatu if (func.NeedsFunctionHolder()) { arg_llvm_types.push_back(types->i64_type()); } - for (auto const& arg : sig.param_types()) { + for (const auto& arg : sig.param_types()) { arg_llvm_types.push_back(types->IRType(arg->id())); if (arg->id() == arrow::Type::STRING) { // string type needs an additional length argument @@ -65,10 +65,10 @@ arrow::Result, llvm::Type*>> MapToLLVMSignatu namespace gandiva { Status ExternalCFunctions::AddMappings(Engine* engine) const { - auto const& c_funcs = function_registry_->GetCFunctions(); - auto const types = engine->types(); + const auto& c_funcs = function_registry_->GetCFunctions(); + const auto types = engine->types(); for (auto& [func, func_ptr] : c_funcs) { - for (auto const& sig : func.signatures()) { + for (const auto& sig : func.signatures()) { ARROW_ASSIGN_OR_RAISE(auto llvm_signature, MapToLLVMSignature(sig, func, types)); auto& [args, ret_llvm_type] = llvm_signature; engine->AddGlobalMappingForFunc(func.pc_name(), ret_llvm_type, args, func_ptr); diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 0955a2e47fc..7ef9178d8d8 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -32,7 +32,9 @@ namespace gandiva { -static constexpr uint32_t kMaxFunctionSignatures = 2048; +namespace { 
+ +constexpr uint32_t kMaxFunctionSignatures = 2048; // encapsulates an llvm memory buffer in an arrow buffer // this is needed because we don't expose the llvm memory buffer to the outside world in @@ -48,6 +50,20 @@ class LLVMMemoryArrowBuffer : public arrow::Buffer { std::unique_ptr llvm_buffer_; }; +arrow::Result> GetBufferFromFile( + const std::string& bitcode_file_path) { + auto buffer_or_error = llvm::MemoryBuffer::getFile(bitcode_file_path); + + ARROW_RETURN_IF(!buffer_or_error, + Status::IOError("Could not load module from bitcode file: ", + bitcode_file_path + + " Error: " + buffer_or_error.getError().message())); + + return std::move(buffer_or_error.get()); +} + +} // namespace + FunctionRegistry::FunctionRegistry() { pc_registry_.reserve(kMaxFunctionSignatures); } FunctionRegistry::iterator FunctionRegistry::begin() const { @@ -64,7 +80,7 @@ FunctionRegistry::iterator FunctionRegistry::back() const { const NativeFunction* FunctionRegistry::LookupSignature( const FunctionSignature& signature) const { - auto const got = pc_registry_map_.find(&signature); + const auto got = pc_registry_map_.find(&signature); return got == pc_registry_map_.end() ? 
nullptr : got->second; } @@ -74,25 +90,13 @@ Status FunctionRegistry::Add(NativeFunction func) { kMaxFunctionSignatures); } pc_registry_.emplace_back(std::move(func)); - auto const& last_func = pc_registry_.back(); - for (auto const& func_signature : last_func.signatures()) { + const auto& last_func = pc_registry_.back(); + for (const auto& func_signature : last_func.signatures()) { pc_registry_map_.emplace(&func_signature, &last_func); } return arrow::Status::OK(); } -arrow::Result> GetBufferFromFile( - const std::string& bitcode_file_path) { - auto buffer_or_error = llvm::MemoryBuffer::getFile(bitcode_file_path); - - ARROW_RETURN_IF(!buffer_or_error, - Status::IOError("Could not load module from bitcode file: ", - bitcode_file_path + - " Error: " + buffer_or_error.getError().message())); - - return std::move(buffer_or_error.get()); -} - Status FunctionRegistry::Register(const std::vector& funcs, const std::string& bitcode_path) { ARROW_ASSIGN_OR_RAISE(auto llvm_buffer, GetBufferFromFile(bitcode_path)); @@ -114,7 +118,7 @@ arrow::Status FunctionRegistry::Register( std::optional function_holder_maker) { if (function_holder_maker.has_value()) { // all signatures should have the same base name, use the first signature's base name - auto const& func_base_name = func.signatures().begin()->base_name(); + const auto& func_base_name = func.signatures().begin()->base_name(); ARROW_RETURN_NOT_OK(holder_maker_registry_.Register( func_base_name, std::move(function_holder_maker).value())); } @@ -139,11 +143,11 @@ const FunctionHolderMakerRegistry& FunctionRegistry::GetFunctionHolderMakerRegis arrow::Result> MakeDefaultFunctionRegistry() { auto registry = std::make_shared(); - for (auto const& funcs : + for (const auto& funcs : {GetArithmeticFunctionRegistry(), GetDateTimeFunctionRegistry(), GetHashFunctionRegistry(), GetMathOpsFunctionRegistry(), GetStringFunctionRegistry(), GetDateTimeArithmeticFunctionRegistry()}) { - for (auto const& func_signature : funcs) { + for (const 
auto& func_signature : funcs) { ARROW_RETURN_NOT_OK(registry->Add(func_signature)); } } diff --git a/cpp/src/gandiva/function_signature.cc b/cpp/src/gandiva/function_signature.cc index 43064b6686f..136afca2d94 100644 --- a/cpp/src/gandiva/function_signature.cc +++ b/cpp/src/gandiva/function_signature.cc @@ -35,6 +35,8 @@ using arrow::internal::hash_combine; namespace gandiva { +namespace { + bool DataTypeEquals(const DataTypePtr& left, const DataTypePtr& right) { if (left->id() == right->id()) { switch (left->id()) { @@ -53,6 +55,8 @@ bool DataTypeEquals(const DataTypePtr& left, const DataTypePtr& right) { } } +} // namespace + FunctionSignature::FunctionSignature(std::string base_name, DataTypeVector param_types, DataTypePtr ret_type) : base_name_(std::move(base_name)), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 7a47f7491a4..dff15e6fd29 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -26,7 +26,7 @@ #include "arrow/util/base64.h" #include "arrow/util/bit_util.h" -#include "arrow/util/double_conversion.h" +#include "arrow/util/double_conversion_internal.h" #include "arrow/util/value_parsing.h" #include "gandiva/encrypt_utils.h" @@ -41,6 +41,8 @@ extern "C" { +ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING + static char mask_array[256] = { (char)0, (char)1, (char)2, (char)3, (char)4, (char)5, (char)6, (char)7, (char)8, (char)9, (char)10, (char)11, (char)12, (char)13, (char)14, (char)15, @@ -843,6 +845,8 @@ const char* gdv_mask_show_last_n_utf8_int32(int64_t context, const char* data, int32_t n_to_mask = num_of_chars - n_to_show; return gdv_mask_first_n_utf8_int32(context, data, data_len, n_to_mask, out_len); } + +ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING } namespace gandiva { diff --git a/cpp/src/gandiva/gdv_hash_function_stubs.cc b/cpp/src/gandiva/gdv_hash_function_stubs.cc index aac70a06be6..41eef324450 100644 --- a/cpp/src/gandiva/gdv_hash_function_stubs.cc +++ 
b/cpp/src/gandiva/gdv_hash_function_stubs.cc @@ -27,6 +27,8 @@ extern "C" { +ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING + #define MD5_HASH_FUNCTION(TYPE) \ GANDIVA_EXPORT \ const char* gdv_fn_md5_##TYPE(int64_t context, gdv_##TYPE value, bool validity, \ @@ -212,6 +214,8 @@ const char* gdv_fn_sha1_decimal128(int64_t context, int64_t x_high, uint64_t x_l const gandiva::BasicDecimal128 decimal_128(x_high, x_low); return gandiva::gdv_sha1_hash(context, decimal_128.ToBytes().data(), 16, out_length); } + +ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING } namespace gandiva { diff --git a/cpp/src/gandiva/gdv_string_function_stubs.cc b/cpp/src/gandiva/gdv_string_function_stubs.cc index 17eefbe22e3..e7982461b43 100644 --- a/cpp/src/gandiva/gdv_string_function_stubs.cc +++ b/cpp/src/gandiva/gdv_string_function_stubs.cc @@ -23,7 +23,7 @@ #include #include -#include "arrow/util/double_conversion.h" +#include "arrow/util/double_conversion_internal.h" #include "arrow/util/utf8_internal.h" #include "arrow/util/value_parsing.h" @@ -35,6 +35,8 @@ extern "C" { +ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING + bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len) { gandiva::LikeHolder* holder = reinterpret_cast(ptr); @@ -167,7 +169,7 @@ CAST_VARLEN_TYPE_FROM_NUMERIC(VARBINARY) GDV_FORCE_INLINE void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) { - char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string"; + const char* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string"; int size = static_cast(strlen(fmt)) + 64; char* error = reinterpret_cast(malloc(size)); snprintf(error, size, fmt, (unsigned char)val); @@ -755,6 +757,8 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in *out_len = result_len; return result; } + +ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING } namespace gandiva { diff --git a/cpp/src/gandiva/llvm_generator_test.cc 
b/cpp/src/gandiva/llvm_generator_test.cc index 79654e7b78c..b3aa0465e6c 100644 --- a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -107,7 +107,7 @@ TEST_F(TestLLVMGenerator, TestAdd) { SelectionVector::MODE_NONE)); ASSERT_OK(generator->engine_->FinalizeModule()); - auto const& ir = generator->engine_->ir(); + const auto& ir = generator->engine_->ir(); EXPECT_THAT(ir, testing::HasSubstr("vector.body")); ASSERT_OK_AND_ASSIGN(auto fn_ptr, generator->engine_->CompiledFunction(fn_name)); diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index 3541e5e4040..447989b52e0 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -56,7 +56,11 @@ class GANDIVA_EXPORT LLVMTypes { llvm::Type* double_type() { return llvm::Type::getDoubleTy(context_); } llvm::PointerType* ptr_type(llvm::Type* type) { +#if LLVM_VERSION_MAJOR >= 21 + return llvm::PointerType::get(context_, 0); +#else return llvm::PointerType::get(type, 0); +#endif } llvm::PointerType* i8_ptr_type() { return ptr_type(i8_type()); } @@ -117,7 +121,7 @@ class GANDIVA_EXPORT LLVMTypes { std::vector GetSupportedArrowTypes() { std::vector retval; - for (auto const& element : arrow_id_to_llvm_type_map_) { + for (const auto& element : arrow_id_to_llvm_type_map_) { retval.push_back(element.first); } return retval; diff --git a/cpp/src/gandiva/make_precompiled_bitcode.py b/cpp/src/gandiva/make_precompiled_bitcode.py index 97d96f8a878..48ed040f065 100644 --- a/cpp/src/gandiva/make_precompiled_bitcode.py +++ b/cpp/src/gandiva/make_precompiled_bitcode.py @@ -37,8 +37,8 @@ def apply_template(template, data): if __name__ == "__main__": if len(sys.argv) != 4: - raise ValueError("Usage: {0}