diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..a1c6287841 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,28 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. +2. +3. +4. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**System (please complete the following information):** + - OS: [e.g. RHEL8.6] + - Hardware [e.g. Intel Xeon Ice Lake, 64GB, NVMe] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..da2327a4c6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,22 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +Owner: + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/speedb-b.gif b/.github/speedb-b.gif new file mode 100644 index 0000000000..f0452a44da Binary files /dev/null and b/.github/speedb-b.gif differ diff --git a/.github/speedb-bee-dark.gif b/.github/speedb-bee-dark.gif new file mode 100644 index 0000000000..fdac8208ee Binary files /dev/null and b/.github/speedb-bee-dark.gif differ diff --git a/.github/speedb-bee.gif b/.github/speedb-bee.gif new file mode 100644 index 0000000000..99d791de5e Binary files /dev/null and b/.github/speedb-bee.gif differ diff --git a/.github/speedb-logo-dark.gif b/.github/speedb-logo-dark.gif new file mode 100644 index 0000000000..4867858ebc Binary files /dev/null and b/.github/speedb-logo-dark.gif differ diff --git a/.github/speedb-logo.gif b/.github/speedb-logo.gif new file mode 100644 index 0000000000..93dc9e5322 Binary files /dev/null and b/.github/speedb-logo.gif differ diff --git a/.github/workflows/artifact-release.yml b/.github/workflows/artifact-release.yml new file mode 100644 index 0000000000..2768779e75 --- /dev/null +++ b/.github/workflows/artifact-release.yml @@ -0,0 +1,112 @@ +name: Create release artifacts + +on: + push: + tags: + - 'speedb/v*' + +permissions: + contents: write # Needed for release assets upload + id-token: write # Needed for AWS credentials setting + +jobs: + build: + runs-on: [self-hosted, ubuntu, asrunner] + + container: + image: centos:7.9.2009 + + steps: + - name: pre + run: | + yum install -y centos-release-scl epel-release + yum install -y make devtoolset-11-gcc-c++ \ + coreutils wget unzip which git python3 openssl openssl-devel \ + libzstd-devel lz4-devel snappy-devel zlib-devel \ + java-1.8.0-openjdk-devel + echo "PATH=/opt/rh/devtoolset-11/root/usr/bin:${PATH}" >> $GITHUB_ENV + echo "RELEASE_VERSION=${GITHUB_REF_NAME#speedb/v}" >> $GITHUB_ENV + + - name: Install CMake + run: | + CMAKE_RELEASE=3.20.1 + wget 
https://github.com/Kitware/CMake/releases/download/v${CMAKE_RELEASE}/cmake-${CMAKE_RELEASE}.tar.gz + tar xf cmake-${CMAKE_RELEASE}.tar.gz + cd cmake-${CMAKE_RELEASE} + ./bootstrap + make -j$(nproc) && make install + cd .. && rm -rf cmake-${CMAKE_RELEASE}* + + - name: Install awscli + run: | + wget "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -O "awscliv2.zip" + unzip awscliv2.zip + ./aws/install + rm -rf aws awscliv2.zip + + - uses: actions/checkout@v3 + + - run: mkdir "$GITHUB_WORKSPACE/out" + + - name: Build and package release libraries + run: | + rm -rf build && mkdir build && cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DPORTABLE=1 -DWITH_GFLAGS=0 -DWITH_SNAPPY=1 -DWITH_LZ4=1 -DWITH_ZLIB=1 -DWITH_ZSTD=1 + mkdir -p "$GITHUB_WORKSPACE/out/root" + DESTDIR="$GITHUB_WORKSPACE/out/root" make -j$(nproc) install + ( cd "$GITHUB_WORKSPACE/out/root" && tar czf ../speedb-${RELEASE_VERSION}.tar.gz . ) + rm -rf "$GITHUB_WORKSPACE/out/root" + cd .. && rm -rf build + + - name: Build release Jar + run: | + make clean + LIB_MODE=static DEBUG_LEVEL=0 PORTABLE=1 JAVA_HOME=/usr/lib/jvm/java-openjdk make -j$(nproc) rocksdbjavastatic + cp "java/target/speedbjni-${RELEASE_VERSION}-linux64.jar" "$GITHUB_WORKSPACE/out" + + - name: Build db_bench + run: | + yum install -y gflags-devel + rm -rf build && mkdir build && cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DPORTABLE=1 -DWITH_GFLAGS=1 \ + -DWITH_SNAPPY=1 -DWITH_LZ4=1 -DWITH_ZLIB=1 -DWITH_ZSTD=1 \ + -DWITH_BENCHMARK_TOOLS=1 -DROCKSDB_BUILD_SHARED=1 + make -j$(nproc) db_bench + tar czf "$GITHUB_WORKSPACE/out/db_bench-speedb-${RELEASE_VERSION}.tar.gz" db_bench + cd .. && rm -rf build + + - name: Generate checksums + run: | + for f in $GITHUB_WORKSPACE/out/*; do + sha256sum "$f" > "$f.sha256" + done + + - name: Get release date + run: | + echo "RELEASE_DATE=$(git for-each-ref "--format=%(creatordate:short)" "refs/tags/${GITHUB_REF_NAME}")" >> $GITHUB_ENV + + - name: Release + uses: softprops/action-gh-release@v1 + if: startsWith(github.ref, 'refs/tags/') + with: + generate_release_notes: false + name: Speedb ${{ env.RELEASE_VERSION }} (${{ env.RELEASE_DATE }}) + files: | + out/db_bench-speedb-${{ env.RELEASE_VERSION }}.tar.gz + out/db_bench-speedb-${{ env.RELEASE_VERSION }}.tar.gz.sha256 + out/speedb-${{ env.RELEASE_VERSION }}.tar.gz + out/speedb-${{ env.RELEASE_VERSION }}.tar.gz.sha256 + out/speedbjni-${{ env.RELEASE_VERSION }}-linux64.jar + out/speedbjni-${{ env.RELEASE_VERSION }}-linux64.jar.sha256 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: eu-west-2 + + - name: Upload artifacts to S3 + run: | + aws s3 cp "$GITHUB_WORKSPACE/out" "s3://spdb-github-artifacts/release-${RELEASE_VERSION}" --recursive + rm -rf "$GITHUB_WORKSPACE/out" diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml new file mode 100644 index 0000000000..4f6ad5ab3e --- /dev/null +++ b/.github/workflows/ci_pipeline.yml @@ -0,0 +1,110 @@ +name: CI + +on: + #push: + workflow_dispatch: + workflow_call: + pull_request_review: + types: [submitted] + + +permissions: write-all + +jobs: + #Sanity: + #uses: speedb-io/speedb/.github/workflows/sanity_check.yml@main + + Build: + #needs: [Sanity] + if: ${{ github.event.review.state == 'approved' || github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/heads/release') }} + runs-on: [self-hosted, ubuntu, 
asrunner] + strategy: + matrix: + include: + - name: verify build + command: cmake .. -GNinja + - name: optimized build + command: cmake .. -DCMAKE_BUILD_TYPE=Release -GNinja + - name: clang build + command: CC=clang CXX=clang++ cmake .. -GNinja + container: + image: alpine:3.14 + + steps: + - name: Pre-build + run: | + env + rm -rf /usr/share/dotnet || echo "" + df -h + apk add git + echo "nameserver 8.8.8.8" > /etc/resolv.conf + apk add bash python3 py3-pip clang clang-extra-tools shellcheck gcc g++ cmake ninja ccache \ + openjdk10 gflags-dev snappy-dev lz4-dev bzip2-dev zstd-dev zlib-dev linux-headers openssh-client tar + python3 -m pip install lint-diffs flake8 + + - name: Checkout + uses: actions/checkout@v3 + + - name: Prepare ccache timestamp + id: ccache_cache_timestamp + shell: cmake -P {0} + run: | + string(TIMESTAMP current_date "%Y-%m-%d-%H;%M;%S" UTC) + message("::set-output name=timestamp::${current_date}") + + - name: ccache cache files + uses: actions/cache@v2 + with: + path: ~/.ccache + key: ${{runner.os}}-ccache-${{steps.ccache_cache_timestamp.outputs.timestamp}} + restore-keys: | + ${{runner.os}}-ccache- + + - name: ${{ matrix.name }} + run: | + if [ -d "$GITHUB_WORKSPACE/build" ]; then + echo >&2 "error: the build directory should not exist" && false + fi + if [ -d "~/.ccache" ]; then + echo "Already exists" + else + mkdir -p ~/.ccache + ls ~ | grep cache || echo "" + touch ~/.ccache/ccache.txt + echo "aaa" > ~/.ccache/ccache.txt + ls ~/.ccache + cat ~/.ccache/ccache.txt + fi + mkdir -p "$GITHUB_WORKSPACE/build" + cd "$GITHUB_WORKSPACE/build" + export "CCACHE_BASEDIR=$HOME" + export "CCACHE_DIR=$HOME/.ccache" + export "CCACHE_COMPILERCHECK=content" + ${{ matrix.command }} -DPORTABLE=1 -DWITH_GFLAGS=1 \ + -DWITH_ZLIB=1 -DWITH_SNAPPY=1 -DWITH_BZ2=1 -DWITH_LZ4=1 -DWITH_ZSTD=1 \ + -DWITH_JNI=1 -DJAVA_HOME=/usr/lib/jvm/default-jvm \ + -DWITH_BENCHMARK_TOOLS=1 -DWITH_CORE_TOOLS=1 -DWITH_TOOLS=1 \ + -DWITH_TESTS=1 -DWITH_ALL_TESTS=1 -DWITH_EXAMPLES=1 + ninja + + #Performance: + #if: ${{ github.event.review.state == 'approved' || github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/heads/release') }} + #needs: [Build] + #uses: speedb-io/speedb/.github/workflows/perf-test.yml@main + + QA-Tests: + if: ${{ github.event.review.state == 'approved' || github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/heads/release') }} + needs: [Build] + uses: speedb-io/speedb/.github/workflows/qa-tests.yml@main + + Fuzz: + if: ${{ github.event.review.state == 'approved' || github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/heads/release') }} + needs: [Build] + uses: ./.github/workflows/test_fuzz.yml + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/new_release_line.yml b/.github/workflows/new_release_line.yml new file mode 100644 index 0000000000..b835b7a9c7 --- /dev/null +++ b/.github/workflows/new_release_line.yml @@ -0,0 +1,106 @@ +name: New Release Line + +on: + workflow_dispatch: + inputs: + new_branch_major: + description: "Next release Major version (LEAVE EMPTY FOR AUTO-INCREMENT)" + required: false + new_branch_minor: + description: "Next release Minor version (LEAVE EMPTY FOR AUTO-INCREMENT)" + required: false + branches: + - main + - 'release/*' + +permissions: + contents: read + +jobs: + tag_version: + runs-on: [self-hosted, ubuntu, asrunner] + container: + image: alpine:3.14 + + env: + VERSION_FILE: 
speedb/version.h + + steps: + - name: pre + run: | + echo "nameserver 8.8.8.8" > /etc/resolv.conf + apk add git openssh-client + + - name: Verify chosen version + run: | + if ! echo "${{ inputs.new_branch_major }}" | grep -q "^[0-9]*$"; then + echo >&2 "error: major version must be a positive number" && false + fi + if ! echo "${{ inputs.new_branch_minor }}" | grep -q "^[0-9]*$"; then + echo >&2 "error: minor version must be a positive number" && false + fi + + if [ "${{ inputs.new_branch_major }}${{ inputs.new_branch_minor }}" != "" ] && [ "$GITHUB_REF" != "refs/heads/main" ]; then + echo >&2 "error: cannot cut a major or a minor release from a branch that isn't main" && false + elif [ "$GITHUB_REF" != "refs/heads/main" ] && ! echo "$GITHUB_REF" | grep -q "^refs/heads/release/"; then + echo "error: cannot cut a patch release from a non-release branch" && false + fi + + - uses: actions/checkout@v3 + with: + ssh-key: ${{ secrets.RELEASE_SSH_KEY }} + + - name: Calculate new version + run: | + major=$(grep '_MAJOR\s\+[0-9]\+' "$VERSION_FILE" | sed 's/[^0-9]\+//') + minor=$(grep '_MINOR\s\+[0-9]\+' "$VERSION_FILE" | sed 's/[^0-9]\+//') + + if [ -n "${{ inputs.new_branch_major }}" ] && [ "${{ inputs.new_branch_major }}" -lt "$major" ]; then + echo >&2 "error: the chosen major version is lower than current one" && false + elif [ -n "${{ inputs.new_branch_major }}" ] && [ "${{ inputs.new_branch_major }}" -gt "$major" ]; then + major=${{ inputs.new_branch_major }} + if [ -n "${{ inputs.new_branch_minor }}" ] && [ "${{ inputs.new_branch_minor }}" -ne 0 ]; then + echo >&2 "error: cannot bump minor version when bumping major version" && false + fi + minor=0 + patch=0 + elif [ -n "${{ inputs.new_branch_minor }}" ] && [ "${{ inputs.new_branch_minor }}" -lt "$minor" ]; then + echo >&2 "error: the chosen minor version is lower than current one" && false + elif [ -n "${{ inputs.new_branch_minor }}" ] && [ "${{ inputs.new_branch_minor }}" -gt "$minor" ]; then + minor=${{ inputs.new_branch_minor }} + patch=0 + elif [ "$GITHUB_REF" = "refs/heads/main" ]; then + minor=$(( $minor + 1 )) + patch=0 + else + patch=$(( $(grep '_PATCH\s\+[0-9]\+' "$VERSION_FILE" | sed 's/[^0-9]\+//') + 1 )) + fi + + echo "major=$major" >> $GITHUB_ENV + echo "minor=$minor" >> $GITHUB_ENV + echo "patch=$patch" >> $GITHUB_ENV + + - name: Update version.h + run: | + git config user.name "GitHub Runner Bot" + git config user.email "<>" + + sed -i -e "s/\(#define [^\s]\+_MAJOR\s\+\)[0-9]\+/\1${major}/" "$VERSION_FILE" + sed -i -e "s/\(#define [^\s]\+_MINOR\s\+\)[0-9]\+/\1${minor}/" "$VERSION_FILE" + sed -i -e "s/\(#define [^\s]\+_PATCH\s\+\)[0-9]\+/\1${patch}/" "$VERSION_FILE" + + git add "$VERSION_FILE" + git commit -m "release: publish version ${major}.${minor}.${patch}" + git push origin ${GITHUB_REF#refs/heads/} + + - name: Tag and release + run: | + # Create a branch if it's a major or a minor release + if [ "$patch" -eq 0 ]; then + git checkout -b "release/${major}.${minor}" + git push -u origin "release/${major}.${minor}" + fi + + # Create a tag for the release + git tag "speedb/v${major}.${minor}.${patch}" + git push origin "speedb/v${major}.${minor}.${patch}" diff --git a/.github/workflows/perf-test.yml b/.github/workflows/perf-test.yml new file mode 100644 index 0000000000..1395070382 --- /dev/null +++ b/.github/workflows/perf-test.yml @@ -0,0 +1,21 @@ +name: Performance Test + +on: + workflow_call: + workflow_dispatch: + + +jobs: + perf_test: + runs-on: perftest + + steps: + + - name: Run autoperf script via remotnic + run: | 
+ echo Run auto perf test + #echo ${{ github.sender.login }} + #ssh -i ~/remo.k remo@9.148.1.183 /home/remo/tremotnic.sh main HG_auto_T1 ${{ github.event.pusher.name }} + ssh -i ~/remo.k remo@9.148.1.183 /home/remo/tremotnic.sh main HG_auto_T1 ${{ github.actor }} run_db_bench_large_obj + #ssh -i ~/remo.k remo@9.148.1.183 /home/remo/tremotnic.sh main HG_auto_T1 ${{ github.event.pusher.name }} run_db_bench_small_obj + #ssh -i ~/remo.k remo@9.148.1.183 /home/remo/tremotnic.sh main HG_auto_T1 ${{ github.event.pusher.name }} run_db_bench_huge_memtable diff --git a/.github/workflows/qa-tests.yml b/.github/workflows/qa-tests.yml new file mode 100644 index 0000000000..0756b5b49b --- /dev/null +++ b/.github/workflows/qa-tests.yml @@ -0,0 +1,56 @@ +name: QA Tests + +on: + workflow_dispatch: + workflow_call: + +env: + GTEST_COLOR: 1 + GTEST_THROW_ON_FAILURE: 0 + SKIP_FORMAT_BUCK_CHECKS: 1 + +jobs: + test: + runs-on: [self-hosted, ubuntu, asrunner] + container: + image: ubuntu:18.04 + volumes: + - /var/tmp:/var/tmp # Needed for env_test's IoctlFriendlyTmpdir + - /tmp:/tmp # Needed for running tests on non-overlayfs (can't use /dev/shm because there's not enough RAM on the runner) + strategy: + matrix: + include: + - name: Unit tests + short_test: TMPD="$(mktemp -d /tmp/speedb.XXXX)" make -j$(nproc) check + long_test: TMPD="$(mktemp -d /tmp/speedb.XXXX)" make -j$(nproc) check + - name: black-box + short_test: CRASH_TEST_EXT_ARGS="--duration=3600" make -j$(nproc) blackbox_asan_crash_test + long_test: CRASH_TEST_EXT_ARGS="--duration=10000" make -j$(nproc) blackbox_asan_crash_test + - name: white-box + short_test: CRASH_TEST_EXT_ARGS="--duration=3600" make -j$(nproc) whitebox_asan_crash_test + long_test: CRASH_TEST_EXT_ARGS="--duration=10000" make -j$(nproc) whitebox_asan_crash_test + + steps: + - name: Network hotfix + run: echo "nameserver 8.8.8.8" > /etc/resolv.conf + + - name: Pre + run: | + apt update -y + apt install -y build-essential clang-format parallel libgflags-dev liblz4-dev libsnappy-dev libzstd-dev python3 python3-pip curl + + - name: Checkout + uses: actions/checkout@v3 + + - name: ${{ matrix.name }} + run: | + case "$GITHUB_REF_NAME" in + release/*) + echo "Running long test for release, $(nproc) jobs" + make clean && ${{ matrix.long_test }} + ;; + *) + echo "Running short test, $(nproc) jobs" + make clean && ${{ matrix.short_test }} + ;; + esac diff --git a/.github/workflows/sanity_check.yml b/.github/workflows/sanity_check.yml index f16b79d4d6..52c4213a60 100644 --- a/.github/workflows/sanity_check.yml +++ b/.github/workflows/sanity_check.yml @@ -1,10 +1,17 @@ name: Check buck targets and code format -on: [push, pull_request] +on: [push, pull_request, workflow_call, workflow_dispatch] jobs: check: name: Check TARGETS file and code format - runs-on: ubuntu-latest + #runs-on: ubuntu:20.04 + runs-on: [self-hosted, ubuntu, asrunner] + container: + image: ubuntu:latest steps: + + - name: pre + run: apt update && apt install -y sudo git make clang build-essential clang-format + - name: Checkout feature branch uses: actions/checkout@v2 with: @@ -12,16 +19,18 @@ jobs: - name: Fetch from upstream run: | - git remote add upstream https://github.com/facebook/rocksdb.git && git fetch upstream - + git remote add upstream https://github.com/speedb-io/speedb.git && git fetch upstream + git config --global --add safe.directory $GITHUB_WORKSPACE - name: Where am I run: | echo git status && git status echo "git remote -v" && git remote -v echo git branch && git branch - - name: Setup Python - uses: 
actions/setup-python@v1 + uses: actions/setup-python@v4 + with: + python-version: '3.x' + architecture: 'x64' - name: Install Dependencies run: python -m pip install --upgrade pip diff --git a/.github/workflows/test_fuzz.yml b/.github/workflows/test_fuzz.yml new file mode 100644 index 0000000000..96a410f03c --- /dev/null +++ b/.github/workflows/test_fuzz.yml @@ -0,0 +1,71 @@ +name: Fuzz Test + +on: + workflow_dispatch: + workflow_call: + secrets: + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true + +jobs: + Fuzz: + runs-on: [self-hosted, ubuntu, asrunner] + container: + image: ubuntu:18.04 + strategy: + matrix: + include: + - name: db_fuzzer + - name: db_map_fuzzer + + steps: + - name: Pre-build + run: | + echo "nameserver 8.8.8.8" > /etc/resolv.conf + apt update && apt install -y sudo python3 git clang-tools cmake make automake ucommon-utils libtool gettext pkg-config build-essential clang-10 zlib1g-dev libbz2-dev ninja-build liblzma-dev autoconf libsnappy-dev libzstd-dev liblz4-dev binutils m4 g++-10 unzip + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install + + - uses: actions/checkout@v3 + + - name: Configure AWS credentials from Test account + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: eu-west-2 + + - name: ${{ matrix.name }} + run: | + echo 'git clone https://github.com/google/libprotobuf-mutator.git \n + cd libprotobuf-mutator \n + git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f \n + cd .. \n + export CC=clang && export CXX=clang++ && mkdir LPM && cd LPM \n + ln -s /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libstdc++.so \n + ln -s /usr/bin/clang-10 /usr/bin/clang \n + ln -s /usr/bin/clang++-10 /usr/bin/clang++ \n + cmake ../libprotobuf-mutator -GNinja -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON -DLIB_PROTO_MUTATOR_TESTING=OFF -DCMAKE_BUILD_TYPE=Release \n + ninja \n + ninja install \n + export PKG_CONFIG_PATH=$PWD:$PWD/external.protobuf/lib/pkgconfig/ \n + export PATH=$PWD/external.protobuf/bin:$PATH \n + cd $GITHUB_WORKSPACE \n + COMPILE_WITH_ASAN=1 PORTABLE=1 make -j$(nproc) static_lib \n + cd $GITHUB_WORKSPACE/fuzz \n + make ${{ matrix.name }} \n + ls -alFh $GITHUB_WORKSPACE/fuzz/ \n + echo ASAN_OPTIONS=detect_leaks=0 ./db_fuzzer \n' > prepfuz.sh + chmod +x prepfuz.sh + bash -xv prepfuz.sh + mkdir -p $GITHUB_WORKSPACE/out/ + ASAN_OPTIONS=detect_odr_violation=0 $GITHUB_WORKSPACE/fuzz/${{ matrix.name }} 2>&1 | tee $GITHUB_WORKSPACE/out/${{ matrix.name }}.log + tail -20 $GITHUB_WORKSPACE/out/${{ matrix.name }}.log | grep "==AddressSanitizer. Thread limit (4194304 threads) exceeded\. Dying\." || { echo "${{ matrix.name }} failed!" 
&& false; } + + - name: Copy ${{ matrix.name }} logs to S3 due to failure + if: steps.${{ matrix.name }}.outcome == 'failure' + run: | + aws s3 cp $GITHUB_WORKSPACE/out/${{ matrix.name }}.log s3://spdb-github-ci/$GITHUB_SHA-${{ matrix.name }}.log diff --git a/.gitignore b/.gitignore index 7d6f61affa..78683879d9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ make_config.mk +test_config.mk rocksdb.pc *.a @@ -30,6 +31,7 @@ rocksdb.pc CMakeCache.txt CMakeFiles/ build/ +.cache/ ldb manifest_dump @@ -50,6 +52,8 @@ tags etags rocksdb_dump rocksdb_undump +speedb_dump +speedb_undump db_test2 trace_analyzer trace_analyzer_test diff --git a/AUTHORS b/AUTHORS index a451875f1a..e0a9592c35 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,3 +1,5 @@ +Speedb Ltd. + Facebook Inc. Facebook Engineering Team diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d400462e8..07ce43ae18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ # Run the build commands from within the Developer Command Prompt window to have paths to the compiler and runtime libraries set. # You must have git.exe in your %PATH% environment variable. # -# To build Rocksdb for Windows is as easy as 1-2-3-4-5: +# To build Speedb for Windows is as easy as 1-2-3-4-5: # # 1. Update paths to third-party libraries in thirdparty.inc file # 2. Create a new directory for build artifacts @@ -17,13 +17,13 @@ # sample command: cmake -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Release -DWITH_GFLAGS=1 -DWITH_SNAPPY=1 -DWITH_JEMALLOC=1 -DWITH_JNI=1 .. # 4. Then build the project in debug mode (you may want to add /m[:] flag to run msbuild in parallel threads # or simply /m to use all avail cores) -# msbuild rocksdb.sln +# msbuild speedb.sln # -# rocksdb.sln build features exclusions of test only code in Release. If you build ALL_BUILD then everything +# speedb.sln build features exclusions of test only code in Release. If you build ALL_BUILD then everything # will be attempted but test only code does not build in Release mode. # # 5. 
And release mode (/m[:] is also supported) -# msbuild rocksdb.sln /p:Configuration=Release +# msbuild speedb.sln /p:Configuration=Release # # Linux: # @@ -35,11 +35,11 @@ cmake_minimum_required(VERSION 3.10) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") -include(ReadVersion) +include(ReadSpeedbVersion) include(GoogleTest) -get_rocksdb_version(rocksdb_VERSION) -project(rocksdb - VERSION ${rocksdb_VERSION} +get_speedb_version(speedb_VERSION) +project(speedb + VERSION ${speedb_VERSION} LANGUAGES CXX C ASM) if(POLICY CMP0042) @@ -56,11 +56,17 @@ if(NOT CMAKE_BUILD_TYPE) "Default BUILD_TYPE is ${default_build_type}" FORCE) endif() -find_program(CCACHE_FOUND ccache) -if(CCACHE_FOUND) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) -endif(CCACHE_FOUND) +find_program(SCCACHE_FOUND sccache) +if(SCCACHE_FOUND) + set(CMAKE_C_COMPILER_LAUNCHER sccache CACHE STRING "C_LANUCHER is sccache" FORCE) + set(CMAKE_CXX_COMPILER_LAUNCHER sccache CACHE STRING "CXX_LANUCHER is sccache" FORCE) +else() + find_program(CCACHE_FOUND ccache) + if(CCACHE_FOUND) + set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "C_LANUCHER is ccache" FORCE) + set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "CXX_LANUCHER is ccache" FORCE) + endif(CCACHE_FOUND) +endif() option(WITH_JEMALLOC "build with JeMalloc" OFF) option(WITH_LIBURING "build with liburing" ON) @@ -123,6 +129,9 @@ else() find_package(gflags REQUIRED) set(GFLAGS_LIB gflags::gflags) endif() + if(DEFINED gflags_VERSION AND gflags_VERSION MATCHES "^2\.1\.[0-9]+") + add_definitions(-DGFLAGS_NAMESPACE=gflags) + endif() include_directories(${GFLAGS_INCLUDE_DIR}) list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB}) add_definitions(-DGFLAGS=1) @@ -481,7 +490,7 @@ if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-memcmp") endif() -option(ROCKSDB_LITE "Build RocksDBLite version" OFF) +option(ROCKSDB_LITE "Build LITE version" OFF) if(ROCKSDB_LITE) add_definitions(-DROCKSDB_LITE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -Os") @@ -898,26 +907,77 @@ list(APPEND SOURCES utilities/transactions/lock/range/range_tree/lib/util/dbt.cc utilities/transactions/lock/range/range_tree/lib/util/memarena.cc) +if (ROCKSDB_PLUGINS) + separate_arguments(ROCKSDB_PLUGINS) +endif() +if (NOT ROCKSDB_PLUGINS OR NOT "speedb" IN_LIST ROCKSDB_PLUGINS) + list(APPEND ROCKSDB_PLUGINS speedb) +endif() +set(ROCKSDB_PLUGIN_EXTERNS "") +set(ROCKSDB_PLUGIN_BUILTINS "") message(STATUS "ROCKSDB_PLUGINS: ${ROCKSDB_PLUGINS}") -if ( ROCKSDB_PLUGINS ) - string(REPLACE " " ";" PLUGINS ${ROCKSDB_PLUGINS}) - foreach (plugin ${PLUGINS}) - add_subdirectory("plugin/${plugin}") +if( ROCKSDB_PLUGINS ) + foreach (plugin ${ROCKSDB_PLUGINS}) + set(plugin_root "plugin/${plugin}/") + add_subdirectory(${plugin_root}) + # Use get_directory_property() to avoid having to declare the variables + # with PARENT_SCOPE in the plugin CMakeLists.txt + # TODO: Change the plugin support here so that a plugin would simply define + # a target that we'll link to. 
+ get_directory_property(${plugin}_SOURCES + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_SOURCES) + get_directory_property(${plugin}_COMPILE_FLAGS + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_COMPILE_FLAGS) foreach (src ${${plugin}_SOURCES}) - list(APPEND SOURCES plugin/${plugin}/${src}) + list(APPEND SOURCES ${plugin_root}/${src}) + set_source_files_properties( + ${plugin_root}/${src} + PROPERTIES COMPILE_FLAGS "${${plugin}_COMPILE_FLAGS}") + endforeach() + get_directory_property(${plugin}_TESTS + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_TESTS) + foreach (test ${${plugin}_TESTS}) + list(APPEND PLUGIN_TESTS ${plugin_root}/${test}) set_source_files_properties( - plugin/${plugin}/${src} + ${plugin_root}/${test} PROPERTIES COMPILE_FLAGS "${${plugin}_COMPILE_FLAGS}") endforeach() + + get_directory_property(${plugin}_INCLUDE_PATHS + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_INCLUDE_PATHS) foreach (path ${${plugin}_INCLUDE_PATHS}) include_directories(${path}) endforeach() + get_directory_property(${plugin}_LIBS + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_LIBS) foreach (lib ${${plugin}_LIBS}) list(APPEND THIRDPARTY_LIBS ${lib}) endforeach() + get_directory_property(${plugin}_LINK_PATHS + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_LINK_PATHS) foreach (link_path ${${plugin}_LINK_PATHS}) link_directories(AFTER ${link_path}) endforeach() + get_directory_property(${plugin}_FUNC + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_FUNC) + string(STRIP "${${plugin}_FUNC}" ${plugin}_FUNC) + if (NOT "${plugin}_FUNC" STREQUAL "") + string(APPEND ROCKSDB_PLUGIN_BUILTINS "{\"${plugin}\", ${${plugin}_FUNC} },") + string(APPEND ROCKSDB_PLUGIN_EXTERNS "int ${${plugin}_FUNC} (ROCKSDB_NAMESPACE::ObjectLibrary&, const std::string&); ") + endif() + get_directory_property(${plugin}_CMAKE_SHARED_LINKER_FLAGS + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_CMAKE_SHARED_LINKER_FLAGS) + get_directory_property(${plugin}_CMAKE_EXE_LINKER_FLAGS + DIRECTORY ${plugin_root} + DEFINITION ${plugin}_CMAKE_EXE_LINKER_FLAGS) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${${plugin}_CMAKE_SHARED_LINKER_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${${plugin}_CMAKE_EXE_LINKER_FLAGS}") endforeach() @@ -974,10 +1034,10 @@ if(USE_FOLLY) third-party/folly/folly/ScopeGuard.cpp) endif() -set(ROCKSDB_STATIC_LIB rocksdb${ARTIFACT_SUFFIX}) -set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX}) +set(ROCKSDB_STATIC_LIB ${PROJECT_NAME}${ARTIFACT_SUFFIX}) +set(ROCKSDB_SHARED_LIB ${PROJECT_NAME}-shared${ARTIFACT_SUFFIX}) -option(ROCKSDB_BUILD_SHARED "Build shared versions of the RocksDB libraries" ON) +option(ROCKSDB_BUILD_SHARED "Build shared versions of the libraries" ON) if(WIN32) @@ -986,46 +1046,16 @@ else() set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) endif() -set(ROCKSDB_PLUGIN_EXTERNS "") -set(ROCKSDB_PLUGIN_BUILTINS "") -message(STATUS "ROCKSDB PLUGINS TO BUILD ${ROCKSDB_PLUGINS}") -list(APPEND PLUGINS ${ROCKSDB_PLUGINS}) -foreach(PLUGIN IN LISTS PLUGINS) - set(PLUGIN_ROOT "${CMAKE_SOURCE_DIR}/plugin/${PLUGIN}/") - message("including rocksb plugin ${PLUGIN_ROOT}") - set(PLUGINMKFILE "${PLUGIN_ROOT}${PLUGIN}.mk") - if (NOT EXISTS ${PLUGINMKFILE}) - message(FATAL_ERROR "Missing plugin makefile: ${PLUGINMKFILE}") - endif() - file(READ ${PLUGINMKFILE} PLUGINMK) - string(REGEX MATCH "SOURCES = ([^\n]*)" FOO ${PLUGINMK}) - set(MK_SOURCES ${CMAKE_MATCH_1}) - separate_arguments(MK_SOURCES) - foreach(MK_FILE IN LISTS MK_SOURCES) - list(APPEND SOURCES "${PLUGIN_ROOT}${MK_FILE}") - 
endforeach() - string(REGEX MATCH "_FUNC = ([^\n]*)" FOO ${PLUGINMK}) - if (NOT ${CMAKE_MATCH_1} STREQUAL "") - string(APPEND ROCKSDB_PLUGIN_BUILTINS "{\"${PLUGIN}\", " ${CMAKE_MATCH_1} "},") - string(APPEND ROCKSDB_PLUGIN_EXTERNS "int " ${CMAKE_MATCH_1} "(ROCKSDB_NAMESPACE::ObjectLibrary&, const std::string&); ") - endif() - string(REGEX MATCH "_LIBS = ([^\n]*)" FOO ${PLUGINMK}) - if (NOT ${CMAKE_MATCH_1} STREQUAL "") - list(APPEND THIRDPARTY_LIBS "${CMAKE_MATCH_1}") - endif() - message("THIRDPARTY_LIBS=${THIRDPARTY_LIBS}") - #TODO: We need to set any compile/link-time flags and add any link libraries -endforeach() - string(TIMESTAMP TS "%Y-%m-%d %H:%M:%S" UTC) -set(BUILD_DATE "${TS}" CACHE STRING "the time we first built rocksdb") +set(BUILD_DATE "${TS}" CACHE STRING "the time we first built Speedb") find_package(Git) if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_SHA COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD ) execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" RESULT_VARIABLE GIT_MOD COMMAND "${GIT_EXECUTABLE}" diff-index HEAD --quiet) - execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_DATE COMMAND "${GIT_EXECUTABLE}" log -1 --date=format:"%Y-%m-%d %T" --format="%ad") + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_DATE COMMAND "${GIT_EXECUTABLE}" log -1 --date=iso --format="%ad") + string(REGEX MATCH "[-0-9]+ [:0-9]+" GIT_DATE ${GIT_DATE}) execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG RESULT_VARIABLE rv COMMAND "${GIT_EXECUTABLE}" symbolic-ref -q --short HEAD OUTPUT_STRIP_TRAILING_WHITESPACE) if (rv AND NOT rv EQUAL 0) execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -1061,9 +1091,9 @@ if(ROCKSDB_BUILD_SHARED) else() set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES LINKER_LANGUAGE CXX - VERSION ${rocksdb_VERSION} - SOVERSION ${rocksdb_VERSION_MAJOR} - OUTPUT_NAME "rocksdb${ARTIFACT_SUFFIX}") + VERSION ${PROJECT_VERSION} + SOVERSION ${speedb_VERSION_MAJOR} + OUTPUT_NAME "${PROJECT_NAME}${ARTIFACT_SUFFIX}") endif() endif() @@ -1106,16 +1136,16 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) include(GNUInstallDirs) include(CMakePackageConfigHelpers) - set(package_config_destination ${CMAKE_INSTALL_LIBDIR}/cmake/rocksdb) + set(package_config_destination ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}) configure_package_config_file( - ${CMAKE_CURRENT_LIST_DIR}/cmake/RocksDBConfig.cmake.in RocksDBConfig.cmake + ${CMAKE_CURRENT_LIST_DIR}/cmake/SpeedbConfig.cmake.in SpeedbConfig.cmake INSTALL_DESTINATION ${package_config_destination} ) write_basic_package_version_file( - RocksDBConfigVersion.cmake - VERSION ${rocksdb_VERSION} + SpeedbConfigVersion.cmake + VERSION ${PROJECT_VERSION} COMPATIBILITY SameMajorVersion ) @@ -1125,7 +1155,7 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) install( TARGETS ${ROCKSDB_STATIC_LIB} - EXPORT RocksDBTargets + EXPORT SpeedbTargets COMPONENT devel ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" @@ -1134,7 +1164,7 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) if(ROCKSDB_BUILD_SHARED) install( TARGETS ${ROCKSDB_SHARED_LIB} - EXPORT RocksDBTargets + EXPORT SpeedbTargets COMPONENT runtime ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" RUNTIME DESTINATION 
"${CMAKE_INSTALL_BINDIR}" @@ -1144,16 +1174,16 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) endif() install( - EXPORT RocksDBTargets + EXPORT SpeedbTargets COMPONENT devel DESTINATION ${package_config_destination} - NAMESPACE RocksDB:: + NAMESPACE Speedb:: ) install( FILES - ${CMAKE_CURRENT_BINARY_DIR}/RocksDBConfig.cmake - ${CMAKE_CURRENT_BINARY_DIR}/RocksDBConfigVersion.cmake + ${CMAKE_CURRENT_BINARY_DIR}/SpeedbConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/SpeedbConfigVersion.cmake COMPONENT devel DESTINATION ${package_config_destination} ) @@ -1170,6 +1200,20 @@ if(WITH_TESTS OR WITH_BENCHMARK_TOOLS) endif() if(WITH_TESTS) + # c_test - doesn't use gtest + # env_test - suspicious use of test::TmpDir + # deletefile_test - serial because it generates giant temporary files in + # its various tests. Running its tests in parallel can fill up your /dev/shm + # db_bloom_filter_test - serial because excessive space usage by instances + # of DBFilterConstructionReserveMemoryTestWithParam can fill up /dev/shm + # timer_queue_test - doesn't use gtest + set(NON_PARALLEL_TESTS + c_test + env_test + deletefile_test + db_bloom_filter_test + timer_queue_test + ) set(TESTS db/db_basic_test.cc env/env_basic_test.cc @@ -1365,6 +1409,7 @@ if(WITH_TESTS) utilities/ttl/ttl_test.cc utilities/util_merge_operators_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc + ${PLUGIN_TESTS} ) endif() @@ -1376,12 +1421,15 @@ if(WITH_TESTS) utilities/cassandra/test_utils.cc ) enable_testing() - add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) + add_custom_target(check + COMMAND ${CMAKE_COMMAND} -P ${PROJECT_SOURCE_DIR}/cmake/CTestRunner.cmake + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + VERBATIM USES_TERMINAL) set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX}) add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE}) target_link_libraries(${TESTUTILLIB} ${ROCKSDB_LIB}) if(MSVC) - set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb") + set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/${TESTUTILLIB}.pdb") endif() set_target_properties(${TESTUTILLIB} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 @@ -1398,11 +1446,13 @@ if(WITH_TESTS) EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 OUTPUT_NAME ${exename}${ARTIFACT_SUFFIX} ) - target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) - if(NOT "${exename}" MATCHES "db_sanity_test") + target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${TESTUTILLIB} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) + if(NOT "${exename}" IN_LIST NON_PARALLEL_TESTS) gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120) - add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) + else() + add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) endif() + add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) endforeach(sourcefile ${TESTS}) if(WIN32) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index d1abc700d2..31a1b69b59 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,77 +1,133 @@ -# Code of Conduct +# Contributor Covenant Code of Conduct ## Our Pledge -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to make participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of 
experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. ## Our Standards -Examples of behavior that contributes to creating a positive environment -include: +Examples of behavior that contributes to a positive environment for our +community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +- Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +- The use of sexualized language or imagery, and sexual attention or advances + of any kind +- Trolling, insulting or derogatory comments, and personal or political + attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, + without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members +## Scope -Examples of unacceptable behavior by participants include: +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
-* The use of sexualized language or imagery and unwelcome sexual attention or - advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting +## Enforcement -## Our Responsibilities +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +conduct@speedb.io. All complaints will be reviewed and investigated promptly and +fairly. -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. +## Enforcement Guidelines -## Scope +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: -This Code of Conduct applies within all project spaces, and it also applies when -an individual is representing the project or its community in public spaces. -Examples of representing a project or community include using an official -project e-mail address, posting via an official social media account, or acting -as an appointed representative at an online or offline event. Representation of -a project may be further defined and clarified by project maintainers. +### 1. Correction -## Enforcement +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at . All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. 
Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. ## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. -[homepage]: https://www.contributor-covenant.org +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder][mozilla coc]. -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][faq]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[mozilla coc]: https://github.com/mozilla/diversity +[faq]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 190100b429..a0afda7914 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,17 +1,483 @@ -# Contributing to RocksDB +# Contributing -## Code of Conduct -The code of conduct is described in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md) + -## Contributor License Agreement ("CLA") + -In order to accept your pull request, we need you to submit a CLA. You -only need to do this once, so if you've done this for another Facebook -open source project, you're good to go. If you are submitting a pull -request for the first time, just let us know that you have completed -the CLA and we can cross-check with your GitHub username. 
+## Table of contents -Complete your CLA here: +- [Overview](#overview) +- [Ways to contribute](#ways-to-contribute) + - [Help document Speedb](#help-document-speedb) + - [Help address bugs](#help-address-bugs) + - [Help contribute ideas](#help-contribute-ideas) + - [Help land changes](#help-land-changes) +- [How to become a contributor](#how-to-become-a-contributor) + - [Contribution guidelines and standards](#contribution-guidelines-and-standards) +- [Style](#style) + - [Source code](#source-code) + - [Markdown files](#markdown-files) +- [License](#license) + - [Source files](#source-files-1) + - [Markdown](#markdown) +- [Contribution workflow](#contribution-workflow) + - [Fork and build](#fork-and-build) + - [Checkout a pull request](#checkout-a-pull-request) + - [Make your changes](#make-your-changes) + - [Update HISTORY.md](#update-HISTORYmd) + - [Add a test](#add-a-test) + - [Run the tests](#run-the-tests) + - [C++ unit tests](#c-unit-tests) + - [Debugging single unit test failures](#debugging-single-unit-test-failures) + - [Java unit tests](#java-unit-tests) + - [Additional build flavors](#additional-build-flavors) + - [Crash tests](#crash-tests) + - [Performance tests](#performance-tests) + - [Commit changes](#commit-changes) + - [Create a pull request](#create-a-pull-request) + - [Submit a pull request](#submit-a-pull-request) -If you prefer to sign a paper copy, we can send you a PDF. Send us an -e-mail or create a new github issue to request the CLA in PDF format. + + +## Overview + +Thank you for your interest in contributing to Speedb! There are many ways to +contribute, and we appreciate all of them. If you have questions, please feel +free to ask on [GitHub](https://github.com/speedb-io/speedb/discussions). + +Please read and follow our [Code of Conduct](CODE_OF_CONDUCT.md) to keep our +community welcoming, helpful, and respectable. + +## Ways to contribute + +There are several ways to contribute to Speedb, the most obvious of which is by +contributing code changes, but it's not the only one. + +### Help document Speedb + +We strive to provide extensive and up-to-date documentation of Speedb, so if +you find an area where the documentation is lacking, we would love to have you +contribute changes to address that. + +### Help address bugs + +We'll inevitably have bugs, or other kinds of issues. Helping us by reporting +such issues with detailed information (ideally with a test case attached), or +even simply analyzing and reproducing an existing issue, is a great way to get +involved. We track bugs and other kinds of issues using +[GitHub issues](https://github.com/speedb-io/speedb/issues). + +Please go over existing issues before opening a new one to avoid duplicates, and +please follow the relevant template when opening new issues. + +### Help contribute ideas + +If you have an idea for Speedb, we encourage you to +[discuss](https://github.com/speedb-io/speedb/discussions) it with the +community, and potentially prepare a proposal for it and submit it as a feature +request using the +[feature request template](https://github.com/speedb-io/speedb/issues/new?assignees=&labels=&template=feature_request.md&title=). + +If you do start working on a proposal, keep in mind that this requires a time +investment to discuss the idea with the community, get it reviewed, and +eventually implemented. We encourage discussing the idea early, before even +writing a proposal. 
+ +### Help land changes + +If you find a feature request that you'd like to get into Speedb and there's a +pull request open for it, you can help by testing it and providing feedback. +When giving feedback, please keep comments positive and constructive. + +## How to become a contributor + +### Contribution guidelines and standards + +All documents and pull requests must be consistent with the guidelines and +follow the Speedb documentation and coding styles. + +- For **both** documentation and code: + + - When the Speedb team accepts new documentation or features, we take on + the maintenance burden. This means we'll weigh the benefit of each + contribution against the cost of maintaining it. + - The appropriate [style](#style) is applied. + - The [license](#license) is present in all contributions. + - Code review is used to improve the correctness, clarity, and consistency + of all contributions. + +- For documentation: + + - All documentation is written for clarity and readability. Beyond fixing + spelling and grammar, this also means content is worded to be accessible + to a broad audience. + - Typos or other minor fixes that don't change the meaning of a document + do not need formal review, and are often handled directly as a pull + request. + +- For code: + + - New features and substantive changes to Speedb need to go through a + formal feature request process. Pull requests are only sent after a + proposal has been discussed, submitted, and reviewed. + - Bug fixes and mechanical improvements don't need this. + - All new features and bug fixes include unit tests, as they help to (a) + document and validate concrete usage of a feature and its edge cases, + and (b) guard against future breaking changes to lower the maintenance + cost. + - Unit tests must pass with the changes. + - If some tests fail for unrelated reasons, we wait until they're fixed. + It helps to contribute a fix! + - Code changes should be made with API compatibility and evolvability in + mind. + +## Style + +### Source code + +Speedb follows the +[Google C++ Style](https://google.github.io/styleguide/cppguide.html). + +For formatting, we limit each line to 80 characters. Most formatting can be done +automatically by running + +``` +build_tools/format-diff.sh +``` + +or simply `make format` if you use GNU make. If you lack any of the dependencies +to run it, the script will print out instructions for you to install them. + +### Markdown files + +Markdown files should use [Prettier](https://prettier.io/) for formatting. + +## License + +A license is required at the top of all documents and files. + +### Source files + +Every new source file should have the following header at the top: + +``` +Copyright (C) Speedb Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +``` + +Replace `` in the copyright notice above with the current year. 
+ +### Markdown + +Markdown files should have at the top: + +``` +# DOC TITLE + + +``` + +For example, see the top of +[this file](https://github.com/speedb-io/speedb/raw/main/CONTRIBUTING.md)'s raw +content. + +## Contribution workflow + +As most open-source projects in github, Speedb contributors work on their fork, +and send pull requests to Speedb’s repo. After a reviewer approves the pull +request, a Speedb team member will merge it. + +### Fork and build + +[Fork](https://github.com/speedb-io/speedb/fork) the Speedb repository to your +own account and clone the resulting repository to your machine. + +Refer to the [README](README.md) and [INSTALL](INSTALL.md) documents for +information about how to build Speedb locally. + +### Checkout a pull request + +If you'd like to contribute by testing a pull request and providing feedback, +this section is for you. Otherwise, if you'd like to contribute by making +changes (to code or documentation), skip this section and read the next one +instead. + +Every pull request has its own number. This number is visible both in the URL +of a pull request page as well as in the title of the pull request page itself +(in the form #123, where 123 is the PR number). Follow +[this guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/reviewing-changes-in-pull-requests/checking-out-pull-requests-locally) +in order to checkout the pull request locally (if you're using GitHub CLI, be +sure to choose the GitHub CLI option rather than Web Browser on the guide page). +After you have the pull request changes checked out locally, you can move on to +testing the changes by using the information in the "Run the tests" section +below. + +### Make your changes + +This is where you update the documentation, fix a bug, test another +contributor's fix, or add a feature. Make sure your changes adhere to the +guidelines. + +If you add a new source file, be sure to add it to the `LIB_SOURCES` variable in +[`src.mk`](src.mk) (note the backslashes at the end of each line) as well as to +the `SOURCES` variable in [`CMakeLists.txt`](CMakeLists.txt). + +#### Update HISTORY.md + +For code-related changes, add a short description of your change to the +[HISTORY](HISTORY.md) document, especially if it's a bug fix, public API change +or an awesome new feature. + +#### Add a test + +If you make a code-related change, be sure to add a unit test. Speedb uses +[GTest](https://github.com/google/googletest) for the C++ unit tests and +[JUnit](https://junit.org/) for the Java unit tests. + +For the C++ unit test, prefer adding a test to an existing unit tests suite (in +the files ending with `_test.cc`) in order to keep build and test time at bay. +However, if this is a test for a new feature and it doesn't belong in any of the +existing test suites, you may add a new file. Be sure to update the +`TEST_MAIN_SOURCES` variable in [`src.mk`](src.mk) (note the backslashes at the +end of each line) as well as the `TESTS` variable in +[`CMakeLists.txt`](CMakeLists.txt). + +### Run the tests + +This is only needed for code-related changes, so if you only made changes to +documentation you can safely skip this section. + +#### C++ unit tests + +You can run the C++ unit tests using the Makefile as explained below, or, if +you're using CMake, using `ctest`. 
The Makefile has support for running the unit +tests in parallel using GNU Parallel, so it's recommended that you install it +first using your system's package manager (refer to the GNU Parallel +[official webpage](https://www.gnu.org/software/parallel/) for more +information). + +In order to run unit tests execute the following command: + +``` +make check +``` + +This will build Speedb and run the tests. You can provide the `-j` flag to +`make` in order to make a better utilization of CPU and speed up the build. Note +that this flag only affects the build, not the tests themselves. If you have GNU +Parallel installed, you can control the number parallel tests to run using the +environment variable `J`. For example, to build on a 64-core CPU and run the +tests in parallel, you can run: + +``` +make J=64 check -j64 +``` + +Unlike `-j`, which if not provided defaults to 1, if `J` isn't provided, the +default is to run one job per core. + +If you switch between release and debug build, normal or lite build, or compiler +or compiler options, call `make clean` first. So here is a safe routine to run +all tests: + +``` +make clean && make check -j64 +``` + +#### Debugging single unit test failures + +You can run a specific unit test by running the test binary that contains it. If +you use GNU make, the test binary will be located in the root directory of the +repository (if you use CMake, the test binary will be in your build directory). +For example, the test `DBBasicTest.OpenWhenOpen` is in the binary +`db_basic_test`, so simply running + +``` +./db_basic_test +``` + +will run all tests in the binary. + +GTest provides some useful command line parameters, and you can see them by +providing the `--help` argument to the test binary: + +``` +./db_basic_test --help +``` + +The flag you're most likely to use is probably `--gtest_filter`, which allows +you to specify a subset of the tests to run. For example, if you only want to +run `DBBasicTest.OpenWhenOpen`: + +``` +./db_basic_test --gtest_filter="*DBBasicTest.OpenWhenOpen*" +``` + +By default, the test DB created by tests is cleared up even if the test fails. +You can preserve it by using `--gtest_throw_on_failure`. If you want to stop the +debugger when an assertion fails, specify `--gtest_break_on_failure`. + +The `KEEP_DB=1` environment variable is another way to preserve the test DB from +being deleted at the end of a unit-test run, regardless of whether the test +fails or not: + +``` +KEEP_DB=1 ./db_basic_test --gtest_filter=DBBasicTest.Open +``` + +By default, the temporary test files will be under `/tmp/rocksdbtest-/` +(except when running in parallel, in which case they are under `/dev/shm`). You +can override the location by using the `TEST_TMPDIR` environment variable. For +example: + +``` +TEST_TMPDIR=/dev/shm/my_dir ./db_basic_test +``` + +#### Java unit tests + +To run the Java unit tests, make sure you set the `JAVA_HOME` environment +variable to the path of your JDK installation and execute the following command: + +``` +make jclean && DISABLE_JEMALLOC=1 make jtest -j64 +``` + +#### Additional build flavors + +For more complicated code changes, we ask contributors to run more build flavors +before sending the code for review. 
+
+To build with _AddressSanitizer (ASAN)_, set the `COMPILE_WITH_ASAN` environment
+variable:
+
+```
+COMPILE_WITH_ASAN=1 make check -j64
+```
+
+To build with _ThreadSanitizer (TSAN)_, set the `COMPILE_WITH_TSAN` environment
+variable:
+
+```
+COMPILE_WITH_TSAN=1 make check -j64
+```
+
+To run _UndefinedBehaviorSanitizer (UBSAN)_, set the `COMPILE_WITH_UBSAN`
+environment variable:
+
+```
+COMPILE_WITH_UBSAN=1 make check -j64
+```
+
+To run LLVM's analyzer, run:
+
+```
+make analyze
+```
+
+#### Crash tests
+
+For higher-risk changes, in addition to running all of the tests with multiple
+build flavors, a crash test cycle needs to be executed without failure. If the
+crash test doesn't cover the new feature, add coverage for it there.
+
+To run all crash tests, run
+
+```
+make crash_test -j64
+make crash_test_with_atomic_flush -j64
+```
+
+If you are unable to use GNU make, you can build the `db_stress` binary
+manually and run the following commands:
+
+```
+ python -u tools/db_crashtest.py whitebox
+ python -u tools/db_crashtest.py blackbox
+ python -u tools/db_crashtest.py --simple whitebox
+ python -u tools/db_crashtest.py --simple blackbox
+ python -u tools/db_crashtest.py --cf_consistency blackbox
+ python -u tools/db_crashtest.py --cf_consistency whitebox
+```
+
+#### Performance tests
+
+For changes that might impact performance, we suggest running the normal
+benchmarks to make sure there is no regression (see
+[benchmark.sh](tools/benchmark.sh)). Depending on the actual performance
+impact, you may choose to run against a database backed by disks or by
+memory-backed file systems.
+
+### Commit changes
+
+Please keep your commits:
+
+- Standalone - The code must compile and run successfully after each commit
+  (no breaking commits!).
+- Minimal - Break your code into minimal, logically-complete chunks.
+- Self-Reviewed - Always double-check yourself before submitting.
+
+Commit messages should:
+
+- Start with a component name followed by a colon. For example, if you made
+  changes to the documentation, prefix the commit message with `docs: `. If
+  you only updated tests, prefix the commit message with `tests: `. For
+  build-related changes, use `build: `, etc.
+- Reference a relevant issue, if any. This is especially relevant for bug
+  fixes and new features. The issue should be referenced at the end of the
+  first line as a hash sign followed by the issue number. For example, `#23`.
+  If there's more than one issue that applies, mention the main one on the
+  first line, and add a reference to the rest at the end of the commit message
+  (e.g. `Also fixes #54, #89, and #99`).
+- Have the line length limited to 100 characters or less. This restriction
+  does not apply when quoting program output, etc.
+- Be phrased in clear and grammatically correct language, and use the present
+  tense ("add feature", not "added feature").
+
+### Create a pull request
+
+When you're finished with the changes, create a pull request, also known as a
+PR. If you're unfamiliar with open-source contributions on GitHub, follow the
+[Creating a pull request guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).
+
+#### Submit a pull request
+
+- Describe what your change is doing, especially if there isn't a relevant
+  issue open.
+- Reference relevant issues and discussions, and don't forget to + [link PR to issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) + if you are solving one. +- Explain how you tested your changes (we recommend adding a "Test Plan:" + section to the pull request summary, which specifies what testing was done + to validate the quality and performance of the change). +- If your change impacts performance, explain why the specific performance + environment was chosen. Also specify at least one benchmark test case that + favors the improvement and share the results. +- Enable the checkbox to allow maintainer edits so the branch can be updated + for a merge. Once you submit your PR, a Speedb team member will review your + proposal. We may ask questions or request for additional information. +- We may ask for changes to be made before a PR can be merged, either using + suggested changes or pull request comments. You can apply suggested changes + directly through the UI. You can make any other changes in your fork, then + commit them to your branch. +- If you run into any merge issues, check out this + [git tutorial](https://lab.github.com/githubtraining/managing-merge-conflicts) + to help you resolve merge conflicts and other issues. diff --git a/COPYING b/COPYING deleted file mode 100644 index d159169d10..0000000000 --- a/COPYING +++ /dev/null @@ -1,339 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Lesser General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. 
- - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. 
- - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. 
- -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. 
If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - - Copyright (C) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. diff --git a/HISTORY.md b/HISTORY.md index 06cbc6307e..db668c4966 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,41 @@ +# Speedb Change Log +## Apricot 2.0.0 (08/04/2022) +## New Features +* Added a new hash based memtable that supports concurrent reads and writes +* Added ability to create MemTableFactory from URI/string to tools + +## Bug Fixes +* Avoid comparing Status using == as it compares only status codes. The comparison breaks when comparing against status::NoSpace() since it has a status code of `Code::kIOError` and only a subcode of `SubCode::kNoSpace` +* Fixed snapshots leak in optimistic_transaction_example: whenever the example is run under ASan, snapshots are acquired but not released, resulting in a memory leak error. +* ldb: fix get to print the entire value +* db_bench: fix Rocksdb bug of last_ref assertion. Test fails to delete multi-dbs correctly. 
+* db_bench: fix SeekRandom and ReadRandomWriteRandom to work on all CFs instead of the default +* db_bench to report accurate response time when using rate limit +* db_test: add test for - forward the incomplete status on no_io (https://github.com/facebook/rocksdb/pull/8485) +* CMake: use the old plugin infra and add support for *_FUNC* registration + +## Miscellaneous +* LOG: Print write_buffer_manager size to LOG +* LOG: change log header to SpeeDB +* LOG & db_bench: metadata_cache_options - print to LOG and support its configuration in db_bench +* db_impl: use unique_ptr in DBImpl::Open for nicer memory management +* Explicitly compare the SuperVersion pointer in column_family +* Rename rocksdb threads to speedb +* Add a version number to Speedb builds +* Clang-Format: Do not include third-party code as any changes are either version updates or fixes. +* Git: add clangd cache to .gitignore + + # Rocksdb Change Log +## 7.2.2 (04/28/2022) +### Bug Fixes +* Fixed a bug in async_io path where incorrect length of data is read by FilePrefetchBuffer if data is consumed from two populated buffers and request for more data is sent. + +## 7.2.1 (04/26/2022) +### Bug Fixes +* Fixed a bug where RocksDB could corrupt DBs with `avoid_flush_during_recovery == true` by removing valid WALs, leading to `Status::Corruption` with message like "SST file is ahead of WALs" when attempting to reopen. +* RocksDB calls FileSystem::Poll API during FilePrefetchBuffer destruction which impacts performance as it waits for read requets completion which is not needed anymore. Calling FileSystem::AbortIO to abort those requests instead fixes that performance issue. + ## 7.2.0 (04/15/2022) ### Bug Fixes * Fixed bug which caused rocksdb failure in the situation when rocksdb was accessible using UNC path @@ -22,6 +59,8 @@ * Added a dedicated integer DB property `rocksdb.live-blob-file-garbage-size` that exposes the total amount of garbage in the blob files in the current version. * RocksDB does internal auto prefetching if it notices sequential reads. It starts with readahead size `initial_auto_readahead_size` which now can be configured through BlockBasedTableOptions. * Add a merge operator that allows users to register specific aggregation function so that they can does aggregation using different aggregation types for different keys. See comments in include/rocksdb/utilities/agg_merge.h for actual usage. The feature is experimental and the format is subject to change and we won't provide a migration tool. +* Meta-internal / Experimental: Improve CPU performance by replacing many uses of std::unordered_map with folly::F14FastMap when RocksDB is compiled together with Folly. +* Experimental: Add CompressedSecondaryCache, a concrete implementation of rocksdb::SecondaryCache, that integrates with compression libraries (e.g. LZ4) to hold compressed blocks. ### Behavior changes * Disallow usage of commit-time-write-batch for write-prepared/write-unprepared transactions if TransactionOptions::use_only_the_last_commit_time_batch_for_recovery is false to prevent two (or more) uncommitted versions of the same key in the database. Otherwise, bottommost compaction may violate the internal key uniqueness invariant of SSTs if the sequence numbers of both internal keys are zeroed out (#9794). @@ -30,6 +69,7 @@ ### Public API changes * Exposed APIs to examine results of block cache stats collections in a structured way. 
In particular, users of `GetMapProperty()` with property `kBlockCacheEntryStats` can now use the functions in `BlockCacheEntryStatsMapKeys` to find stats in the map. * Add `fail_if_not_bottommost_level` to IngestExternalFileOptions so that ingestion will fail if the file(s) cannot be ingested to the bottommost level. +* Add output parameter `is_in_sec_cache` to `SecondaryCache::Lookup()`. It is to indicate whether the handle is possibly erased from the secondary cache after the Lookup. ## 7.1.0 (03/23/2022) ### New Features diff --git a/INSTALL.md b/INSTALL.md index 7d3b147796..f98fda4a7e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,212 +1,143 @@ ## Compilation -**Important**: If you plan to run RocksDB in production, don't compile using default -`make` or `make all`. That will compile RocksDB in debug mode, which is much slower -than release mode. +**Important**: If you plan to run Speedb in production, don't compile using +default `make` or `make all` invocations. That will compile Speedb in debug +mode, which is much slower than release mode. -RocksDB's library should be able to compile without any dependency installed, -although we recommend installing some compression libraries (see below). -We do depend on newer gcc/clang with C++17 support (GCC >= 7, Clang >= 5). +Speedb's library should be able to compile without any dependency installed, +although we recommend installing some compression libraries (see below). We do +depend on newer gcc/clang with C++17 support (GCC >= 7, Clang >= 5). -There are few options when compiling RocksDB: +There are few options when compiling Speedb: -* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library. Compiles static library in release mode. +- [recommended] `make static_lib` will compile the Speedb static library + (`librocksdb.a`) in release mode. -* `make shared_lib` will compile librocksdb.so, RocksDB shared library. Compiles shared library in release mode. +- `make shared_lib` will compile the Speedb shared library (`librocksdb.so`) + in release mode. -* `make check` will compile and run all the unit tests. `make check` will compile RocksDB in debug mode. +- `make check` will compile and run all the unit tests. `make check` will + compile Speedb in debug mode. -* `make all` will compile our static library, and all our tools and unit tests. Our tools -depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't -use binaries compiled by `make all` in production. +- `make all` will compile our static library, and all our tools and unit + tests. Our tools depend on gflags. You will need to have gflags installed to + run `make all`. This will compile Speedb in debug mode. Don't use binaries + compiled by `make all` in production. -* By default the binary we produce is optimized for the platform you're compiling on -(`-march=native` or the equivalent). SSE4.2 will thus be enabled automatically if your -CPU supports it. To print a warning if your CPU does not support SSE4.2, build with -`USE_SSE=1 make static_lib` or, if using CMake, `cmake -DFORCE_SSE42=ON`. If you want -to build a portable binary, add `PORTABLE=1` before your make commands, like this: -`PORTABLE=1 make static_lib`. +- By default the binary we produce is optimized for the platform you're + compiling on (`-march=native` or the equivalent). SSE4.2 will thus be + enabled automatically if your CPU supports it. 
To print a warning if your + CPU does not support SSE4.2, build with `USE_SSE=1 make static_lib` or, if + using CMake, `cmake -DFORCE_SSE42=ON`. If you want to build a portable + binary, add `PORTABLE=1` before your make commands, like this: + `PORTABLE=1 make static_lib`, or `cmake -DPORTABLE=1` if using CMake. ## Dependencies -* You can link RocksDB with following compression libraries: - - [zlib](http://www.zlib.net/) - a library for data compression. - - [bzip2](http://www.bzip.org/) - a library for data compression. - - [lz4](https://github.com/lz4/lz4) - a library for extremely fast data compression. - - [snappy](http://google.github.io/snappy/) - a library for fast - data compression. - - [zstandard](http://www.zstd.net) - Fast real-time compression - algorithm. +- You can link Speedb with following compression libraries: -* All our tools depend on: - - [gflags](https://gflags.github.io/gflags/) - a library that handles - command line flags processing. You can compile rocksdb library even - if you don't have gflags installed. + - [zlib](http://www.zlib.net/) - a library for data compression. + - [bzip2](http://www.bzip.org/) - a library for data compression. + - [lz4](https://github.com/lz4/lz4) - a library for extremely fast data + compression. + - [snappy](http://google.github.io/snappy/) - a library for fast data + compression. + - [zstandard](http://www.zstd.net) - Fast real-time compression algorithm. -* `make check` will also check code formatting, which requires [clang-format](https://clang.llvm.org/docs/ClangFormat.html) +- All of our tools depend on: -* If you wish to build the RocksJava static target, then cmake is required for building Snappy. + - [gflags](https://gflags.github.io/gflags/) - a library that handles + command line flags processing. Note that this only required for building + the tools, and that you can compile the Speedb library even if you don't + have gflags installed. -* If you wish to run microbench (e.g, `make microbench`, `make ribbon_bench` or `cmake -DWITH_BENCHMARK=1`), Google benchmark >= 1.6.0 is needed. +- `make check` will also check code formatting, which requires + [clang-format](https://clang.llvm.org/docs/ClangFormat.html) -## Supported platforms - -* **Linux - Ubuntu** - * Upgrade your gcc to version at least 7 to get C++17 support. - * Install gflags. First, try: `sudo apt-get install libgflags-dev` - If this doesn't work and you're using Ubuntu, here's a nice tutorial: - (http://askubuntu.com/questions/312173/installing-gflags-12-04) - * Install snappy. This is usually as easy as: - `sudo apt-get install libsnappy-dev`. - * Install zlib. Try: `sudo apt-get install zlib1g-dev`. - * Install bzip2: `sudo apt-get install libbz2-dev`. - * Install lz4: `sudo apt-get install liblz4-dev`. - * Install zstandard: `sudo apt-get install libzstd-dev`. - -* **Linux - CentOS / RHEL** - * Upgrade your gcc to version at least 7 to get C++17 support - * Install gflags: - - git clone https://github.com/gflags/gflags.git - cd gflags - git checkout v2.0 - ./configure && make && sudo make install - - **Notice**: Once installed, please add the include path for gflags to your `CPATH` environment variable and the - lib path to `LIBRARY_PATH`. If installed with default settings, the include path will be `/usr/local/include` - and the lib path will be `/usr/local/lib`. 
- - * Install snappy: - - sudo yum install snappy snappy-devel - - * Install zlib: - - sudo yum install zlib zlib-devel - - * Install bzip2: +- If you wish to build the RocksJava static target, then CMake is required for + building Snappy. - sudo yum install bzip2 bzip2-devel +- If you wish to run microbench (e.g, `make microbench`, `make ribbon_bench` + or `cmake -DWITH_BENCHMARK=1`), Google benchmark >= 1.6.0 is needed. - * Install lz4: - - sudo yum install lz4-devel - - * Install ASAN (optional for debugging): - - sudo yum install libasan - - * Install zstandard: - * With [EPEL](https://fedoraproject.org/wiki/EPEL): - - sudo yum install libzstd-devel - - * With CentOS 8: - - sudo dnf install libzstd-devel - - * From source: - - wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz - mv v1.1.3.tar.gz zstd-1.1.3.tar.gz - tar zxvf zstd-1.1.3.tar.gz - cd zstd-1.1.3 - make && sudo make install +## Supported platforms -* **OS X**: - * Install latest C++ compiler that supports C++ 17: - * Update XCode: run `xcode-select --install` (or install it from XCode App's settting). - * Install via [homebrew](http://brew.sh/). - * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line. - * run `brew tap homebrew/versions; brew install gcc7 --use-llvm` to install gcc 7 (or higher). - * run `brew install rocksdb` +- **Linux - Ubuntu** -* **FreeBSD** (11.01): + - Upgrade your gcc to version at least 7 to get C++17 support. + - Install gflags. First, try: `sudo apt-get install libgflags-dev` If this + doesn't work and you're using Ubuntu, here's a nice tutorial: + (http://askubuntu.com/questions/312173/installing-gflags-12-04) + - Install snappy. This is usually as easy as: + `sudo apt-get install libsnappy-dev`. + - Install zlib. Try: `sudo apt-get install zlib1g-dev`. + - Install bzip2: `sudo apt-get install libbz2-dev`. + - Install lz4: `sudo apt-get install liblz4-dev`. + - Install zstandard: `sudo apt-get install libzstd-dev`. - * You can either install RocksDB from the Ports system using `cd /usr/ports/databases/rocksdb && make install`, or you can follow the details below to install dependencies and compile from source code: +- **Linux - CentOS / RHEL** - * Install the dependencies for RocksDB: + - Upgrade your gcc to version at least 7 to get C++17 support + - Install gflags: - export BATCH=YES - cd /usr/ports/devel/gmake && make install - cd /usr/ports/devel/gflags && make install + git clone https://github.com/gflags/gflags.git + cd gflags + git checkout v2.0 + ./configure && make && sudo make install - cd /usr/ports/archivers/snappy && make install - cd /usr/ports/archivers/bzip2 && make install - cd /usr/ports/archivers/liblz4 && make install - cd /usr/ports/archivesrs/zstd && make install + **Notice**: Once installed, please add the include path for gflags to + your `CPATH` environment variable and the lib path to `LIBRARY_PATH`. If + installed with default settings, the include path will be + `/usr/local/include` and the lib path will be `/usr/local/lib`. 
- cd /usr/ports/devel/git && make install + - Install snappy: + sudo yum install snappy snappy-devel - * Install the dependencies for RocksJava (optional): + - Install zlib: - export BATCH=yes - cd /usr/ports/java/openjdk7 && make install + sudo yum install zlib zlib-devel - * Build RocksDB from source: - cd ~ - git clone https://github.com/facebook/rocksdb.git - cd rocksdb - gmake static_lib + - Install bzip2: - * Build RocksJava from source (optional): - cd rocksdb - export JAVA_HOME=/usr/local/openjdk7 - gmake rocksdbjava + sudo yum install bzip2 bzip2-devel -* **OpenBSD** (6.3/-current): + - Install lz4: - * As RocksDB is not available in the ports yet you have to build it on your own: + sudo yum install lz4-devel - * Install the dependencies for RocksDB: + - Install ASAN (optional for debugging): - pkg_add gmake gflags snappy bzip2 lz4 zstd git jdk bash findutils gnuwatch + sudo yum install libasan - * Build RocksDB from source: + - Install zstandard: - cd ~ - git clone https://github.com/facebook/rocksdb.git - cd rocksdb - gmake static_lib + - With [EPEL](https://fedoraproject.org/wiki/EPEL): - * Build RocksJava from source (optional): + sudo yum install libzstd-devel - cd rocksdb - export JAVA_HOME=/usr/local/jdk-1.8.0 - export PATH=$PATH:/usr/local/jdk-1.8.0/bin - gmake rocksdbjava + - With CentOS 8: -* **iOS**: - * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`. + sudo dnf install libzstd-devel -* **Windows** (Visual Studio 2017 to up): - * Read and follow the instructions at CMakeLists.txt - * Or install via [vcpkg](https://github.com/microsoft/vcpkg) - * run `vcpkg install rocksdb:x64-windows` + - From source: -* **AIX 6.1** - * Install AIX Toolbox rpms with gcc - * Use these environment variables: + wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz + mv v1.1.3.tar.gz zstd-1.1.3.tar.gz + tar zxvf zstd-1.1.3.tar.gz + cd zstd-1.1.3 + make && sudo make install - export PORTABLE=1 - export CC=gcc - export AR="ar -X64" - export EXTRA_ARFLAGS=-X64 - export EXTRA_CFLAGS=-maix64 - export EXTRA_CXXFLAGS=-maix64 - export PLATFORM_LDFLAGS="-static-libstdc++ -static-libgcc" - export LIBPATH=/opt/freeware/lib - export JAVA_HOME=/usr/java8_64 - export PATH=/opt/freeware/bin:$PATH +- **OS X**: -* **Solaris Sparc** - * Install GCC 7 and higher. - * Use these environment variables: + - Install latest C++ compiler that supports C++ 17: + - Update XCode: run `xcode-select --install` (or install it from XCode + App's settting). + - Install via [homebrew](http://brew.sh/). + - If you're first time developer in MacOS, you still need to run: + `xcode-select --install` in your command line. + - run `brew tap homebrew/versions; brew install gcc7 --use-llvm` + to install gcc 7 (or higher). 
- export CC=gcc - export EXTRA_CFLAGS=-m64 - export EXTRA_CXXFLAGS=-m64 - export EXTRA_LDFLAGS=-m64 - export PORTABLE=1 - export PLATFORM_LDFLAGS="-static-libstdc++ -static-libgcc" +- **Windows** (Visual Studio 2017 to up): + - Read and follow the instructions at CMakeLists.txt diff --git a/LICENSE.Apache b/LICENSE similarity index 100% rename from LICENSE.Apache rename to LICENSE diff --git a/Makefile b/Makefile index 41b8a98d6b..606e043d32 100644 --- a/Makefile +++ b/Makefile @@ -6,9 +6,11 @@ #----------------------------------------------- -BASH_EXISTS := $(shell which bash) -SHELL := $(shell which bash) -include python.mk +# Prefer bash, but don't overwrite the existing setting if not found +SHELL := $(shell command -v bash || echo $(SHELL)) +include common.mk + +PROJECT_NAME := speedb CLEAN_FILES = # deliberately empty, so we can append below. CFLAGS += ${EXTRA_CFLAGS} @@ -19,24 +21,21 @@ ARFLAGS = ${EXTRA_ARFLAGS} rs STRIPFLAGS = -S -x # Transform parallel LOG output into something more readable. -perl_command = perl -n \ - -e '@a=split("\t",$$_,-1); $$t=$$a[8];' \ - -e '$$t =~ /.*if\s\[\[\s"(.*?\.[\w\/]+)/ and $$t=$$1;' \ - -e '$$t =~ s,^\./,,;' \ - -e '$$t =~ s, >.*,,; chomp $$t;' \ - -e '$$t =~ /.*--gtest_filter=(.*?\.[\w\/]+)/ and $$t=$$1;' \ - -e 'printf "%7.3f %s %s\n", $$a[3], $$a[6] == 0 ? "PASS" : "FAIL", $$t' -quoted_perl_command = $(subst ','\'',$(perl_command)) +parallel_log_extract = awk \ + 'BEGIN{FS="\t"} { \ + t=$$9; sub(/if *\[\[ *"/,"",t); sub(/" =.*/,"",t); sub(/ >.*/,"",t); sub(/.*--gtest_filter=/,"",t); \ + printf("%7.3f %s %s\n",4,($$7 == 0 ? "PASS" : "FAIL"),t) \ + }' # DEBUG_LEVEL can have three values: -# * DEBUG_LEVEL=2; this is the ultimate debug mode. It will compile rocksdb +# * DEBUG_LEVEL=2; this is the ultimate debug mode. It will compile Speedb # without any optimizations. To compile with level 2, issue `make dbg` # * DEBUG_LEVEL=1; debug level 1 enables all assertions and debug code, but -# compiles rocksdb with -O2 optimizations. this is the default debug level. -# `make all` or `make ` compile RocksDB with debug level 1. -# We use this debug level when developing RocksDB. +# compiles Speedb with -O2 optimizations. this is the default debug level. +# `make all` or `make ` compile Speedb with debug level 1. +# We use this debug level when developing Speedb. # * DEBUG_LEVEL=0; this is the debug level we use for release. If you're -# running rocksdb in production you most definitely want to compile RocksDB +# running Speedb in production you most definitely want to compile Speedb # with debug level 0. To compile with level 0, run `make shared_lib`, # `make install-shared`, `make static_lib`, `make install-static` or # `make install` @@ -168,7 +167,7 @@ endif # `USE_LTO=1` enables link-time optimizations. Among other things, this enables # more devirtualization opportunities and inlining across translation units. -# This can save significant overhead introduced by RocksDB's pluggable +# This can save significant overhead introduced by Speedb's pluggable # interfaces/internal abstractions, like in the iterator hierarchy. It works # better when combined with profile-guided optimizations (not currently # supported natively in Makefile). @@ -213,11 +212,35 @@ am__v_AR_1 = AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. 
$(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(EXEC_LDFLAGS) $(LDFLAGS) -o $@ +# if user didn't config LIBNAME, set the default +ifeq ($(LIBNAME),) + export LIBNAME=lib$(PROJECT_NAME) +# we should only run Speedb in production with DEBUG_LEVEL 0 +ifneq ($(DEBUG_LEVEL),0) + LIBDEBUG=_debug +endif + +endif +# Only regenerate make_config.mk if it doesn't exists or if we're invoked in a mode +# that executes target recipes (i.e. not -n or -q) +ifeq ($(and $(or $(findstring n,$(MAKEFLAGS)),$(findstring q,$(MAKEFLAGS))),$(wildcard make_config.mk)),) +# Only generate make_config.mk during the main make invocation, not on restarts +# (restarts are caused by Makefiles being updated during the parsing of the Makefile, +# which is exactly what happens when make_config.mk is regenerated and included). +ifeq ($(MAKE_RESTARTS),) +# If make_config.mk exists and the make invocation was for a target that doesn't +# need to regenerate it (because it doesn't build anything), such as `make clean`, +# don't perform the regeneration since these targets either don't need make_config.mk +# at all or only need to use the existing configuration in make_config.mk to do +# their job. +NO_CONFIG_REGENERATION_TARGETS := clean% jclean uninstall dump-log watch-log tags% format check-format check-buck-targets check-sources package checkout_folly list_all_tests +ifneq ($(strip $(and $(wildcard make_config.mk),$(filter-out $(NO_CONFIG_REGENERATION_TARGETS),$(MAKECMDGOALS) make_config.mk))),make_config.mk) + # Detect what platform we're building on. # Export some common variables that might have been passed as Make variables # instead of environment variables. -dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ - export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \ +$(info * GEN make_config.mk) +dummy := $(shell (export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \ export LDFLAGS="$(EXTRA_LDFLAGS)"; \ export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \ export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \ @@ -225,11 +248,21 @@ dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ export PORTABLE="$(PORTABLE)"; \ export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \ export USE_CLANG="$(USE_CLANG)"; \ + export LIBNAME="$(LIBNAME)"; \ export LIB_MODE="$(LIB_MODE)"; \ "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) + +endif +endif +endif + # this file is generated by the previous line to set build flags and sources include make_config.mk +ifeq ($(strip $(filter speedb,$(ROCKSDB_PLUGINS))),) +ROCKSDB_PLUGINS += speedb +endif + ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk) include $(ROCKSDB_PLUGIN_MKS) ROCKSDB_PLUGIN_PROTO =ROCKSDB_NAMESPACE::ObjectLibrary\&, const std::string\& @@ -241,6 +274,7 @@ ROCKSDB_PLUGIN_EXTERNS = $(foreach p, $(ROCKSDB_PLUGIN_W_FUNCS), int $($(p)_FUNC ROCKSDB_PLUGIN_BUILTINS = $(foreach p, $(ROCKSDB_PLUGIN_W_FUNCS), {\"$(p)\"\, $($(p)_FUNC)}\,) ROCKSDB_PLUGIN_LDFLAGS = $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_LDFLAGS)) ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES = $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_PKGCONFIG_REQUIRES)) +ROCKSDB_PLUGIN_TESTS = $(foreach p, $(ROCKSDB_PLUGINS), $(foreach test, $($(p)_TESTS), plugin/$(p)/$(test))) CXXFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_CXXFLAGS)) PLATFORM_LDFLAGS += $(ROCKSDB_PLUGIN_LDFLAGS) @@ -276,7 +310,7 @@ endif endif export JAVAC_ARGS -CLEAN_FILES += make_config.mk rocksdb.pc +CLEAN_FILES += make_config.mk test_config.mk $(PROJECT_NAME).pc ifeq ($(V), 1) $(info $(shell uname -a)) @@ -567,10 +601,12 
@@ STRESS_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES)) ALL_SOURCES = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES) -ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) +ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) $(ROCKSDB_PLUGIN_TESTS) +PLUGIN_TESTS = $(patsubst %.cc, %, $(notdir $(ROCKSDB_PLUGIN_TESTS))) TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES))) TESTS += $(patsubst %.c, %, $(notdir $(TEST_MAIN_SOURCES_C))) +TESTS += $(PLUGIN_TESTS) # `make check-headers` to very that each header file includes its own # dependencies @@ -620,11 +656,14 @@ ROCKSDBTESTS_SUBSET ?= $(TESTS) # its various tests. Parallel can fill up your /dev/shm # db_bloom_filter_test - serial because excessive space usage by instances # of DBFilterConstructionReserveMemoryTestWithParam can fill up /dev/shm +# timer_queue_test - doesn't use gtest NON_PARALLEL_TEST = \ c_test \ env_test \ deletefile_test \ db_bloom_filter_test \ + timer_queue_test \ + $(PLUGIN_TESTS) \ PARALLEL_TEST = $(filter-out $(NON_PARALLEL_TEST), $(TESTS)) @@ -676,24 +715,16 @@ else ifeq ($(ROCKSDBTESTS_PLATFORM_DEPENDENT), exclude) endif # bench_tool_analyer main is in bench_tool_analyzer_tool, or this would be simpler... -TOOLS = $(patsubst %.cc, %, $(notdir $(patsubst %_tool.cc, %.cc, $(TOOLS_MAIN_SOURCES)))) +TOOLS = $(patsubst rocksdb_%, $(PROJECT_NAME)_%,$(patsubst %.cc, %, $(notdir $(patsubst %_tool.cc, %.cc, $(TOOLS_MAIN_SOURCES))))) TEST_LIBS = \ - librocksdb_env_basic_test.a + lib$(PROJECT_NAME)_env_basic_test.a # TODO: add back forward_iterator_bench, after making it build in all environemnts. BENCHMARKS = $(patsubst %.cc, %, $(notdir $(BENCH_MAIN_SOURCES))) MICROBENCHS = $(patsubst %.cc, %, $(notdir $(MICROBENCH_SOURCES))) -# if user didn't config LIBNAME, set the default -ifeq ($(LIBNAME),) - LIBNAME=librocksdb -# we should only run rocksdb in production with DEBUG_LEVEL 0 -ifneq ($(DEBUG_LEVEL),0) - LIBDEBUG=_debug -endif -endif STATIC_LIBRARY = ${LIBNAME}$(LIBDEBUG).a STATIC_TEST_LIBRARY = ${LIBNAME}_test$(LIBDEBUG).a STATIC_TOOLS_LIBRARY = ${LIBNAME}_tools$(LIBDEBUG).a @@ -720,10 +751,6 @@ TOOLS_LIBRARY=$(STATIC_TOOLS_LIBRARY) endif STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY) -ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) -ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) -ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) - # If NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but # the file needs to already exist or else the build will fail ifndef NO_UPDATE_BUILD_VERSION @@ -740,7 +767,7 @@ else git_sha := $(shell git rev-parse HEAD 2>/dev/null) git_tag := $(shell git symbolic-ref -q --short HEAD 2> /dev/null || git describe --tags --exact-match 2>/dev/null) git_mod := $(shell git diff-index HEAD --quiet 2>/dev/null; echo $$?) 
- git_date := $(shell git log -1 --date=format:"%Y-%m-%d %T" --format="%ad" 2>/dev/null) + git_date := $(shell git log -1 --date=iso --format="%ad" 2>/dev/null | awk '{print $1 " " $2}' 2>/dev/null) endif gen_build_version = sed -e s/@GIT_SHA@/$(git_sha)/ -e s:@GIT_TAG@:"$(git_tag)": -e s/@GIT_MOD@/"$(git_mod)"/ -e s/@BUILD_DATE@/"$(build_date)"/ -e s/@GIT_DATE@/"$(git_date)"/ -e s/@ROCKSDB_PLUGIN_BUILTINS@/'$(ROCKSDB_PLUGIN_BUILTINS)'/ -e s/@ROCKSDB_PLUGIN_EXTERNS@/"$(ROCKSDB_PLUGIN_EXTERNS)"/ util/build_version.cc.in @@ -769,9 +796,9 @@ SHARED3 = $(SHARED1) SHARED4 = $(SHARED1) SHARED = $(SHARED1) else -SHARED_MAJOR = $(ROCKSDB_MAJOR) -SHARED_MINOR = $(ROCKSDB_MINOR) -SHARED_PATCH = $(ROCKSDB_PATCH) +SHARED_MAJOR = $(VERSION_MAJOR) +SHARED_MINOR = $(VERSION_MINOR) +SHARED_PATCH = $(VERSION_PATCH) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) ifeq ($(PLATFORM), OS_MACOSX) SHARED_OSX = $(LIBNAME)$(LIBDEBUG).$(SHARED_MAJOR) @@ -843,18 +870,6 @@ coverage: clean # Delete intermediate files $(FIND) . -type f \( -name "*.gcda" -o -name "*.gcno" \) -exec rm -f {} \; -ifneq (,$(filter check parallel_check,$(MAKECMDGOALS)),) -# Use /dev/shm if it has the sticky bit set (otherwise, /tmp), -# and create a randomly-named rocksdb.XXXX directory therein. -# We'll use that directory in the "make check" rules. -ifeq ($(TMPD),) -TMPDIR := $(shell echo $${TMPDIR:-/tmp}) -TMPD := $(shell f=/dev/shm; test -k $$f || f=$(TMPDIR); \ - perl -le 'use File::Temp "tempdir";' \ - -e 'print tempdir("'$$f'/rocksdb.XXXX", CLEANUP => 0)') -endif -endif - # Run all tests in parallel, accumulating per-test logs in t/log-*. # # Each t/run-* file is a tiny generated bourne shell script that invokes one of @@ -883,29 +898,26 @@ endif # parallel_tests = $(patsubst %,parallel_%,$(PARALLEL_TEST)) -.PHONY: gen_parallel_tests $(parallel_tests) -$(parallel_tests): - $(AM_V_at)TEST_BINARY=$(patsubst parallel_%,%,$@); \ +.PHONY: $(parallel_tests) +$(parallel_tests): $(parallel_tests:parallel_%=%) + $(AM_V_at)mkdir -p t; \ + TEST_BINARY=$(patsubst parallel_%,%,$@); \ TEST_NAMES=` \ (./$$TEST_BINARY --gtest_list_tests || echo " $${TEST_BINARY}__list_tests_failure") \ | awk '/^[^ ]/ { prefix = $$1 } /^[ ]/ { print prefix $$1 }'`; \ echo " Generating parallel test scripts for $$TEST_BINARY"; \ + rm -f t/run-$${TEST_BINARY}-*; \ for TEST_NAME in $$TEST_NAMES; do \ - TEST_SCRIPT=t/run-$$TEST_BINARY-$${TEST_NAME//\//-}; \ + TEST_SCRIPT=run-$${TEST_BINARY}-$${TEST_NAME//\//-}; \ printf '%s\n' \ '#!/bin/sh' \ - "d=\$(TMPD)$$TEST_SCRIPT" \ - 'mkdir -p $$d' \ - "TEST_TMPDIR=\$$d $(DRIVER) ./$$TEST_BINARY --gtest_filter=$$TEST_NAME" \ - > $$TEST_SCRIPT; \ - chmod a=rx $$TEST_SCRIPT; \ + "d=\"$(TEST_TMPDIR)/$$TEST_SCRIPT\"" \ + 'mkdir -p "$$d"' \ + "TEST_TMPDIR=\"\$$d\" $(DRIVER) ./$$TEST_BINARY --gtest_filter=$$TEST_NAME && rm -rf \"\$$d\"" \ + > t/$$TEST_SCRIPT; \ + chmod a=rx t/$$TEST_SCRIPT; \ done -gen_parallel_tests: - $(AM_V_at)mkdir -p t - $(AM_V_at)$(FIND) t -type f -name 'run-*' -exec rm -f {} \; - $(MAKE) $(parallel_tests) - # Reorder input lines (which are one per test) so that the # longest-running tests appear first in the output. 
# Do this by prefixing each selected name with its duration, @@ -923,9 +935,9 @@ gen_parallel_tests: # 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest # slow_test_regexp = \ - ^.*MySQLStyleTransactionTest.*$$|^.*SnapshotConcurrentAccessTest.*$$|^.*SeqAdvanceConcurrentTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ + ^.*MySQLStyleTransactionTest.*$$\|^.*SnapshotConcurrentAccessTest.*$$\|^.*SeqAdvanceConcurrentTest.*$$\|^t/run-table_test-HarnessTest.Randomized$$\|^t/run-db_test-.*FileCreationRandomFailure$$\|^t/run-db_test-.*EncodeDecompressedBlockSizeTest$$\|^.*RecoverFromCorruptedWALWithoutFlush$$ prioritize_long_running_tests = \ - perl -pe 's,($(slow_test_regexp)),100 $$1,' \ + sed 's,\($(slow_test_regexp)\),100 \1,' \ | sort -k1,1gr \ | sed 's/^[.0-9]* //' @@ -936,6 +948,13 @@ prioritize_long_running_tests = \ # See "man parallel" for its "-j ..." option. J ?= 100% +PARALLEL ?= parallel +PARALLEL_OK := $(shell command -v "$(PARALLEL)" 2>&1 >/dev/null && \ + ("$(PARALLEL)" --gnu --version 2>/dev/null | grep -q 'Ole Tange') && \ + echo 1) +# Use a timeout of 10 minutes per test by default +TEST_TIMEOUT?=600 + # Use this regexp to select the subset of tests whose names match. tests-regexp = . EXCLUDE_TESTS_REGEX ?= "^$$" @@ -946,52 +965,61 @@ else ifeq ($(QUIET_PARALLEL_TESTS), 1) parallel_redir = >& t/$(test_log_prefix)log-{/} else # Default: print failure output only, as it happens -# Note: gnu_parallel --eta is now always used, but has been modified to provide -# only infrequent updates when not connected to a terminal. (CircleCI will -# kill a job if no output for 10min.) +# Note: parallel --eta is now always used because CircleCI will +# kill a job if no output for 10min. parallel_redir = >& t/$(test_log_prefix)log-{/} || bash -c "cat t/$(test_log_prefix)log-{/}; exit $$?" endif -.PHONY: check_0 -check_0: - $(AM_V_GEN)export TEST_TMPDIR=$(TMPD); \ - printf '%s\n' '' \ +.PHONY: check_0 check_1 +check_0: $(TESTS) $(parallel_tests) + $(AM_V_GEN)printf '%s\n' '' \ + 'Running tests in $(TEST_TMPDIR)' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ - { \ - printf './%s\n' $(filter-out $(PARALLEL_TEST),$(TESTS)); \ - find t -name 'run-*' -print; \ - } \ + printf './%s\n' $(filter-out $(PARALLEL_TEST),$(TESTS)) $(PARALLEL_TEST:%=t/run-%-*) \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ | grep -E -v '$(EXCLUDE_TESTS_REGEX)' \ - | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG --eta --gnu '{} $(parallel_redir)' ; \ + | "$(PARALLEL)" -j$(J) --plain --joblog=LOG --eta --gnu \ + --tmpdir=$(TEST_TMPDIR) --timeout=$(TEST_TIMEOUT) '{} $(parallel_redir)' ; \ parallel_retcode=$$? 
; \ awk '{ if ($$7 != 0 || $$8 != 0) { if ($$7 == "Exitval") { h = $$0; } else { if (!f) print h; print; f = 1 } } } END { if(f) exit 1; }' < LOG ; \ awk_retcode=$$?; \ - if [ $$parallel_retcode -ne 0 ] || [ $$awk_retcode -ne 0 ] ; then exit 1 ; fi + if [ $$parallel_retcode -ne 0 ] || [ $$awk_retcode -ne 0 ] ; then exit 1 ; fi; + +check_1: $(TESTS) + $(AM_V_GEN)for t in $(TESTS); do \ + echo "===== Running $$t (`date`)"; ./$$t || exit 1; \ + done; valgrind-exclude-regexp = InlineSkipTest.ConcurrentInsert|TransactionStressTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized|MySQLStyleTransactionTest.TransactionStressTest -.PHONY: valgrind_check_0 +.PHONY: valgrind_check_0 valgrind_check_1 valgrind_check_0: test_log_prefix := valgrind_ -valgrind_check_0: - $(AM_V_GEN)export TEST_TMPDIR=$(TMPD); \ - printf '%s\n' '' \ +valgrind_check_0: $(TESTS) $(parallel_tests) + $(AM_V_GEN)printf '%s\n' '' \ + 'Running tests in $(TEST_TMPDIR)' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ - { \ - printf './%s\n' $(filter-out $(PARALLEL_TEST) %skiplist_test options_settable_test, $(TESTS)); \ - find t -name 'run-*' -print; \ - } \ + printf './%s\n' $(filter-out $(PARALLEL_TEST) %skiplist_test options_settable_test, $(TESTS)) $(PARALLEL_TEST:%=t/run-%-*) \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ | grep -E -v '$(valgrind-exclude-regexp)' \ - | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG --eta --gnu \ - '(if [[ "{}" == "./"* ]] ; then $(DRIVER) {}; else {}; fi) \ + | "$(PARALLEL)" -j$(J) --plain --joblog=LOG --eta --gnu \ + --tmpdir=$(TEST_TMPDIR) --timeout=$(TEST_TIMEOUT) \ + '(if [[ "{}" == "./"* ]] ; then $(VALGRIND_VER) $(VALGRIND_OPTS) {}; else {}; fi) \ $(parallel_redir)' \ -CLEAN_FILES += t LOG $(TMPD) +valgrind_check_1: $(TESTS) + $(AM_V_GEN)for t in $(filter-out %skiplist_test options_settable_test,$(TESTS)); do \ + $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \ + ret_code=$$?; \ + if [ $$ret_code -ne 0 ]; then \ + exit $$ret_code; \ + fi; \ + done; + +CLEAN_FILES += t LOG # When running parallel "make check", you can monitor its progress # from another window. @@ -1001,25 +1029,14 @@ CLEAN_FILES += t LOG $(TMPD) # regardless of their duration. As with any use of "watch", hit ^C to # interrupt. watch-log: - $(WATCH) --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)' + $(WATCH) --interval=0 'tail -n+2 LOG|sort -k7,7nr -k4,4gr|$(subst ','\'',$(parallel_log_extract))' dump-log: - bash -c '$(quoted_perl_command)' < LOG + tail -n+2 LOG|$(parallel_log_extract) # If J != 1 and GNU parallel is installed, run the tests in parallel, -# via the check_0 rule above. Otherwise, run them sequentially. -check: all - $(MAKE) gen_parallel_tests - $(AM_V_GEN)if test "$(J)" != 1 \ - && (build_tools/gnu_parallel --gnu --help 2>/dev/null) | \ - grep -q 'GNU Parallel'; \ - then \ - $(MAKE) T="$$t" TMPD=$(TMPD) check_0; \ - else \ - for t in $(TESTS); do \ - echo "===== Running $$t (`date`)"; ./$$t || exit 1; done; \ - fi - rm -rf $(TMPD) +# via the check_0 rule above. Otherwise, run them sequentially via check_1. 
+check: all $(if $(shell [ "$(J)" != "1" ] && [ "$(PARALLEL_OK)" = "1" ] && echo 1),check_0,check_1) ifneq ($(PLATFORM), OS_AIX) $(PYTHON) tools/check_all_python.py ifeq ($(filter -DROCKSDB_LITE,$(OPT)),) @@ -1030,9 +1047,9 @@ endif endif endif ifndef SKIP_FORMAT_BUCK_CHECKS - $(MAKE) check-format - $(MAKE) check-buck-targets - $(MAKE) check-sources + build_tools/format-diff.sh -c + buckifier/check_buck_targets.sh + build_tools/check-sources.sh endif # TODO add ldb_tests @@ -1113,23 +1130,7 @@ valgrind_test: valgrind_test_some: ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some -valgrind_check: $(TESTS) - $(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests - $(AM_V_GEN)if test "$(J)" != 1 \ - && (build_tools/gnu_parallel --gnu --help 2>/dev/null) | \ - grep -q 'GNU Parallel'; \ - then \ - $(MAKE) TMPD=$(TMPD) \ - DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" valgrind_check_0; \ - else \ - for t in $(filter-out %skiplist_test options_settable_test,$(TESTS)); do \ - $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \ - ret_code=$$?; \ - if [ $$ret_code -ne 0 ]; then \ - exit $$ret_code; \ - fi; \ - done; \ - fi +valgrind_check: $(if $(shell [ "$(J)" != "1" ] && [ "$(PARALLEL_OK)" = "1" ] && echo 1),valgrind_check_0,valgrind_check_1) valgrind_check_some: $(ROCKSDBTESTS_SUBSET) for t in $(ROCKSDBTESTS_SUBSET); do \ @@ -1140,51 +1141,9 @@ valgrind_check_some: $(ROCKSDBTESTS_SUBSET) fi; \ done -ifneq ($(PAR_TEST),) -parloop: - ret_bad=0; \ - for t in $(PAR_TEST); do \ - echo "===== Running $$t in parallel $(NUM_PAR) (`date`)";\ - if [ $(db_test) -eq 1 ]; then \ - seq $(J) | v="$$t" build_tools/gnu_parallel --gnu --plain 's=$(TMPD)/rdb-{}; export TEST_TMPDIR=$$s;' \ - 'timeout 2m ./db_test --gtest_filter=$$v >> $$s/log-{} 2>1'; \ - else\ - seq $(J) | v="./$$t" build_tools/gnu_parallel --gnu --plain 's=$(TMPD)/rdb-{};' \ - 'export TEST_TMPDIR=$$s; timeout 10m $$v >> $$s/log-{} 2>1'; \ - fi; \ - ret_code=$$?; \ - if [ $$ret_code -ne 0 ]; then \ - ret_bad=$$ret_code; \ - echo $$t exited with $$ret_code; \ - fi; \ - done; \ - exit $$ret_bad; -endif - test_names = \ - ./db_test --gtest_list_tests \ - | perl -n \ - -e 's/ *\#.*//;' \ - -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};' \ - -e 'print qq! $$p$$2!' - -parallel_check: $(TESTS) - $(AM_V_GEN)if test "$(J)" > 1 \ - && (build_tools/gnu_parallel --gnu --help 2>/dev/null) | \ - grep -q 'GNU Parallel'; \ - then \ - echo Running in parallel $(J); \ - else \ - echo "Need to have GNU Parallel and J > 1"; exit 1; \ - fi; \ - ret_bad=0; \ - echo $(J);\ - echo Test Dir: $(TMPD); \ - seq $(J) | build_tools/gnu_parallel --gnu --plain 's=$(TMPD)/rdb-{}; rm -rf $$s; mkdir $$s'; \ - $(MAKE) PAR_TEST="$(shell $(test_names))" TMPD=$(TMPD) \ - J=$(J) db_test=1 parloop; \ - $(MAKE) PAR_TEST="$(filter-out db_test, $(TESTS))" \ - TMPD=$(TMPD) J=$(J) db_test=0 parloop; + ./db_test --gtest_list_tests | sed 's/ *\#.*//' | \ + awk '/^[^ ]/ { prefix = $$1 } /^[ ]/ { print prefix $$1 }' analyze: clean USE_CLANG=1 $(MAKE) analyze_incremental @@ -1215,8 +1174,8 @@ unity_test: $(OBJ_DIR)/db/db_basic_test.o $(OBJ_DIR)/db/db_test_util.o $(TEST_OB $(AM_LINK) ./unity_test -rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc - build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc +$(PROJECT_NAME).h $(PROJECT_NAME).cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc + build_tools/amalgamate.py -I. 
-i./include unity.cc -x include/rocksdb/c.h -H $(PROJECT_NAME).h -o $(PROJECT_NAME).cc clean: clean-ext-libraries-all clean-rocks clean-rocksjava @@ -1268,7 +1227,7 @@ check-sources: build_tools/check-sources.sh package: - bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR) + bash build_tools/make_package.sh $(VERSION_MAJOR).$(VERSION_MINOR) # --------------------------------------------------------------------------- # Unit tests and tools @@ -1301,7 +1260,7 @@ $(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(SHA $(AM_V_AR)rm -f $@ $(STATIC_STRESS_LIBRARY) $(AM_SHARE) -librocksdb_env_basic_test.a: $(OBJ_DIR)/env/env_basic_test.o $(LIB_OBJECTS) $(TESTHARNESS) +lib$(PROJECT_NAME)_env_basic_test.a: $(OBJ_DIR)/env/env_basic_test.o $(LIB_OBJECTS) $(TESTHARNESS) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ @@ -1338,6 +1297,14 @@ db_sanity_test: $(OBJ_DIR)/tools/db_sanity_test.o $(LIBRARY) db_repl_stress: $(OBJ_DIR)/tools/db_repl_stress.o $(LIBRARY) $(AM_LINK) +define MakeTestRule +$(notdir $(1:%.cc=%)): $(1:%.cc=$$(OBJ_DIR)/%.o) $$(TEST_LIBRARY) $$(LIBRARY) + $$(AM_LINK) +endef + +# For each PLUGIN test, create a rule to generate the test executable +$(foreach test, $(ROCKSDB_PLUGIN_TESTS), $(eval $(call MakeTestRule, $(test)))) + arena_test: $(OBJ_DIR)/memory/arena_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1731,10 +1698,10 @@ deletefile_test: $(OBJ_DIR)/db/deletefile_test.o $(TEST_LIBRARY) $(LIBRARY) obsolete_files_test: $(OBJ_DIR)/db/obsolete_files_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -rocksdb_dump: $(OBJ_DIR)/tools/dump/rocksdb_dump.o $(LIBRARY) +$(PROJECT_NAME)_dump: $(OBJ_DIR)/tools/dump/rocksdb_dump.o $(LIBRARY) $(AM_LINK) -rocksdb_undump: $(OBJ_DIR)/tools/dump/rocksdb_undump.o $(LIBRARY) +$(PROJECT_NAME)_undump: $(OBJ_DIR)/tools/dump/rocksdb_undump.o $(LIBRARY) $(AM_LINK) cuckoo_table_builder_test: $(OBJ_DIR)/table/cuckoo/cuckoo_table_builder_test.o $(TEST_LIBRARY) $(LIBRARY) @@ -1965,7 +1932,7 @@ uninstall: $(INSTALL_LIBDIR)/$(SHARED3) \ $(INSTALL_LIBDIR)/$(SHARED2) \ $(INSTALL_LIBDIR)/$(SHARED1) \ - $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc + $(INSTALL_LIBDIR)/pkgconfig/$(PROJECT_NAME).pc install-headers: gen-pc install -d $(INSTALL_LIBDIR) @@ -1980,7 +1947,7 @@ install-headers: gen-pc install -d $(DESTDIR)/$(PREFIX)/include/rocksdb/`dirname $$header`; \ install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/include/rocksdb/$$header; \ done - install -C -m 644 rocksdb.pc $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc + install -C -m 644 $(PROJECT_NAME).pc $(INSTALL_LIBDIR)/pkgconfig/$(PROJECT_NAME).pc install-static: install-headers $(LIBRARY) install -d $(INSTALL_LIBDIR) @@ -1999,18 +1966,19 @@ install: install-static # Generate the pkg-config file gen-pc: - -echo 'prefix=$(PREFIX)' > rocksdb.pc - -echo 'exec_prefix=$${prefix}' >> rocksdb.pc - -echo 'includedir=$${prefix}/include' >> rocksdb.pc - -echo 'libdir=$(LIBDIR)' >> rocksdb.pc - -echo '' >> rocksdb.pc - -echo 'Name: rocksdb' >> rocksdb.pc - -echo 'Description: An embeddable persistent key-value store for fast storage' >> rocksdb.pc - -echo Version: $(shell ./build_tools/version.sh full) >> rocksdb.pc - -echo 'Libs: -L$${libdir} $(EXEC_LDFLAGS) -lrocksdb' >> rocksdb.pc - -echo 'Libs.private: $(PLATFORM_LDFLAGS)' >> rocksdb.pc - -echo 'Cflags: -I$${includedir} $(PLATFORM_CXXFLAGS)' >> rocksdb.pc - -echo 'Requires: $(subst ",,$(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))' >> rocksdb.pc + $(AM_V_GEN)printf '%s\n' \ + 'prefix=$(PREFIX)' \ + 'exec_prefix=$${prefix}' \ + 
'includedir=$${prefix}/include' \ + 'libdir=$(LIBDIR)' \ + '' \ + 'Name: $(PROJECT_NAME)' \ + 'Description: An embeddable persistent key-value store for fast storage' \ + 'Version: $(shell ./build_tools/version.sh full)' \ + 'Libs: -L$${libdir} $(EXEC_LDFLAGS) -lrocksdb' \ + 'Libs.private: $(PLATFORM_LDFLAGS)' \ + 'Cflags: -I$${includedir} $(PLATFORM_CXXFLAGS)' \ + 'Requires: $(subst ",,$(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))' > $(PROJECT_NAME).pc #------------------------------------------------- @@ -2042,22 +2010,22 @@ ifneq ($(origin JNI_LIBC), undefined) JNI_LIBC_POSTFIX = -$(JNI_LIBC) endif -ifeq (,$(ROCKSDBJNILIB)) +ifeq (,$(JNILIBNAME)) ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64, $(MACHINE))) - ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so + JNILIBNAME = lib$(PROJECT_NAME)jni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so else - ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so + JNILIBNAME = lib$(PROJECT_NAME)jni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so endif endif -ROCKSDB_JAVA_VERSION ?= $(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar -ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-javadoc.jar -ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-sources.jar +LIB_JAVA_VERSION ?= $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH) +LIB_JAR = $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar +LIB_JAR_ALL = $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar +LIB_JAVADOCS_JAR = $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-javadoc.jar +LIB_SOURCES_JAR = $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-sources.jar SHA256_CMD = sha256sum -ZLIB_VER ?= 1.2.12 -ZLIB_SHA256 ?= 91844808532e5ce316b3c010929493c0244f3d37593afd6de04f71821d5136d9 +ZLIB_VER ?= 1.2.13 +ZLIB_SHA256 ?= b3a24de97a8fdbc835b9833169501030b8977031bcb54b3b3ac13740f846ab30 ZLIB_DOWNLOAD_BASE ?= http://zlib.net BZIP2_VER ?= 1.0.8 BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269 @@ -2074,16 +2042,16 @@ ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 ifeq ($(PLATFORM), OS_MACOSX) -ifeq (,$(findstring librocksdbjni-osx,$(ROCKSDBJNILIB))) +ifeq (,$(findstring lib$(PROJECT_NAME)jni-osx,$(JNILIBNAME))) ifeq ($(MACHINE),arm64) - ROCKSDBJNILIB = librocksdbjni-osx-arm64.jnilib + JNILIBNAME = lib$(PROJECT_NAME)jni-osx-arm64.jnilib else ifeq ($(MACHINE),x86_64) - ROCKSDBJNILIB = librocksdbjni-osx-x86_64.jnilib + JNILIBNAME = lib$(PROJECT_NAME)jni-osx-x86_64.jnilib else - ROCKSDBJNILIB = librocksdbjni-osx.jnilib + JNILIBNAME = lib$(PROJECT_NAME)jni-osx.jnilib endif endif - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-osx.jar + LIB_JAR = $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-osx.jar SHA256_CMD = openssl sha256 -r ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","") JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin @@ -2094,25 +2062,25 @@ endif ifeq ($(PLATFORM), OS_FREEBSD) JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/freebsd - ROCKSDBJNILIB = librocksdbjni-freebsd$(ARCH).so - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-freebsd$(ARCH).jar + JNILIBNAME = lib$(PROJECT_NAME)jni-freebsd$(ARCH).so + LIB_JAR = $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-freebsd$(ARCH).jar endif ifeq ($(PLATFORM), OS_SOLARIS) - ROCKSDBJNILIB = librocksdbjni-solaris$(ARCH).so - ROCKSDB_JAR = 
rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-solaris$(ARCH).jar + JNILIBNAME = lib$(PROJECT_NAME)jni-solaris$(ARCH).so + LIB_JAR = $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-solaris$(ARCH).jar JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/solaris SHA256_CMD = digest -a sha256 endif ifeq ($(PLATFORM), OS_AIX) JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/aix - ROCKSDBJNILIB = librocksdbjni-aix.so + JNILIBNAME = lib$(PROJECT_NAME)jni-aix.so EXTRACT_SOURCES = gunzip < TAR_GZ | tar xvf - SNAPPY_MAKE_TARGET = libsnappy.la endif ifeq ($(PLATFORM), OS_OPENBSD) JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd - ROCKSDBJNILIB = librocksdbjni-openbsd$(ARCH).so - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar + JNILIBNAME = lib$(PROJECT_NAME)jni-openbsd$(ARCH).so + LIB_JAR = $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-openbsd$(ARCH).jar endif zlib-$(ZLIB_VER).tar.gz: @@ -2212,17 +2180,17 @@ endif $(MAKE) rocksdbjava_jar rocksdbjavastaticosx: rocksdbjavastaticosx_archs - cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md - cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib - cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class - openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + cd java; $(JAR_CMD) -cf target/$(LIB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(LIB_JAR) lib$(PROJECT_NAME)jni-osx-x86_64.jnilib lib$(PROJECT_NAME)jni-osx-arm64.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(LIB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(LIB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(LIB_JAR).sha1 rocksdbjavastaticosx_ub: rocksdbjavastaticosx_archs - cd java/target; lipo -create -output librocksdbjni-osx.jnilib librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib - cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md - cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx.jnilib - cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class - openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + cd java/target; lipo -create -output lib$(PROJECT_NAME)jni-osx.jnilib lib$(PROJECT_NAME)jni-osx-x86_64.jnilib lib$(PROJECT_NAME)jni-osx-arm64.jnilib + cd java; $(JAR_CMD) -cf target/$(LIB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(LIB_JAR) lib$(PROJECT_NAME)jni-osx.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(LIB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(LIB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(LIB_JAR).sha1 rocksdbjavastaticosx_archs: $(MAKE) rocksdbjavastaticosx_arch_x86_64 @@ -2236,7 +2204,7 @@ endif $(MAKE) clean-rocks ARCHFLAG="-arch $*" $(MAKE) rocksdbjavastatic_deps ARCHFLAG="-arch $*" $(MAKE) rocksdbjavastatic_libobjects - ARCHFLAG="-arch $*" ROCKSDBJNILIB="librocksdbjni-osx-$*.jnilib" $(MAKE) rocksdbjavastatic_javalib + ARCHFLAG="-arch $*" JNILIBNAME="lib$(PROJECT_NAME)jni-osx-$*.jnilib" $(MAKE) rocksdbjavastatic_javalib ifeq ($(JAR_CMD),) ifneq ($(JAVA_HOME),) @@ -2247,28 +2215,28 @@ endif endif rocksdbjavastatic_javalib: cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib - rm -f java/target/$(ROCKSDBJNILIB) + rm -f java/target/$(JNILIBNAME) $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC \ - -o ./java/target/$(ROCKSDBJNILIB) $(ALL_JNI_NATIVE_SOURCES) \ + -o ./java/target/$(JNILIBNAME) $(ALL_JNI_NATIVE_SOURCES) \ $(LIB_OBJECTS) $(COVERAGEFLAGS) \ $(JAVA_COMPRESSIONS) $(JAVA_STATIC_LDFLAGS) cd java/target;if [ "$(DEBUG_LEVEL)" == "0" ]; then \ - strip $(STRIPFLAGS) $(ROCKSDBJNILIB); \ + strip $(STRIPFLAGS) $(JNILIBNAME); \ fi rocksdbjava_jar: - cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md - cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) - cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class - openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + cd java; $(JAR_CMD) -cf target/$(LIB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(LIB_JAR) $(JNILIBNAME) + cd java/target/classes; $(JAR_CMD) -uf ../$(LIB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(LIB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(LIB_JAR).sha1 rocksdbjava_javadocs_jar: - cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) * - openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1 + cd java/target/apidocs; $(JAR_CMD) -cf ../$(LIB_JAVADOCS_JAR) * + openssl sha1 java/target/$(LIB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(LIB_JAVADOCS_JAR).sha1 rocksdbjava_sources_jar: - cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org - openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1 + cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(LIB_SOURCES_JAR) org + openssl sha1 java/target/$(LIB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(LIB_SOURCES_JAR).sha1 rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS) @@ -2276,16 +2244,16 @@ rocksdbjavastatic_libobjects: $(LIB_OBJECTS) rocksdbjavastaticrelease: rocksdbjavastaticosx rocksdbjava_javadocs_jar rocksdbjava_sources_jar cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl - cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class - openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1 + cd java; $(JAR_CMD) -cf target/$(LIB_JAR_ALL) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(LIB_JAR_ALL) lib$(PROJECT_NAME)jni-*.so lib$(PROJECT_NAME)jni-*.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(LIB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(LIB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(LIB_JAR_ALL).sha1 rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl rocksdbjava_javadocs_jar rocksdbjava_sources_jar - cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class - openssl sha1 
java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1 + cd java; $(JAR_CMD) -cf target/$(LIB_JAR_ALL) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(LIB_JAR_ALL) lib$(PROJECT_NAME)jni-*.so lib$(PROJECT_NAME)jni-*.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(LIB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(LIB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(LIB_JAR_ALL).sha1 rocksdbjavastaticdockerx86: mkdir -p java/target @@ -2331,29 +2299,29 @@ rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentr rocksdbjavastaticpublishdocker: rocksdbjavastaticreleasedocker rocksdbjavastaticpublishcentral -ROCKSDB_JAVA_RELEASE_CLASSIFIERS = javadoc sources linux64 linux32 linux64-musl linux32-musl osx win64 +LIB_JAVA_RELEASE_CLASSIFIERS = javadoc sources linux64 linux32 linux64-musl linux32-musl osx win64 rocksdbjavastaticpublishcentral: rocksdbjavageneratepom - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar - $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -Dclassifier=$(classifier);) + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(LIB_JAVA_VERSION).jar + $(foreach classifier, $(LIB_JAVA_RELEASE_CLASSIFIERS), mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-$(classifier).jar -Dclassifier=$(classifier);) rocksdbjavageneratepom: - cd java;cat pom.xml.template | sed 's/\$${ROCKSDB_JAVA_VERSION}/$(ROCKSDB_JAVA_VERSION)/' > pom.xml + cd java;cat pom.xml.template | sed 's/\$${LIB_JAVA_VERSION}/$(LIB_JAVA_VERSION)/' > pom.xml rocksdbjavastaticnexusbundlejar: rocksdbjavageneratepom openssl sha1 -r java/pom.xml | awk '{ print $$1 }' > java/target/pom.xml.sha1 - openssl sha1 -r java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar | awk '{ print $$1 }' > java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.sha1 - $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), openssl sha1 -r java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar | awk '{ print $$1 }' > java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.sha1;) + openssl sha1 -r java/target/$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar | awk '{ print $$1 }' > java/target/$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar.sha1 + $(foreach classifier, $(LIB_JAVA_RELEASE_CLASSIFIERS), openssl sha1 -r java/target/$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-$(classifier).jar | awk '{ print $$1 }' > java/target/$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-$(classifier).jar.sha1;) gpg --yes --output java/target/pom.xml.asc -ab java/pom.xml - gpg --yes -ab java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar - $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), gpg --yes -ab java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar;) - $(JAR_CMD) cvf java/target/nexus-bundle-rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C 
java pom.xml -C java/target pom.xml.sha1 -C java/target pom.xml.asc -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.sha1 -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.asc - $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), $(JAR_CMD) uf java/target/nexus-bundle-rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.sha1 -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.asc;) + gpg --yes -ab java/target/$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar + $(foreach classifier, $(LIB_JAVA_RELEASE_CLASSIFIERS), gpg --yes -ab java/target/$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-$(classifier).jar;) + $(JAR_CMD) cvf java/target/nexus-bundle-$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar -C java pom.xml -C java/target pom.xml.sha1 -C java/target pom.xml.asc -C java/target $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar -C java/target $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar.sha1 -C java/target $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar.asc + $(foreach classifier, $(LIB_JAVA_RELEASE_CLASSIFIERS), $(JAR_CMD) uf java/target/nexus-bundle-$(PROJECT_NAME)jni-$(LIB_JAVA_VERSION).jar -C java/target $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-$(classifier).jar -C java/target $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-$(classifier).jar.sha1 -C java/target $(PROJECT_NAME)jni-$(LIB_JAVA_VERSION)-$(classifier).jar.asc;) # A version of each $(LIBOBJECTS) compiled with -fPIC -jl/%.o: %.cc +jl/%.o: %.cc make_config.mk $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS) rocksdbjava: $(LIB_OBJECTS) @@ -2361,12 +2329,12 @@ ifeq ($(JAVA_HOME),) $(error JAVA_HOME is not set) endif $(AM_V_GEN)cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib; - $(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB) - $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. -I./java/rocksjni $(JAVA_INCLUDE) $(ROCKSDB_PLUGIN_JNI_CXX_INCLUDEFLAGS) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(ALL_JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) - $(AM_V_at)cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md - $(AM_V_at)cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) - $(AM_V_at)cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class - $(AM_V_at)openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + $(AM_V_at)rm -f ./java/target/$(JNILIBNAME) + $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. -I./java/rocksjni $(JAVA_INCLUDE) $(ROCKSDB_PLUGIN_JNI_CXX_INCLUDEFLAGS) -shared -fPIC -o ./java/target/$(JNILIBNAME) $(ALL_JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) + $(AM_V_at)cd java; $(JAR_CMD) -cf target/$(LIB_JAR) HISTORY*.md + $(AM_V_at)cd java/target; $(JAR_CMD) -uf $(LIB_JAR) $(JNILIBNAME) + $(AM_V_at)cd java/target/classes; $(JAR_CMD) -uf ../$(LIB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + $(AM_V_at)openssl sha1 java/target/$(LIB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(LIB_JAR).sha1 jclean: cd java;$(MAKE) clean; @@ -2403,7 +2371,7 @@ checkout_folly: cd third-party/folly && git reset --hard 98b9b2c1124e99f50f9085ddee74ce32afffc665 @# A hack to remove boost dependency. 
@# NOTE: this hack is not needed if using FBCODE compiler config - perl -pi -e 's/^(#include + + + + + -[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb) -[![TravisCI Status](https://api.travis-ci.com/facebook/rocksdb.svg?branch=main)](https://travis-ci.com/github/facebook/rocksdb) -[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main) -[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb) +
-RocksDB is developed and maintained by Facebook Database Engineering Team. -It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com) -and Jeff Dean (jeff@google.com) +![GitHub](https://img.shields.io/github/license/speedb-io/speedb) +![GitHub contributors](https://img.shields.io/github/contributors/speedb-io/speedb?color=blue) +![GitHub pull requests](https://img.shields.io/github/issues-pr/speedb-io/speedb) +![GitHub closed pull requests](https://img.shields.io/github/issues-pr-closed/speedb-io/speedb?color=green) +
-This code is a library that forms the core building block for a fast
-key-value server, especially suited for storing data on flash drives.
-It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
-between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
-and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
-making it especially suitable for storing multiple terabytes of data in a
-single database.
+# Speedb
+Speedb is a library that provides persistent key-value stores. Keys and values are arbitrary byte arrays. The keys are ordered within the key-value store according to a user-specified comparator function.
+It is maintained by Speedb and is forked from RocksDB, which is developed by Facebook.
-Start with example usage here: https://github.com/facebook/rocksdb/tree/main/examples
+
+
+
+
-See the [github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation.
-The public interface is in `include/`. Callers should not include or
-rely on the details of any other header files in this package. Those
-internal APIs may be changed without warning.
+## Usage
+* If Speedb is in your default library path:
+
+
+  In your `CMakeLists.txt` add:
+  ```
+  target_link_libraries(${PROJECT_NAME} speedb)
+  ```
+  where `PROJECT_NAME` is the name of your target application that uses Speedb.
+
+* Otherwise, you have to provide the path to the folder containing the library, like so:
+
+  ```
+  target_link_libraries(${PROJECT_NAME} /path/to/speedb/library/folder)
+  ```
+
+
+Usage of the library in your code is the same regardless of whether you linked the library statically or dynamically, and examples can be found under the [examples](examples) directory. A minimal end-to-end sketch is also included below, after the Questions section.
+The public interface is in [include](include/rocksdb). Callers should not include or rely on the details of any other header files in this package. Those internal APIs may be changed without warning.
+
+
+## Build dependencies
+
+Please refer to the file [INSTALL.md](INSTALL.md) for a list of all the
+dependencies and how to install them across different platforms.
+
+
+## 🔨 Building Speedb
+
+Debug:
+
+    mkdir build && cd build
+    cmake .. -DCMAKE_BUILD_TYPE=Debug [cmake options]
+    make speedb
+
+By default, the build type is Debug.
+
+Release:
+
+    mkdir build && cd build
+    cmake .. -DCMAKE_BUILD_TYPE=Release [cmake options]
+    make speedb
+
+This will build the static library. If you want to build the dynamic library,
+use:
+
+    make rocksdb-shared
+
+If you want `make` to use more cores for building, simply use
+the `-j` option.
+
+If you want to build a specific target:
+
+    make [target name]
+
+For development and functional testing, go with the debug version, which includes
+more assertions and debug prints. Otherwise, for production or performance
+testing, we recommend building a release version, which is more optimized.
+
+## 📈 Performance
+
+We use db_bench to test performance and track progress between versions. It is available under `tools` and is also included in the release artifacts for direct download.
+There you will also find a readme with the commands we use, to get you started.
+
+[Performance chart (screenshot, 2022-10-31): shorturl.at/dlKOY]
+
+## Questions
+
+- For live discussion with the community you can use our official [Discord channel](https://discord.gg/52yzKZ5G9D).
+- For technical questions and discussions you can use our official [Discourse forum](https://speedb.discourse.group/).
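+
+## Minimal usage example
+
+The snippet below is an illustrative sketch, not a copy of anything in the [examples](examples) directory: it opens a store and performs one write and one read through the RocksDB-compatible API that Speedb exposes. The headers and the `rocksdb` namespace are the standard RocksDB ones that Speedb inherits, and the database path is only an example, so adapt both to your setup.
+
+```cpp
+#include <cassert>
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+int main() {
+  rocksdb::Options options;
+  options.create_if_missing = true;  // create the store on first use
+
+  // Open (or create) a database under an example path
+  rocksdb::DB* db = nullptr;
+  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/speedb_example", &db);
+  assert(s.ok());
+
+  // Write a key-value pair and read it back
+  s = db->Put(rocksdb::WriteOptions(), "hello", "world");
+  assert(s.ok());
+
+  std::string value;
+  s = db->Get(rocksdb::ReadOptions(), "hello", &value);
+  assert(s.ok() && value == "world");
+
+  delete db;  // close the database
+  return 0;
+}
+```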
+ + +## 🌎 Join us + +Speedb is committed to a welcoming and inclusive environment where everyone can +contribute. + + +## Contributing code + +See the [contributing guide](CONTRIBUTING.md). -Questions and discussions are welcome on the [RocksDB Developers Public](https://www.facebook.com/groups/rocksdb.dev/) Facebook group and [email list](https://groups.google.com/g/rocksdb) on Google Groups. ## License +Speedb is open-source and licensed under the [Apache 2.0 License](LICENSE.Apache). + -RocksDB is dual-licensed under both the GPLv2 (found in the COPYING file in the root directory) and Apache 2.0 License (found in the LICENSE.Apache file in the root directory). You may select, at your option, one of the above-listed licenses. + diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 0c53c416b7..337d376e73 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -72,10 +72,6 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then fi fi -# Delete existing output, if it exists -rm -f "$OUTPUT" -touch "$OUTPUT" - if test -z "$CC"; then if [ -x "$(command -v cc)" ]; then CC=cc @@ -140,9 +136,6 @@ PLATFORM_SHARED_LDFLAGS="-Wl,--no-as-needed -shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" PLATFORM_SHARED_VERSIONED=true -# generic port files (working on all platform by #ifdef) go directly in /port -GENERIC_PORT_FILES=`cd "$ROCKSDB_ROOT"; find port -name '*.cc' | tr "\n" " "` - # On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp case "$TARGET_OS" in Darwin) @@ -150,7 +143,6 @@ case "$TARGET_OS" in COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX" PLATFORM_SHARED_EXT=dylib PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " - # PORT_FILES=port/darwin/darwin_specific.cc ;; IOS) PLATFORM=IOS @@ -187,27 +179,23 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_IOURING_PRESENT" fi fi - # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) PLATFORM=OS_SOLARIS COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -static-libstdc++ -static-libgcc -m64" - # PORT_FILES=port/sunos/sunos_specific.cc ;; AIX) PLATFORM=OS_AIX CC=gcc COMMON_FLAGS="$COMMON_FLAGS -maix64 -pthread -fno-builtin-memcmp -D_REENTRANT -DOS_AIX -D__STDC_FORMAT_MACROS" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread -lpthread -lrt -maix64 -static-libstdc++ -static-libgcc" - # PORT_FILES=port/aix/aix_specific.cc ;; FreeBSD) PLATFORM=OS_FREEBSD CXX=clang++ COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" - # PORT_FILES=port/freebsd/freebsd_specific.cc ;; GNU/kFreeBSD) PLATFORM=OS_GNU_KFREEBSD @@ -218,28 +206,24 @@ EOF PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" - # PORT_FILES=port/gnu_kfreebsd/gnu_kfreebsd_specific.cc ;; NetBSD) PLATFORM=OS_NETBSD COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s" - # PORT_FILES=port/netbsd/netbsd_specific.cc ;; OpenBSD) PLATFORM=OS_OPENBSD CXX=clang++ COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread" - # PORT_FILES=port/openbsd/openbsd_specific.cc - FIND=gfind - WATCH=gnuwatch + FIND=gfind + WATCH=gnuwatch ;; DragonFly) PLATFORM=OS_DRAGONFLYBSD COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS 
-lpthread" - # PORT_FILES=port/dragonfly/dragonfly_specific.cc ;; Cygwin) PLATFORM=CYGWIN @@ -252,13 +236,11 @@ EOF PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" - # PORT_FILES=port/linux/linux_specific.cc ;; OS_ANDROID_CROSSCOMPILE) PLATFORM=OS_ANDROID - COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DROCKSDB_PLATFORM_POSIX" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library - # PORT_FILES=port/android/android.cc + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DROCKSDB_PLATFORM_POSIX" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library CROSS_COMPILE=true ;; *) @@ -833,56 +815,67 @@ rm -f test.o test_dl.o PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" -VALGRIND_VER="$VALGRIND_VER" - -ROCKSDB_MAJOR=`build_tools/version.sh major` -ROCKSDB_MINOR=`build_tools/version.sh minor` -ROCKSDB_PATCH=`build_tools/version.sh patch` - -echo "CC=$CC" >> "$OUTPUT" -echo "CXX=$CXX" >> "$OUTPUT" -echo "AR=$AR" >> "$OUTPUT" -echo "PLATFORM=$PLATFORM" >> "$OUTPUT" -echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT" -echo "PLATFORM_CMAKE_FLAGS=$PLATFORM_CMAKE_FLAGS" >> "$OUTPUT" -echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT" -echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT" -echo "JAVA_STATIC_DEPS_CCFLAGS=$JAVA_STATIC_DEPS_CCFLAGS" >> "$OUTPUT" -echo "JAVA_STATIC_DEPS_CXXFLAGS=$JAVA_STATIC_DEPS_CXXFLAGS" >> "$OUTPUT" -echo "JAVA_STATIC_DEPS_LDFLAGS=$JAVA_STATIC_DEPS_LDFLAGS" >> "$OUTPUT" -echo "JAVAC_ARGS=$JAVAC_ARGS" >> "$OUTPUT" -echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT" -echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT" -echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT" -echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> "$OUTPUT" -echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> "$OUTPUT" -echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> "$OUTPUT" -echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT" -echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT" -echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT" -echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT" -echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT" -echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT" -echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" -echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT" -echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT" -echo "PROFILING_FLAGS=$PROFILING_FLAGS" >> "$OUTPUT" -echo "FIND=$FIND" >> "$OUTPUT" -echo "WATCH=$WATCH" >> "$OUTPUT" -# This will enable some related identifiers for the preprocessor -if test -n "$JEMALLOC"; then - echo "JEMALLOC=1" >> "$OUTPUT" -fi -# Indicates that jemalloc should be enabled using -ljemalloc flag -# The alternative is to porvide a direct link to the library via JEMALLOC_LIB -# and JEMALLOC_INCLUDE -if test -n "$WITH_JEMALLOC_FLAG"; then - echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT" -fi -echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" -if test -n "$USE_FOLLY"; then - echo "USE_FOLLY=$USE_FOLLY" >> "$OUTPUT" -fi -if test -n "$PPC_LIBC_IS_GNU"; then - echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT" +VERSION_MAJOR="$(build_tools/version.sh major)" +VERSION_MINOR="$(build_tools/version.sh minor)" +VERSION_PATCH="$(build_tools/version.sh patch)" + +TMP_OUTPUT="${OUTPUT}.tmp" + +{ + echo "CC=$CC" + echo "CXX=$CXX" + echo "AR=$AR" + echo "PLATFORM=$PLATFORM" + echo 
"PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" + echo "PLATFORM_CMAKE_FLAGS=$PLATFORM_CMAKE_FLAGS" + echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" + echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" + echo "JAVA_STATIC_DEPS_CCFLAGS=$JAVA_STATIC_DEPS_CCFLAGS" + echo "JAVA_STATIC_DEPS_CXXFLAGS=$JAVA_STATIC_DEPS_CXXFLAGS" + echo "JAVA_STATIC_DEPS_LDFLAGS=$JAVA_STATIC_DEPS_LDFLAGS" + echo "JAVAC_ARGS=$JAVAC_ARGS" + echo "VALGRIND_VER=$VALGRIND_VER" + echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" + echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" + echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" + echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" + echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" + echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" + echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" + echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" + echo "JEMALLOC_LIB=$JEMALLOC_LIB" + echo "LIBNAME=$LIBNAME" + echo "VERSION_MAJOR=$VERSION_MAJOR" + echo "VERSION_MINOR=$VERSION_MINOR" + echo "VERSION_PATCH=$VERSION_PATCH" + echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" + echo "CLANG_ANALYZER=$CLANG_ANALYZER" + echo "PROFILING_FLAGS=$PROFILING_FLAGS" + echo "FIND=$FIND" + echo "WATCH=$WATCH" + # This will enable some related identifiers for the preprocessor + if test -n "$JEMALLOC"; then + echo "JEMALLOC=1" + fi + # Indicates that jemalloc should be enabled using -ljemalloc flag + # The alternative is to porvide a direct link to the library via JEMALLOC_LIB + # and JEMALLOC_INCLUDE + if test -n "$WITH_JEMALLOC_FLAG"; then + echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" + fi + echo "LUA_PATH=$LUA_PATH" + if test -n "$USE_FOLLY"; then + echo "USE_FOLLY=$USE_FOLLY" + fi + if test -n "$PPC_LIBC_IS_GNU"; then + echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" + fi +} > "$TMP_OUTPUT" + +# Avoid blindly creating the output file and updating its timestamp when there's +# no need for it +if [ ! -f "$OUTPUT" ] || ! cmp -s "$OUTPUT" "$TMP_OUTPUT"; then + mv "$TMP_OUTPUT" "$OUTPUT" +else + rm -f "$TMP_OUTPUT" fi diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index 3c16b08eba..c0c44e3fb1 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -122,12 +122,12 @@ uncommitted_code=`git diff HEAD` # If there's no uncommitted changes, we assume user are doing post-commit # format check, in which case we'll try to check the modified lines vs. the -# facebook/rocksdb.git main branch. Otherwise, we'll check format of the +# speedb-io/speedb.git main branch. Otherwise, we'll check format of the # uncommitted code only. if [ -z "$uncommitted_code" ] then - # Attempt to get name of facebook/rocksdb.git remote. - [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(LC_ALL=POSIX LANG=POSIX git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)" + # Attempt to get name of speedb-io/speedb.git remote. + [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(LC_ALL=POSIX LANG=POSIX git remote -v | grep 'speedb-io/speedb.git' | head -n 1 | cut -f 1)" # Fall back on 'origin' if that fails [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE=origin # Use main branch from that remote diff --git a/build_tools/gnu_parallel b/build_tools/gnu_parallel deleted file mode 100755 index 757b25f11d..0000000000 --- a/build_tools/gnu_parallel +++ /dev/null @@ -1,7970 +0,0 @@ -#!/usr/bin/env perl - -# Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014 Ole Tange and -# Free Software Foundation, Inc. 
-# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, see -# or write to the Free Software Foundation, Inc., 51 Franklin St, -# Fifth Floor, Boston, MA 02110-1301 USA - -# open3 used in Job::start -use IPC::Open3; -# &WNOHANG used in reaper -use POSIX qw(:sys_wait_h setsid ceil :errno_h); -# gensym used in Job::start -use Symbol qw(gensym); -# tempfile used in Job::start -use File::Temp qw(tempfile tempdir); -# mkpath used in openresultsfile -use File::Path; -# GetOptions used in get_options_from_array -use Getopt::Long; -# Used to ensure code quality -use strict; -use File::Basename; - -if(not $ENV{HOME}) { - # $ENV{HOME} is sometimes not set if called from PHP - ::warning("\$HOME not set. Using /tmp\n"); - $ENV{HOME} = "/tmp"; -} - -save_stdin_stdout_stderr(); -save_original_signal_handler(); -parse_options(); -::debug("init", "Open file descriptors: ", join(" ",keys %Global::fd), "\n"); -my $number_of_args; -if($Global::max_number_of_args) { - $number_of_args=$Global::max_number_of_args; -} elsif ($opt::X or $opt::m or $opt::xargs) { - $number_of_args = undef; -} else { - $number_of_args = 1; -} - -my @command; -@command = @ARGV; - -my @fhlist; -if($opt::pipepart) { - @fhlist = map { open_or_exit($_) } "/dev/null"; -} else { - @fhlist = map { open_or_exit($_) } @opt::a; - if(not @fhlist and not $opt::pipe) { - @fhlist = (*STDIN); - } -} - -if($opt::skip_first_line) { - # Skip the first line for the first file handle - my $fh = $fhlist[0]; - <$fh>; -} -if($opt::header and not $opt::pipe) { - my $fh = $fhlist[0]; - # split with colsep or \t - # $header force $colsep = \t if undef? - my $delimiter = $opt::colsep; - $delimiter ||= "\$"; - my $id = 1; - for my $fh (@fhlist) { - my $line = <$fh>; - chomp($line); - ::debug("init", "Delimiter: '$delimiter'"); - for my $s (split /$delimiter/o, $line) { - ::debug("init", "Colname: '$s'"); - # Replace {colname} with {2} - # TODO accept configurable short hands - # TODO how to deal with headers in {=...=} - for(@command) { - s:\{$s(|/|//|\.|/\.)\}:\{$id$1\}:g; - } - $Global::input_source_header{$id} = $s; - $id++; - } - } -} else { - my $id = 1; - for my $fh (@fhlist) { - $Global::input_source_header{$id} = $id; - $id++; - } -} - -if($opt::filter_hosts and (@opt::sshlogin or @opt::sshloginfile)) { - # Parallel check all hosts are up. 
Remove hosts that are down - filter_hosts(); -} - -if($opt::nonall or $opt::onall) { - onall(@command); - wait_and_exit(min(undef_as_zero($Global::exitstatus),254)); -} - -# TODO --transfer foo/./bar --cleanup -# multiple --transfer and --basefile with different /./ - -$Global::JobQueue = JobQueue->new( - \@command,\@fhlist,$Global::ContextReplace,$number_of_args,\@Global::ret_files); - -if($opt::eta or $opt::bar) { - # Count the number of jobs before starting any - $Global::JobQueue->total_jobs(); -} -if($opt::pipepart) { - @Global::cat_partials = map { pipe_part_files($_) } @opt::a; - # Unget the command as many times as there are parts - $Global::JobQueue->{'commandlinequeue'}->unget( - map { $Global::JobQueue->{'commandlinequeue'}->get() } @Global::cat_partials - ); -} -for my $sshlogin (values %Global::host) { - $sshlogin->max_jobs_running(); -} - -init_run_jobs(); -my $sem; -if($Global::semaphore) { - $sem = acquire_semaphore(); -} -$SIG{TERM} = \&start_no_new_jobs; - -start_more_jobs(); -if(not $opt::pipepart) { - if($opt::pipe) { - spreadstdin(); - } -} -::debug("init", "Start draining\n"); -drain_job_queue(); -::debug("init", "Done draining\n"); -reaper(); -::debug("init", "Done reaping\n"); -if($opt::pipe and @opt::a) { - for my $job (@Global::tee_jobs) { - unlink $job->fh(2,"name"); - $job->set_fh(2,"name",""); - $job->print(); - unlink $job->fh(1,"name"); - } -} -::debug("init", "Cleaning\n"); -cleanup(); -if($Global::semaphore) { - $sem->release(); -} -for(keys %Global::sshmaster) { - kill "TERM", $_; -} -::debug("init", "Halt\n"); -if($opt::halt_on_error) { - wait_and_exit($Global::halt_on_error_exitstatus); -} else { - wait_and_exit(min(undef_as_zero($Global::exitstatus),254)); -} - -sub __PIPE_MODE__ {} - -sub pipe_part_files { - # Input: - # $file = the file to read - # Returns: - # @commands that will cat_partial each part - my ($file) = @_; - my $buf = ""; - my $header = find_header(\$buf,open_or_exit($file)); - # find positions - my @pos = find_split_positions($file,$opt::blocksize,length $header); - # Make @cat_partials - my @cat_partials = (); - for(my $i=0; $i<$#pos; $i++) { - push @cat_partials, cat_partial($file, 0, length($header), $pos[$i], $pos[$i+1]); - } - # Remote exec should look like: - # ssh -oLogLevel=quiet lo 'eval `echo $SHELL | grep "/t\{0,1\}csh" > /dev/null && echo setenv PARALLEL_SEQ '$PARALLEL_SEQ'\; setenv PARALLEL_PID '$PARALLEL_PID' || echo PARALLEL_SEQ='$PARALLEL_SEQ'\;export PARALLEL_SEQ\; PARALLEL_PID='$PARALLEL_PID'\;export PARALLEL_PID` ;' tty\ \>/dev/null\ \&\&\ stty\ isig\ -onlcr\ -echo\;echo\ \$SHELL\ \|\ grep\ \"/t\\\{0,1\\\}csh\"\ \>\ /dev/null\ \&\&\ setenv\ FOO\ /tmp/foo\ \|\|\ export\ FOO=/tmp/foo\; \(wc\ -\ \$FOO\) - # ssh -tt not allowed. Remote will die due to broken pipe anyway. 
- # TODO test remote with --fifo / --cat - return @cat_partials; -} - -sub find_header { - # Input: - # $buf_ref = reference to read-in buffer - # $fh = filehandle to read from - # Uses: - # $opt::header - # $opt::blocksize - # Returns: - # $header string - my ($buf_ref, $fh) = @_; - my $header = ""; - if($opt::header) { - if($opt::header eq ":") { $opt::header = "(.*\n)"; } - # Number = number of lines - $opt::header =~ s/^(\d+)$/"(.*\n)"x$1/e; - while(read($fh,substr($$buf_ref,length $$buf_ref,0),$opt::blocksize)) { - if($$buf_ref=~s/^($opt::header)//) { - $header = $1; - last; - } - } - } - return $header; -} - -sub find_split_positions { - # Input: - # $file = the file to read - # $block = (minimal) --block-size of each chunk - # $headerlen = length of header to be skipped - # Uses: - # $opt::recstart - # $opt::recend - # Returns: - # @positions of block start/end - my($file, $block, $headerlen) = @_; - my $size = -s $file; - $block = int $block; - # The optimal dd blocksize for mint, redhat, solaris, openbsd = 2^17..2^20 - # The optimal dd blocksize for freebsd = 2^15..2^17 - my $dd_block_size = 131072; # 2^17 - my @pos; - my ($recstart,$recend) = recstartrecend(); - my $recendrecstart = $recend.$recstart; - my $fh = ::open_or_exit($file); - push(@pos,$headerlen); - for(my $pos = $block+$headerlen; $pos < $size; $pos += $block) { - my $buf; - seek($fh, $pos, 0) || die; - while(read($fh,substr($buf,length $buf,0),$dd_block_size)) { - if($opt::regexp) { - # If match /$recend$recstart/ => Record position - if($buf =~ /(.*$recend)$recstart/os) { - my $i = length($1); - push(@pos,$pos+$i); - # Start looking for next record _after_ this match - $pos += $i; - last; - } - } else { - # If match $recend$recstart => Record position - my $i = index($buf,$recendrecstart); - if($i != -1) { - push(@pos,$pos+$i); - # Start looking for next record _after_ this match - $pos += $i; - last; - } - } - } - } - push(@pos,$size); - close $fh; - return @pos; -} - -sub cat_partial { - # Input: - # $file = the file to read - # ($start, $end, [$start2, $end2, ...]) = start byte, end byte - # Returns: - # Efficient perl command to copy $start..$end, $start2..$end2, ... to stdout - my($file, @start_end) = @_; - my($start, $i); - # Convert start_end to start_len - my @start_len = map { if(++$i % 2) { $start = $_; } else { $_-$start } } @start_end; - return "<". shell_quote_scalar($file) . - q{ perl -e 'while(@ARGV) { sysseek(STDIN,shift,0) || die; $left = shift; while($read = sysread(STDIN,$buf, ($left > 32768 ? 32768 : $left))){ $left -= $read; syswrite(STDOUT,$buf); } }' } . - " @start_len"; -} - -sub spreadstdin { - # read a record - # Spawn a job and print the record to it. 
- # Uses: - # $opt::blocksize - # STDIN - # $opr::r - # $Global::max_lines - # $Global::max_number_of_args - # $opt::regexp - # $Global::start_no_new_jobs - # $opt::roundrobin - # %Global::running - - my $buf = ""; - my ($recstart,$recend) = recstartrecend(); - my $recendrecstart = $recend.$recstart; - my $chunk_number = 1; - my $one_time_through; - my $blocksize = $opt::blocksize; - my $in = *STDIN; - my $header = find_header(\$buf,$in); - while(1) { - my $anything_written = 0; - if(not read($in,substr($buf,length $buf,0),$blocksize)) { - # End-of-file - $chunk_number != 1 and last; - # Force the while-loop once if everything was read by header reading - $one_time_through++ and last; - } - if($opt::r) { - # Remove empty lines - $buf =~ s/^\s*\n//gm; - if(length $buf == 0) { - next; - } - } - if($Global::max_lines and not $Global::max_number_of_args) { - # Read n-line records - my $n_lines = $buf =~ tr/\n/\n/; - my $last_newline_pos = rindex($buf,"\n"); - while($n_lines % $Global::max_lines) { - $n_lines--; - $last_newline_pos = rindex($buf,"\n",$last_newline_pos-1); - } - # Chop at $last_newline_pos as that is where n-line record ends - $anything_written += - write_record_to_pipe($chunk_number++,\$header,\$buf, - $recstart,$recend,$last_newline_pos+1); - substr($buf,0,$last_newline_pos+1) = ""; - } elsif($opt::regexp) { - if($Global::max_number_of_args) { - # -N => (start..*?end){n} - # -L -N => (start..*?end){n*l} - my $read_n_lines = $Global::max_number_of_args * ($Global::max_lines || 1); - while($buf =~ s/((?:$recstart.*?$recend){$read_n_lines})($recstart.*)$/$2/os) { - # Copy to modifiable variable - my $b = $1; - $anything_written += - write_record_to_pipe($chunk_number++,\$header,\$b, - $recstart,$recend,length $1); - } - } else { - # Find the last recend-recstart in $buf - if($buf =~ s/(.*$recend)($recstart.*?)$/$2/os) { - # Copy to modifiable variable - my $b = $1; - $anything_written += - write_record_to_pipe($chunk_number++,\$header,\$b, - $recstart,$recend,length $1); - } - } - } else { - if($Global::max_number_of_args) { - # -N => (start..*?end){n} - my $i = 0; - my $read_n_lines = $Global::max_number_of_args * ($Global::max_lines || 1); - while(($i = nindex(\$buf,$recendrecstart,$read_n_lines)) != -1) { - $i += length $recend; # find the actual splitting location - $anything_written += - write_record_to_pipe($chunk_number++,\$header,\$buf, - $recstart,$recend,$i); - substr($buf,0,$i) = ""; - } - } else { - # Find the last recend-recstart in $buf - my $i = rindex($buf,$recendrecstart); - if($i != -1) { - $i += length $recend; # find the actual splitting location - $anything_written += - write_record_to_pipe($chunk_number++,\$header,\$buf, - $recstart,$recend,$i); - substr($buf,0,$i) = ""; - } - } - } - if(not $anything_written and not eof($in)) { - # Nothing was written - maybe the block size < record size? - # Increase blocksize exponentially - my $old_blocksize = $blocksize; - $blocksize = ceil($blocksize * 1.3 + 1); - ::warning("A record was longer than $old_blocksize. " . 
- "Increasing to --blocksize $blocksize\n"); - } - } - ::debug("init", "Done reading input\n"); - - # If there is anything left in the buffer write it - substr($buf,0,0) = ""; - write_record_to_pipe($chunk_number++,\$header,\$buf,$recstart,$recend,length $buf); - - $Global::start_no_new_jobs ||= 1; - if($opt::roundrobin) { - for my $job (values %Global::running) { - close $job->fh(0,"w"); - } - my %incomplete_jobs = %Global::running; - my $sleep = 1; - while(keys %incomplete_jobs) { - my $something_written = 0; - for my $pid (keys %incomplete_jobs) { - my $job = $incomplete_jobs{$pid}; - if($job->stdin_buffer_length()) { - $something_written += $job->non_block_write(); - } else { - delete $incomplete_jobs{$pid} - } - } - if($something_written) { - $sleep = $sleep/2+0.001; - } - $sleep = ::reap_usleep($sleep); - } - } -} - -sub recstartrecend { - # Uses: - # $opt::recstart - # $opt::recend - # Returns: - # $recstart,$recend with default values and regexp conversion - my($recstart,$recend); - if(defined($opt::recstart) and defined($opt::recend)) { - # If both --recstart and --recend is given then both must match - $recstart = $opt::recstart; - $recend = $opt::recend; - } elsif(defined($opt::recstart)) { - # If --recstart is given it must match start of record - $recstart = $opt::recstart; - $recend = ""; - } elsif(defined($opt::recend)) { - # If --recend is given then it must match end of record - $recstart = ""; - $recend = $opt::recend; - } - - if($opt::regexp) { - # If $recstart/$recend contains '|' this should only apply to the regexp - $recstart = "(?:".$recstart.")"; - $recend = "(?:".$recend.")"; - } else { - # $recstart/$recend = printf strings (\n) - $recstart =~ s/\\([0rnt\'\"\\])/"qq|\\$1|"/gee; - $recend =~ s/\\([0rnt\'\"\\])/"qq|\\$1|"/gee; - } - return ($recstart,$recend); -} - -sub nindex { - # See if string is in buffer N times - # Returns: - # the position where the Nth copy is found - my ($buf_ref, $str, $n) = @_; - my $i = 0; - for(1..$n) { - $i = index($$buf_ref,$str,$i+1); - if($i == -1) { last } - } - return $i; -} - -{ - my @robin_queue; - - sub round_robin_write { - # Input: - # $header_ref = ref to $header string - # $block_ref = ref to $block to be written - # $recstart = record start string - # $recend = record end string - # $endpos = end position of $block - # Uses: - # %Global::running - my ($header_ref,$block_ref,$recstart,$recend,$endpos) = @_; - my $something_written = 0; - my $block_passed = 0; - my $sleep = 1; - while(not $block_passed) { - # Continue flushing existing buffers - # until one is empty and a new block is passed - # Make a queue to spread the blocks evenly - if(not @robin_queue) { - push @robin_queue, values %Global::running; - } - while(my $job = shift @robin_queue) { - if($job->stdin_buffer_length() > 0) { - $something_written += $job->non_block_write(); - } else { - $job->set_stdin_buffer($header_ref,$block_ref,$endpos,$recstart,$recend); - $block_passed = 1; - $job->set_virgin(0); - $something_written += $job->non_block_write(); - last; - } - } - $sleep = ::reap_usleep($sleep); - } - return $something_written; - } -} - -sub write_record_to_pipe { - # Fork then - # Write record from pos 0 .. 
$endpos to pipe - # Input: - # $chunk_number = sequence number - to see if already run - # $header_ref = reference to header string to prepend - # $record_ref = reference to record to write - # $recstart = start string of record - # $recend = end string of record - # $endpos = position in $record_ref where record ends - # Uses: - # $Global::job_already_run - # $opt::roundrobin - # @Global::virgin_jobs - # Returns: - # Number of chunks written (0 or 1) - my ($chunk_number,$header_ref,$record_ref,$recstart,$recend,$endpos) = @_; - if($endpos == 0) { return 0; } - if(vec($Global::job_already_run,$chunk_number,1)) { return 1; } - if($opt::roundrobin) { - return round_robin_write($header_ref,$record_ref,$recstart,$recend,$endpos); - } - # If no virgin found, backoff - my $sleep = 0.0001; # 0.01 ms - better performance on highend - while(not @Global::virgin_jobs) { - ::debug("pipe", "No virgin jobs"); - $sleep = ::reap_usleep($sleep); - # Jobs may not be started because of loadavg - # or too little time between each ssh login. - start_more_jobs(); - } - my $job = shift @Global::virgin_jobs; - # Job is no longer virgin - $job->set_virgin(0); - if(fork()) { - # Skip - } else { - # Chop of at $endpos as we do not know how many rec_sep will - # be removed. - substr($$record_ref,$endpos,length $$record_ref) = ""; - # Remove rec_sep - if($opt::remove_rec_sep) { - Job::remove_rec_sep($record_ref,$recstart,$recend); - } - $job->write($header_ref); - $job->write($record_ref); - close $job->fh(0,"w"); - exit(0); - } - close $job->fh(0,"w"); - return 1; -} - -sub __SEM_MODE__ {} - -sub acquire_semaphore { - # Acquires semaphore. If needed: spawns to the background - # Uses: - # @Global::host - # Returns: - # The semaphore to be released when jobs is complete - $Global::host{':'} = SSHLogin->new(":"); - my $sem = Semaphore->new($Semaphore::name,$Global::host{':'}->max_jobs_running()); - $sem->acquire(); - if($Semaphore::fg) { - # skip - } else { - # If run in the background, the PID will change - # therefore release and re-acquire the semaphore - $sem->release(); - if(fork()) { - exit(0); - } else { - # child - # Get a semaphore for this pid - ::die_bug("Can't start a new session: $!") if setsid() == -1; - $sem = Semaphore->new($Semaphore::name,$Global::host{':'}->max_jobs_running()); - $sem->acquire(); - } - } - return $sem; -} - -sub __PARSE_OPTIONS__ {} - -sub options_hash { - # Returns: - # %hash = the GetOptions config - return - ("debug|D=s" => \$opt::D, - "xargs" => \$opt::xargs, - "m" => \$opt::m, - "X" => \$opt::X, - "v" => \@opt::v, - "joblog=s" => \$opt::joblog, - "results|result|res=s" => \$opt::results, - "resume" => \$opt::resume, - "resume-failed|resumefailed" => \$opt::resume_failed, - "silent" => \$opt::silent, - #"silent-error|silenterror" => \$opt::silent_error, - "keep-order|keeporder|k" => \$opt::keeporder, - "group" => \$opt::group, - "g" => \$opt::retired, - "ungroup|u" => \$opt::ungroup, - "linebuffer|linebuffered|line-buffer|line-buffered" => \$opt::linebuffer, - "tmux" => \$opt::tmux, - "null|0" => \$opt::0, - "quote|q" => \$opt::q, - # Replacement strings - "parens=s" => \$opt::parens, - "rpl=s" => \@opt::rpl, - "plus" => \$opt::plus, - "I=s" => \$opt::I, - "extensionreplace|er=s" => \$opt::U, - "U=s" => \$opt::retired, - "basenamereplace|bnr=s" => \$opt::basenamereplace, - "dirnamereplace|dnr=s" => \$opt::dirnamereplace, - "basenameextensionreplace|bner=s" => \$opt::basenameextensionreplace, - "seqreplace=s" => \$opt::seqreplace, - "slotreplace=s" => \$opt::slotreplace, - 
"jobs|j=s" => \$opt::jobs, - "delay=f" => \$opt::delay, - "sshdelay=f" => \$opt::sshdelay, - "load=s" => \$opt::load, - "noswap" => \$opt::noswap, - "max-line-length-allowed" => \$opt::max_line_length_allowed, - "number-of-cpus" => \$opt::number_of_cpus, - "number-of-cores" => \$opt::number_of_cores, - "use-cpus-instead-of-cores" => \$opt::use_cpus_instead_of_cores, - "shellquote|shell_quote|shell-quote" => \$opt::shellquote, - "nice=i" => \$opt::nice, - "timeout=s" => \$opt::timeout, - "tag" => \$opt::tag, - "tagstring|tag-string=s" => \$opt::tagstring, - "onall" => \$opt::onall, - "nonall" => \$opt::nonall, - "filter-hosts|filterhosts|filter-host" => \$opt::filter_hosts, - "sshlogin|S=s" => \@opt::sshlogin, - "sshloginfile|slf=s" => \@opt::sshloginfile, - "controlmaster|M" => \$opt::controlmaster, - "return=s" => \@opt::return, - "trc=s" => \@opt::trc, - "transfer" => \$opt::transfer, - "cleanup" => \$opt::cleanup, - "basefile|bf=s" => \@opt::basefile, - "B=s" => \$opt::retired, - "ctrlc|ctrl-c" => \$opt::ctrlc, - "noctrlc|no-ctrlc|no-ctrl-c" => \$opt::noctrlc, - "workdir|work-dir|wd=s" => \$opt::workdir, - "W=s" => \$opt::retired, - "tmpdir=s" => \$opt::tmpdir, - "tempdir=s" => \$opt::tmpdir, - "use-compress-program|compress-program=s" => \$opt::compress_program, - "use-decompress-program|decompress-program=s" => \$opt::decompress_program, - "compress" => \$opt::compress, - "tty" => \$opt::tty, - "T" => \$opt::retired, - "halt-on-error|halt=s" => \$opt::halt_on_error, - "H=i" => \$opt::retired, - "retries=i" => \$opt::retries, - "dry-run|dryrun" => \$opt::dryrun, - "progress" => \$opt::progress, - "eta" => \$opt::eta, - "bar" => \$opt::bar, - "arg-sep|argsep=s" => \$opt::arg_sep, - "arg-file-sep|argfilesep=s" => \$opt::arg_file_sep, - "trim=s" => \$opt::trim, - "env=s" => \@opt::env, - "recordenv|record-env" => \$opt::record_env, - "plain" => \$opt::plain, - "profile|J=s" => \@opt::profile, - "pipe|spreadstdin" => \$opt::pipe, - "robin|round-robin|roundrobin" => \$opt::roundrobin, - "recstart=s" => \$opt::recstart, - "recend=s" => \$opt::recend, - "regexp|regex" => \$opt::regexp, - "remove-rec-sep|removerecsep|rrs" => \$opt::remove_rec_sep, - "files|output-as-files|outputasfiles" => \$opt::files, - "block|block-size|blocksize=s" => \$opt::blocksize, - "tollef" => \$opt::retired, - "gnu" => \$opt::gnu, - "xapply" => \$opt::xapply, - "bibtex" => \$opt::bibtex, - "nn|nonotice|no-notice" => \$opt::no_notice, - # xargs-compatibility - implemented, man, testsuite - "max-procs|P=s" => \$opt::jobs, - "delimiter|d=s" => \$opt::d, - "max-chars|s=i" => \$opt::max_chars, - "arg-file|a=s" => \@opt::a, - "no-run-if-empty|r" => \$opt::r, - "replace|i:s" => \$opt::i, - "E=s" => \$opt::eof, - "eof|e:s" => \$opt::eof, - "max-args|n=i" => \$opt::max_args, - "max-replace-args|N=i" => \$opt::max_replace_args, - "colsep|col-sep|C=s" => \$opt::colsep, - "help|h" => \$opt::help, - "L=f" => \$opt::L, - "max-lines|l:f" => \$opt::max_lines, - "interactive|p" => \$opt::p, - "verbose|t" => \$opt::verbose, - "version|V" => \$opt::version, - "minversion|min-version=i" => \$opt::minversion, - "show-limits|showlimits" => \$opt::show_limits, - "exit|x" => \$opt::x, - # Semaphore - "semaphore" => \$opt::semaphore, - "semaphoretimeout=i" => \$opt::semaphoretimeout, - "semaphorename|id=s" => \$opt::semaphorename, - "fg" => \$opt::fg, - "bg" => \$opt::bg, - "wait" => \$opt::wait, - # Shebang #!/usr/bin/parallel --shebang - "shebang|hashbang" => \$opt::shebang, - "internal-pipe-means-argfiles" => 
\$opt::internal_pipe_means_argfiles, - "Y" => \$opt::retired, - "skip-first-line" => \$opt::skip_first_line, - "header=s" => \$opt::header, - "cat" => \$opt::cat, - "fifo" => \$opt::fifo, - "pipepart|pipe-part" => \$opt::pipepart, - "hgrp|hostgroup|hostgroups" => \$opt::hostgroups, - ); -} - -sub get_options_from_array { - # Run GetOptions on @array - # Input: - # $array_ref = ref to @ARGV to parse - # @keep_only = Keep only these options - # Uses: - # @ARGV - # Returns: - # true if parsing worked - # false if parsing failed - # @$array_ref is changed - my ($array_ref, @keep_only) = @_; - if(not @$array_ref) { - # Empty array: No need to look more at that - return 1; - } - # A bit of shuffling of @ARGV needed as GetOptionsFromArray is not - # supported everywhere - my @save_argv; - my $this_is_ARGV = (\@::ARGV == $array_ref); - if(not $this_is_ARGV) { - @save_argv = @::ARGV; - @::ARGV = @{$array_ref}; - } - # If @keep_only set: Ignore all values except @keep_only - my %options = options_hash(); - if(@keep_only) { - my (%keep,@dummy); - @keep{@keep_only} = @keep_only; - for my $k (grep { not $keep{$_} } keys %options) { - # Store the value of the option in @dummy - $options{$k} = \@dummy; - } - } - my $retval = GetOptions(%options); - if(not $this_is_ARGV) { - @{$array_ref} = @::ARGV; - @::ARGV = @save_argv; - } - return $retval; -} - -sub parse_options { - # Returns: N/A - # Defaults: - $Global::version = 20141122; - $Global::progname = 'parallel'; - $Global::infinity = 2**31; - $Global::debug = 0; - $Global::verbose = 0; - $Global::quoting = 0; - # Read only table with default --rpl values - %Global::replace = - ( - '{}' => '', - '{#}' => '1 $_=$job->seq()', - '{%}' => '1 $_=$job->slot()', - '{/}' => 's:.*/::', - '{//}' => '$Global::use{"File::Basename"} ||= eval "use File::Basename; 1;"; $_ = dirname($_);', - '{/.}' => 's:.*/::; s:\.[^/.]+$::;', - '{.}' => 's:\.[^/.]+$::', - ); - %Global::plus = - ( - # {} = {+/}/{/} - # = {.}.{+.} = {+/}/{/.}.{+.} - # = {..}.{+..} = {+/}/{/..}.{+..} - # = {...}.{+...} = {+/}/{/...}.{+...} - '{+/}' => 's:/[^/]*$::', - '{+.}' => 's:.*\.::', - '{+..}' => 's:.*\.([^.]*\.):$1:', - '{+...}' => 's:.*\.([^.]*\.[^.]*\.):$1:', - '{..}' => 's:\.[^/.]+$::; s:\.[^/.]+$::', - '{...}' => 's:\.[^/.]+$::; s:\.[^/.]+$::; s:\.[^/.]+$::', - '{/..}' => 's:.*/::; s:\.[^/.]+$::; s:\.[^/.]+$::', - '{/...}' => 's:.*/::; s:\.[^/.]+$::; s:\.[^/.]+$::; s:\.[^/.]+$::', - ); - # Modifiable copy of %Global::replace - %Global::rpl = %Global::replace; - $Global::parens = "{==}"; - $/="\n"; - $Global::ignore_empty = 0; - $Global::interactive = 0; - $Global::stderr_verbose = 0; - $Global::default_simultaneous_sshlogins = 9; - $Global::exitstatus = 0; - $Global::halt_on_error_exitstatus = 0; - $Global::arg_sep = ":::"; - $Global::arg_file_sep = "::::"; - $Global::trim = 'n'; - $Global::max_jobs_running = 0; - $Global::job_already_run = ''; - $ENV{'TMPDIR'} ||= "/tmp"; - - @ARGV=read_options(); - - if(@opt::v) { $Global::verbose = $#opt::v+1; } # Convert -v -v to v=2 - $Global::debug = $opt::D; - $Global::shell = $ENV{'PARALLEL_SHELL'} || parent_shell($$) || $ENV{'SHELL'} || "/bin/sh"; - if(defined $opt::X) { $Global::ContextReplace = 1; } - if(defined $opt::silent) { $Global::verbose = 0; } - if(defined $opt::0) { $/ = "\0"; } - if(defined $opt::d) { my $e="sprintf \"$opt::d\""; $/ = eval $e; } - if(defined $opt::p) { $Global::interactive = $opt::p; } - if(defined $opt::q) { $Global::quoting = 1; } - if(defined $opt::r) { $Global::ignore_empty = 1; } - if(defined $opt::verbose) { 
$Global::stderr_verbose = 1; } - # Deal with --rpl - sub rpl { - # Modify %Global::rpl - # Replace $old with $new - my ($old,$new) = @_; - if($old ne $new) { - $Global::rpl{$new} = $Global::rpl{$old}; - delete $Global::rpl{$old}; - } - } - if(defined $opt::parens) { $Global::parens = $opt::parens; } - my $parenslen = 0.5*length $Global::parens; - $Global::parensleft = substr($Global::parens,0,$parenslen); - $Global::parensright = substr($Global::parens,$parenslen); - if(defined $opt::plus) { %Global::rpl = (%Global::plus,%Global::rpl); } - if(defined $opt::I) { rpl('{}',$opt::I); } - if(defined $opt::U) { rpl('{.}',$opt::U); } - if(defined $opt::i and $opt::i) { rpl('{}',$opt::i); } - if(defined $opt::basenamereplace) { rpl('{/}',$opt::basenamereplace); } - if(defined $opt::dirnamereplace) { rpl('{//}',$opt::dirnamereplace); } - if(defined $opt::seqreplace) { rpl('{#}',$opt::seqreplace); } - if(defined $opt::slotreplace) { rpl('{%}',$opt::slotreplace); } - if(defined $opt::basenameextensionreplace) { - rpl('{/.}',$opt::basenameextensionreplace); - } - for(@opt::rpl) { - # Create $Global::rpl entries for --rpl options - # E.g: "{..} s:\.[^.]+$:;s:\.[^.]+$:;" - my ($shorthand,$long) = split/ /,$_,2; - $Global::rpl{$shorthand} = $long; - } - if(defined $opt::eof) { $Global::end_of_file_string = $opt::eof; } - if(defined $opt::max_args) { $Global::max_number_of_args = $opt::max_args; } - if(defined $opt::timeout) { $Global::timeoutq = TimeoutQueue->new($opt::timeout); } - if(defined $opt::tmpdir) { $ENV{'TMPDIR'} = $opt::tmpdir; } - if(defined $opt::help) { die_usage(); } - if(defined $opt::colsep) { $Global::trim = 'lr'; } - if(defined $opt::header) { $opt::colsep = defined $opt::colsep ? $opt::colsep : "\t"; } - if(defined $opt::trim) { $Global::trim = $opt::trim; } - if(defined $opt::arg_sep) { $Global::arg_sep = $opt::arg_sep; } - if(defined $opt::arg_file_sep) { $Global::arg_file_sep = $opt::arg_file_sep; } - if(defined $opt::number_of_cpus) { print SSHLogin::no_of_cpus(),"\n"; wait_and_exit(0); } - if(defined $opt::number_of_cores) { - print SSHLogin::no_of_cores(),"\n"; wait_and_exit(0); - } - if(defined $opt::max_line_length_allowed) { - print Limits::Command::real_max_length(),"\n"; wait_and_exit(0); - } - if(defined $opt::version) { version(); wait_and_exit(0); } - if(defined $opt::bibtex) { bibtex(); wait_and_exit(0); } - if(defined $opt::record_env) { record_env(); wait_and_exit(0); } - if(defined $opt::show_limits) { show_limits(); } - if(@opt::sshlogin) { @Global::sshlogin = @opt::sshlogin; } - if(@opt::sshloginfile) { read_sshloginfiles(@opt::sshloginfile); } - if(@opt::return) { push @Global::ret_files, @opt::return; } - if(not defined $opt::recstart and - not defined $opt::recend) { $opt::recend = "\n"; } - if(not defined $opt::blocksize) { $opt::blocksize = "1M"; } - $opt::blocksize = multiply_binary_prefix($opt::blocksize); - if(defined $opt::controlmaster) { $opt::noctrlc = 1; } - if(defined $opt::semaphore) { $Global::semaphore = 1; } - if(defined $opt::semaphoretimeout) { $Global::semaphore = 1; } - if(defined $opt::semaphorename) { $Global::semaphore = 1; } - if(defined $opt::fg) { $Global::semaphore = 1; } - if(defined $opt::bg) { $Global::semaphore = 1; } - if(defined $opt::wait) { $Global::semaphore = 1; } - if(defined $opt::halt_on_error and - $opt::halt_on_error=~/%/) { $opt::halt_on_error /= 100; } - if(defined $opt::timeout and $opt::timeout !~ /^\d+(\.\d+)?%?$/) { - ::error("--timeout must be seconds or percentage\n"); - wait_and_exit(255); - } - if(defined 
$opt::minversion) { - print $Global::version,"\n"; - if($Global::version < $opt::minversion) { - wait_and_exit(255); - } else { - wait_and_exit(0); - } - } - if(not defined $opt::delay) { - # Set --delay to --sshdelay if not set - $opt::delay = $opt::sshdelay; - } - if($opt::compress_program) { - $opt::compress = 1; - $opt::decompress_program ||= $opt::compress_program." -dc"; - } - if($opt::compress) { - my ($compress, $decompress) = find_compression_program(); - $opt::compress_program ||= $compress; - $opt::decompress_program ||= $decompress; - } - if(defined $opt::nonall) { - # Append a dummy empty argument - push @ARGV, $Global::arg_sep, ""; - } - if(defined $opt::tty) { - # Defaults for --tty: -j1 -u - # Can be overridden with -jXXX -g - if(not defined $opt::jobs) { - $opt::jobs = 1; - } - if(not defined $opt::group) { - $opt::ungroup = 0; - } - } - if(@opt::trc) { - push @Global::ret_files, @opt::trc; - $opt::transfer = 1; - $opt::cleanup = 1; - } - if(defined $opt::max_lines) { - if($opt::max_lines eq "-0") { - # -l -0 (swallowed -0) - $opt::max_lines = 1; - $opt::0 = 1; - $/ = "\0"; - } elsif ($opt::max_lines == 0) { - # If not given (or if 0 is given) => 1 - $opt::max_lines = 1; - } - $Global::max_lines = $opt::max_lines; - if(not $opt::pipe) { - # --pipe -L means length of record - not max_number_of_args - $Global::max_number_of_args ||= $Global::max_lines; - } - } - - # Read more than one arg at a time (-L, -N) - if(defined $opt::L) { - $Global::max_lines = $opt::L; - if(not $opt::pipe) { - # --pipe -L means length of record - not max_number_of_args - $Global::max_number_of_args ||= $Global::max_lines; - } - } - if(defined $opt::max_replace_args) { - $Global::max_number_of_args = $opt::max_replace_args; - $Global::ContextReplace = 1; - } - if((defined $opt::L or defined $opt::max_replace_args) - and - not ($opt::xargs or $opt::m)) { - $Global::ContextReplace = 1; - } - if(defined $opt::tag and not defined $opt::tagstring) { - $opt::tagstring = "\257<\257>"; # Default = {} - } - if(defined $opt::pipepart and - (defined $opt::L or defined $opt::max_lines - or defined $opt::max_replace_args)) { - ::error("--pipepart is incompatible with --max-replace-args, ", - "--max-lines, and -L.\n"); - wait_and_exit(255); - } - if(grep /^$Global::arg_sep$|^$Global::arg_file_sep$/o, @ARGV) { - # Deal with ::: and :::: - @ARGV=read_args_from_command_line(); - } - - # Semaphore defaults - # Must be done before computing number of processes and max_line_length - # because when running as a semaphore GNU Parallel does not read args - $Global::semaphore ||= ($0 =~ m:(^|/)sem$:); # called as 'sem' - if($Global::semaphore) { - # A semaphore does not take input from neither stdin nor file - @opt::a = ("/dev/null"); - push(@Global::unget_argv, [Arg->new("")]); - $Semaphore::timeout = $opt::semaphoretimeout || 0; - if(defined $opt::semaphorename) { - $Semaphore::name = $opt::semaphorename; - } else { - $Semaphore::name = `tty`; - chomp $Semaphore::name; - } - $Semaphore::fg = $opt::fg; - $Semaphore::wait = $opt::wait; - $Global::default_simultaneous_sshlogins = 1; - if(not defined $opt::jobs) { - $opt::jobs = 1; - } - if($Global::interactive and $opt::bg) { - ::error("Jobs running in the ". - "background cannot be interactive.\n"); - ::wait_and_exit(255); - } - } - if(defined $opt::eta) { - $opt::progress = $opt::eta; - } - if(defined $opt::bar) { - $opt::progress = $opt::bar; - } - if(defined $opt::retired) { - ::error("-g has been retired. Use --group.\n"); - ::error("-B has been retired. 
Use --bf.\n"); - ::error("-T has been retired. Use --tty.\n"); - ::error("-U has been retired. Use --er.\n"); - ::error("-W has been retired. Use --wd.\n"); - ::error("-Y has been retired. Use --shebang.\n"); - ::error("-H has been retired. Use --halt.\n"); - ::error("--tollef has been retired. Use -u -q --arg-sep -- and --load for -l.\n"); - ::wait_and_exit(255); - } - citation_notice(); - - parse_sshlogin(); - parse_env_var(); - - if(remote_hosts() and ($opt::X or $opt::m or $opt::xargs)) { - # As we do not know the max line length on the remote machine - # long commands generated by xargs may fail - # If opt_N is set, it is probably safe - ::warning("Using -X or -m with --sshlogin may fail.\n"); - } - - if(not defined $opt::jobs) { - $opt::jobs = "100%"; - } - open_joblog(); -} - -sub env_quote { - # Input: - # $v = value to quote - # Returns: - # $v = value quoted as environment variable - my $v = $_[0]; - $v =~ s/([\\])/\\$1/g; - $v =~ s/([\[\] \#\'\&\<\>\(\)\;\{\}\t\"\$\`\*\174\!\?\~])/\\$1/g; - $v =~ s/\n/"\n"/g; - return $v; -} - -sub record_env { - # Record current %ENV-keys in ~/.parallel/ignored_vars - # Returns: N/A - my $ignore_filename = $ENV{'HOME'} . "/.parallel/ignored_vars"; - if(open(my $vars_fh, ">", $ignore_filename)) { - print $vars_fh map { $_,"\n" } keys %ENV; - } else { - ::error("Cannot write to $ignore_filename\n"); - ::wait_and_exit(255); - } -} - -sub parse_env_var { - # Parse --env and set $Global::envvar, $Global::envwarn and $Global::envvarlen - # - # Bash functions must be parsed to export them remotely - # Pre-shellshock style bash function: - # myfunc=() {... - # Post-shellshock style bash function: - # BASH_FUNC_myfunc()=() {... - # - # Uses: - # $Global::envvar = eval string that will set variables in both bash and csh - # $Global::envwarn = If functions are used: Give warning in csh - # $Global::envvarlen = length of $Global::envvar - # @opt::env - # $Global::shell - # %ENV - # Returns: N/A - $Global::envvar = ""; - $Global::envwarn = ""; - my @vars = ('parallel_bash_environment'); - for my $varstring (@opt::env) { - # Split up --env VAR1,VAR2 - push @vars, split /,/, $varstring; - } - if(grep { /^_$/ } @vars) { - # --env _ - # Include all vars that are not in a clean environment - if(open(my $vars_fh, "<", $ENV{'HOME'} . "/.parallel/ignored_vars")) { - my @ignore = <$vars_fh>; - chomp @ignore; - my %ignore; - @ignore{@ignore} = @ignore; - close $vars_fh; - push @vars, grep { not defined $ignore{$_} } keys %ENV; - @vars = grep { not /^_$/ } @vars; - } else { - ::error("Run '$Global::progname --record-env' in a clean environment first.\n"); - ::wait_and_exit(255); - } - } - # Duplicate vars as BASH functions to include post-shellshock functions. 
- # So --env myfunc should also look for BASH_FUNC_myfunc() - @vars = map { $_, "BASH_FUNC_$_()" } @vars; - # Keep only defined variables - @vars = grep { defined($ENV{$_}) } @vars; - # Pre-shellshock style bash function: - # myfunc=() { echo myfunc - # } - # Post-shellshock style bash function: - # BASH_FUNC_myfunc()=() { echo myfunc - # } - my @bash_functions = grep { substr($ENV{$_},0,4) eq "() {" } @vars; - my @non_functions = grep { substr($ENV{$_},0,4) ne "() {" } @vars; - if(@bash_functions) { - # Functions are not supported for all shells - if($Global::shell !~ m:/(bash|rbash|zsh|rzsh|dash|ksh):) { - ::warning("Shell functions may not be supported in $Global::shell\n"); - } - } - - # Pre-shellschock names are without () - my @bash_pre_shellshock = grep { not /\(\)/ } @bash_functions; - # Post-shellschock names are with () - my @bash_post_shellshock = grep { /\(\)/ } @bash_functions; - - my @qcsh = (map { my $a=$_; "setenv $a " . env_quote($ENV{$a}) } - grep { not /^parallel_bash_environment$/ } @non_functions); - my @qbash = (map { my $a=$_; "export $a=" . env_quote($ENV{$a}) } - @non_functions, @bash_pre_shellshock); - - push @qbash, map { my $a=$_; "eval $a\"\$$a\"" } @bash_pre_shellshock; - push @qbash, map { /BASH_FUNC_(.*)\(\)/; "$1 $ENV{$_}" } @bash_post_shellshock; - - #ssh -tt -oLogLevel=quiet lo 'eval `echo PARALLEL_SEQ='$PARALLEL_SEQ'\;export PARALLEL_SEQ\; PARALLEL_PID='$PARALLEL_PID'\;export PARALLEL_PID` ;' tty\ \>/dev/null\ \&\&\ stty\ isig\ -onlcr\ -echo\;echo\ \$SHELL\ \|\ grep\ \"/t\\\{0,1\\\}csh\"\ \>\ /dev/null\ \&\&\ setenv\ BASH_FUNC_myfunc\ \\\(\\\)\\\ \\\{\\\ \\\ echo\\\ a\"' - #'\"\\\}\ \|\|\ myfunc\(\)\ \{\ \ echo\ a' - #'\}\ \;myfunc\ 1; - - # Check if any variables contain \n - if(my @v = map { s/BASH_FUNC_(.*)\(\)/$1/; $_ } grep { $ENV{$_}=~/\n/ } @vars) { - # \n is bad for csh and will cause it to fail. - $Global::envwarn = ::shell_quote_scalar(q{echo $SHELL | egrep "/t?csh" > /dev/null && echo CSH/TCSH DO NOT SUPPORT newlines IN VARIABLES/FUNCTIONS. Unset }."@v".q{ && exec false;}."\n\n") . $Global::envwarn; - } - - if(not @qcsh) { push @qcsh, "true"; } - if(not @qbash) { push @qbash, "true"; } - # Create lines like: - # echo $SHELL | grep "/t\\{0,1\\}csh" >/dev/null && setenv V1 val1 && setenv V2 val2 || export V1=val1 && export V2=val2 ; echo "$V1$V2" - if(@vars) { - $Global::envvar .= - join"", - (q{echo $SHELL | grep "/t\\{0,1\\}csh" > /dev/null && } - . join(" && ", @qcsh) - . q{ || } - . 
join(" && ", @qbash) - .q{;}); - if($ENV{'parallel_bash_environment'}) { - $Global::envvar .= 'eval "$parallel_bash_environment";'."\n"; - } - } - $Global::envvarlen = length $Global::envvar; -} - -sub open_joblog { - # Open joblog as specified by --joblog - # Uses: - # $opt::resume - # $opt::resume_failed - # $opt::joblog - # $opt::results - # $Global::job_already_run - # %Global::fd - my $append = 0; - if(($opt::resume or $opt::resume_failed) - and - not ($opt::joblog or $opt::results)) { - ::error("--resume and --resume-failed require --joblog or --results.\n"); - ::wait_and_exit(255); - } - if($opt::joblog) { - if($opt::resume || $opt::resume_failed) { - if(open(my $joblog_fh, "<", $opt::joblog)) { - # Read the joblog - $append = <$joblog_fh>; # If there is a header: Open as append later - my $joblog_regexp; - if($opt::resume_failed) { - # Make a regexp that only matches commands with exit+signal=0 - # 4 host 1360490623.067 3.445 1023 1222 0 0 command - $joblog_regexp='^(\d+)(?:\t[^\t]+){5}\t0\t0\t'; - } else { - # Just match the job number - $joblog_regexp='^(\d+)'; - } - while(<$joblog_fh>) { - if(/$joblog_regexp/o) { - # This is 30% faster than set_job_already_run($1); - vec($Global::job_already_run,($1||0),1) = 1; - } elsif(not /\d+\s+[^\s]+\s+([0-9.]+\s+){6}/) { - ::error("Format of '$opt::joblog' is wrong: $_"); - ::wait_and_exit(255); - } - } - close $joblog_fh; - } - } - if($append) { - # Append to joblog - if(not open($Global::joblog, ">>", $opt::joblog)) { - ::error("Cannot append to --joblog $opt::joblog.\n"); - ::wait_and_exit(255); - } - } else { - if($opt::joblog eq "-") { - # Use STDOUT as joblog - $Global::joblog = $Global::fd{1}; - } elsif(not open($Global::joblog, ">", $opt::joblog)) { - # Overwrite the joblog - ::error("Cannot write to --joblog $opt::joblog.\n"); - ::wait_and_exit(255); - } - print $Global::joblog - join("\t", "Seq", "Host", "Starttime", "JobRuntime", - "Send", "Receive", "Exitval", "Signal", "Command" - ). "\n"; - } - } -} - -sub find_compression_program { - # Find a fast compression program - # Returns: - # $compress_program = compress program with options - # $decompress_program = decompress program with options - - # Search for these. Sorted by speed - my @prg = qw(lzop pigz pxz gzip plzip pbzip2 lzma xz lzip bzip2); - for my $p (@prg) { - if(which($p)) { - return ("$p -c -1","$p -dc"); - } - } - # Fall back to cat - return ("cat","cat"); -} - - -sub read_options { - # Read options from command line, profile and $PARALLEL - # Uses: - # $opt::shebang_wrap - # $opt::shebang - # @ARGV - # $opt::plain - # @opt::profile - # $ENV{'HOME'} - # $ENV{'PARALLEL'} - # Returns: - # @ARGV_no_opt = @ARGV without --options - - # This must be done first as this may exec myself - if(defined $ARGV[0] and ($ARGV[0] =~ /^--shebang/ or - $ARGV[0] =~ /^--shebang-?wrap/ or - $ARGV[0] =~ /^--hashbang/)) { - # Program is called from #! line in script - # remove --shebang-wrap if it is set - $opt::shebang_wrap = ($ARGV[0] =~ s/^--shebang-?wrap *//); - # remove --shebang if it is set - $opt::shebang = ($ARGV[0] =~ s/^--shebang *//); - # remove --hashbang if it is set - $opt::shebang .= ($ARGV[0] =~ s/^--hashbang *//); - if($opt::shebang) { - my $argfile = shell_quote_scalar(pop @ARGV); - # exec myself to split $ARGV[0] into separate fields - exec "$0 --skip-first-line -a $argfile @ARGV"; - } - if($opt::shebang_wrap) { - my @options; - my @parser; - if ($^O eq 'freebsd') { - # FreeBSD's #! puts different values in @ARGV than Linux' does. 
- my @nooptions = @ARGV; - get_options_from_array(\@nooptions); - while($#ARGV > $#nooptions) { - push @options, shift @ARGV; - } - while(@ARGV and $ARGV[0] ne ":::") { - push @parser, shift @ARGV; - } - if(@ARGV and $ARGV[0] eq ":::") { - shift @ARGV; - } - } else { - @options = shift @ARGV; - } - my $script = shell_quote_scalar(shift @ARGV); - # exec myself to split $ARGV[0] into separate fields - exec "$0 --internal-pipe-means-argfiles @options @parser $script ::: @ARGV"; - } - } - - Getopt::Long::Configure("bundling","require_order"); - my @ARGV_copy = @ARGV; - # Check if there is a --profile to set @opt::profile - get_options_from_array(\@ARGV_copy,"profile|J=s","plain") || die_usage(); - my @ARGV_profile = (); - my @ARGV_env = (); - if(not $opt::plain) { - # Add options from .parallel/config and other profiles - my @config_profiles = ( - "/etc/parallel/config", - $ENV{'HOME'}."/.parallel/config", - $ENV{'HOME'}."/.parallelrc"); - my @profiles = @config_profiles; - if(@opt::profile) { - # --profile overrides default profiles - @profiles = (); - for my $profile (@opt::profile) { - if(-r $profile) { - push @profiles, $profile; - } else { - push @profiles, $ENV{'HOME'}."/.parallel/".$profile; - } - } - } - for my $profile (@profiles) { - if(-r $profile) { - open (my $in_fh, "<", $profile) || ::die_bug("read-profile: $profile"); - while(<$in_fh>) { - /^\s*\#/ and next; - chomp; - push @ARGV_profile, shellwords($_); - } - close $in_fh; - } else { - if(grep /^$profile$/, @config_profiles) { - # config file is not required to exist - } else { - ::error("$profile not readable.\n"); - wait_and_exit(255); - } - } - } - # Add options from shell variable $PARALLEL - if($ENV{'PARALLEL'}) { - @ARGV_env = shellwords($ENV{'PARALLEL'}); - } - } - Getopt::Long::Configure("bundling","require_order"); - get_options_from_array(\@ARGV_profile) || die_usage(); - get_options_from_array(\@ARGV_env) || die_usage(); - get_options_from_array(\@ARGV) || die_usage(); - - # Prepend non-options to @ARGV (such as commands like 'nice') - unshift @ARGV, @ARGV_profile, @ARGV_env; - return @ARGV; -} - -sub read_args_from_command_line { - # Arguments given on the command line after: - # ::: ($Global::arg_sep) - # :::: ($Global::arg_file_sep) - # Removes the arguments from @ARGV and: - # - puts filenames into -a - # - puts arguments into files and add the files to -a - # Input: - # @::ARGV = command option ::: arg arg arg :::: argfiles - # Uses: - # $Global::arg_sep - # $Global::arg_file_sep - # $opt::internal_pipe_means_argfiles - # $opt::pipe - # @opt::a - # Returns: - # @argv_no_argsep = @::ARGV without ::: and :::: and following args - my @new_argv = (); - for(my $arg = shift @ARGV; @ARGV; $arg = shift @ARGV) { - if($arg eq $Global::arg_sep - or - $arg eq $Global::arg_file_sep) { - my $group = $arg; # This group of arguments is args or argfiles - my @group; - while(defined ($arg = shift @ARGV)) { - if($arg eq $Global::arg_sep - or - $arg eq $Global::arg_file_sep) { - # exit while loop if finding new separator - last; - } else { - # If not hitting ::: or :::: - # Append it to the group - push @group, $arg; - } - } - - if($group eq $Global::arg_file_sep - or ($opt::internal_pipe_means_argfiles and $opt::pipe) - ) { - # Group of file names on the command line. - # Append args into -a - push @opt::a, @group; - } elsif($group eq $Global::arg_sep) { - # Group of arguments on the command line. - # Put them into a file. 
- # Create argfile - my ($outfh,$name) = ::tmpfile(SUFFIX => ".arg"); - unlink($name); - # Put args into argfile - print $outfh map { $_,$/ } @group; - seek $outfh, 0, 0; - # Append filehandle to -a - push @opt::a, $outfh; - } else { - ::die_bug("Unknown command line group: $group"); - } - if(defined($arg)) { - # $arg is ::: or :::: - redo; - } else { - # $arg is undef -> @ARGV empty - last; - } - } - push @new_argv, $arg; - } - # Output: @ARGV = command to run with options - return @new_argv; -} - -sub cleanup { - # Returns: N/A - if(@opt::basefile) { cleanup_basefile(); } -} - -sub __QUOTING_ARGUMENTS_FOR_SHELL__ {} - -sub shell_quote { - # Input: - # @strings = strings to be quoted - # Output: - # @shell_quoted_strings = string quoted with \ as needed by the shell - my @strings = (@_); - for my $a (@strings) { - $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g; - $a =~ s/[\n]/'\n'/g; # filenames with '\n' is quoted using \' - } - return wantarray ? @strings : "@strings"; -} - -sub shell_quote_empty { - # Inputs: - # @strings = strings to be quoted - # Returns: - # @quoted_strings = empty strings quoted as ''. - my @strings = shell_quote(@_); - for my $a (@strings) { - if($a eq "") { - $a = "''"; - } - } - return wantarray ? @strings : "@strings"; -} - -sub shell_quote_scalar { - # Quote the string so shell will not expand any special chars - # Inputs: - # $string = string to be quoted - # Returns: - # $shell_quoted = string quoted with \ as needed by the shell - my $a = $_[0]; - if(defined $a) { - # $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g; - # This is 1% faster than the above - $a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\*\>\<\~\|\; \"\!\$\&\'\202-\377]/\\$&/go; - $a =~ s/[\n]/'\n'/go; # filenames with '\n' is quoted using \' - } - return $a; -} - -sub shell_quote_file { - # Quote the string so shell will not expand any special chars and prepend ./ if needed - # Input: - # $filename = filename to be shell quoted - # Returns: - # $quoted_filename = filename quoted with \ as needed by the shell and ./ if needed - my $a = shell_quote_scalar(shift); - if(defined $a) { - if($a =~ m:^/: or $a =~ m:^\./:) { - # /abs/path or ./rel/path => skip - } else { - # rel/path => ./rel/path - $a = "./".$a; - } - } - return $a; -} - -sub shellwords { - # Input: - # $string = shell line - # Returns: - # @shell_words = $string split into words as shell would do - $Global::use{"Text::ParseWords"} ||= eval "use Text::ParseWords; 1;"; - return Text::ParseWords::shellwords(@_); -} - - -sub __FILEHANDLES__ {} - - -sub save_stdin_stdout_stderr { - # Remember the original STDIN, STDOUT and STDERR - # and file descriptors opened by the shell (e.g. 
3>/tmp/foo) - # Uses: - # %Global::fd - # $Global::original_stderr - # $Global::original_stdin - # Returns: N/A - - # Find file descriptors that are already opened (by the shell) - for my $fdno (1..61) { - # /dev/fd/62 and above are used by bash for <(cmd) - my $fh; - # 2-argument-open is used to be compatible with old perl 5.8.0 - # bug #43570: Perl 5.8.0 creates 61 files - if(open($fh,">&=$fdno")) { - $Global::fd{$fdno}=$fh; - } - } - open $Global::original_stderr, ">&", "STDERR" or - ::die_bug("Can't dup STDERR: $!"); - open $Global::original_stdin, "<&", "STDIN" or - ::die_bug("Can't dup STDIN: $!"); - $Global::is_terminal = (-t $Global::original_stderr) && !$ENV{'CIRCLECI'} && !$ENV{'TRAVIS'}; -} - -sub enough_file_handles { - # Check that we have enough filehandles available for starting - # another job - # Uses: - # $opt::ungroup - # %Global::fd - # Returns: - # 1 if ungrouped (thus not needing extra filehandles) - # 0 if too few filehandles - # 1 if enough filehandles - if(not $opt::ungroup) { - my %fh; - my $enough_filehandles = 1; - # perl uses 7 filehandles for something? - # open3 uses 2 extra filehandles temporarily - # We need a filehandle for each redirected file descriptor - # (normally just STDOUT and STDERR) - for my $i (1..(7+2+keys %Global::fd)) { - $enough_filehandles &&= open($fh{$i}, "<", "/dev/null"); - } - for (values %fh) { close $_; } - return $enough_filehandles; - } else { - # Ungrouped does not need extra file handles - return 1; - } -} - -sub open_or_exit { - # Open a file name or exit if the file cannot be opened - # Inputs: - # $file = filehandle or filename to open - # Uses: - # $Global::stdin_in_opt_a - # $Global::original_stdin - # Returns: - # $fh = file handle to read-opened file - my $file = shift; - if($file eq "-") { - $Global::stdin_in_opt_a = 1; - return ($Global::original_stdin || *STDIN); - } - if(ref $file eq "GLOB") { - # This is an open filehandle - return $file; - } - my $fh = gensym; - if(not open($fh, "<", $file)) { - ::error("Cannot open input file `$file': No such file or directory.\n"); - wait_and_exit(255); - } - return $fh; -} - -sub __RUNNING_THE_JOBS_AND_PRINTING_PROGRESS__ {} - -# Variable structure: -# -# $Global::running{$pid} = Pointer to Job-object -# @Global::virgin_jobs = Pointer to Job-object that have received no input -# $Global::host{$sshlogin} = Pointer to SSHLogin-object -# $Global::total_running = total number of running jobs -# $Global::total_started = total jobs started - -sub init_run_jobs { - $Global::total_running = 0; - $Global::total_started = 0; - $Global::tty_taken = 0; - $SIG{USR1} = \&list_running_jobs; - $SIG{USR2} = \&toggle_progress; - if(@opt::basefile) { setup_basefile(); } -} - -{ - my $last_time; - my %last_mtime; - -sub start_more_jobs { - # Run start_another_job() but only if: - # * not $Global::start_no_new_jobs set - # * not JobQueue is empty - # * not load on server is too high - # * not server swapping - # * not too short time since last remote login - # Uses: - # $Global::max_procs_file - # $Global::max_procs_file_last_mod - # %Global::host - # @opt::sshloginfile - # $Global::start_no_new_jobs - # $opt::filter_hosts - # $Global::JobQueue - # $opt::pipe - # $opt::load - # $opt::noswap - # $opt::delay - # $Global::newest_starttime - # Returns: - # $jobs_started = number of jobs started - my $jobs_started = 0; - my $jobs_started_this_round = 0; - if($Global::start_no_new_jobs) { - return $jobs_started; - } - if(time - ($last_time||0) > 1) { - # At most do this every second - $last_time = time; - 
if($Global::max_procs_file) { - # --jobs filename - my $mtime = (stat($Global::max_procs_file))[9]; - if($mtime > $Global::max_procs_file_last_mod) { - # file changed: Force re-computing max_jobs_running - $Global::max_procs_file_last_mod = $mtime; - for my $sshlogin (values %Global::host) { - $sshlogin->set_max_jobs_running(undef); - } - } - } - if(@opt::sshloginfile) { - # Is --sshloginfile changed? - for my $slf (@opt::sshloginfile) { - my $actual_file = expand_slf_shorthand($slf); - my $mtime = (stat($actual_file))[9]; - $last_mtime{$actual_file} ||= $mtime; - if($mtime - $last_mtime{$actual_file} > 1) { - ::debug("run","--sshloginfile $actual_file changed. reload\n"); - $last_mtime{$actual_file} = $mtime; - # Reload $slf - # Empty sshlogins - @Global::sshlogin = (); - for (values %Global::host) { - # Don't start new jobs on any host - # except the ones added back later - $_->set_max_jobs_running(0); - } - # This will set max_jobs_running on the SSHlogins - read_sshloginfile($actual_file); - parse_sshlogin(); - $opt::filter_hosts and filter_hosts(); - setup_basefile(); - } - } - } - } - do { - $jobs_started_this_round = 0; - # This will start 1 job on each --sshlogin (if possible) - # thus distribute the jobs on the --sshlogins round robin - - for my $sshlogin (values %Global::host) { - if($Global::JobQueue->empty() and not $opt::pipe) { - # No more jobs in the queue - last; - } - debug("run", "Running jobs before on ", $sshlogin->string(), ": ", - $sshlogin->jobs_running(), "\n"); - if ($sshlogin->jobs_running() < $sshlogin->max_jobs_running()) { - if($opt::load and $sshlogin->loadavg_too_high()) { - # The load is too high or unknown - next; - } - if($opt::noswap and $sshlogin->swapping()) { - # The server is swapping - next; - } - if($sshlogin->too_fast_remote_login()) { - # It has been too short since - next; - } - if($opt::delay and $opt::delay > ::now() - $Global::newest_starttime) { - # It has been too short since last start - next; - } - debug("run", $sshlogin->string(), " has ", $sshlogin->jobs_running(), - " out of ", $sshlogin->max_jobs_running(), - " jobs running. Start another.\n"); - if(start_another_job($sshlogin) == 0) { - # No more jobs to start on this $sshlogin - debug("run","No jobs started on ", $sshlogin->string(), "\n"); - next; - } - $sshlogin->inc_jobs_running(); - $sshlogin->set_last_login_at(::now()); - $jobs_started++; - $jobs_started_this_round++; - } - debug("run","Running jobs after on ", $sshlogin->string(), ": ", - $sshlogin->jobs_running(), " of ", - $sshlogin->max_jobs_running(), "\n"); - } - } while($jobs_started_this_round); - - return $jobs_started; -} -} - -{ - my $no_more_file_handles_warned; - -sub start_another_job { - # If there are enough filehandles - # and JobQueue not empty - # and not $job is in joblog - # Then grab a job from Global::JobQueue, - # start it at sshlogin - # mark it as virgin_job - # Inputs: - # $sshlogin = the SSHLogin to start the job on - # Uses: - # $Global::JobQueue - # $opt::pipe - # $opt::results - # $opt::resume - # @Global::virgin_jobs - # Returns: - # 1 if another jobs was started - # 0 otherwise - my $sshlogin = shift; - # Do we have enough file handles to start another job? 
- if(enough_file_handles()) { - if($Global::JobQueue->empty() and not $opt::pipe) { - # No more commands to run - debug("start", "Not starting: JobQueue empty\n"); - return 0; - } else { - my $job; - # Skip jobs already in job log - # Skip jobs already in results - do { - $job = get_job_with_sshlogin($sshlogin); - if(not defined $job) { - # No command available for that sshlogin - debug("start", "Not starting: no jobs available for ", - $sshlogin->string(), "\n"); - return 0; - } - } while ($job->is_already_in_joblog() - or - ($opt::results and $opt::resume and $job->is_already_in_results())); - debug("start", "Command to run on '", $job->sshlogin()->string(), "': '", - $job->replaced(),"'\n"); - if($job->start()) { - if($opt::pipe) { - push(@Global::virgin_jobs,$job); - } - debug("start", "Started as seq ", $job->seq(), - " pid:", $job->pid(), "\n"); - return 1; - } else { - # Not enough processes to run the job. - # Put it back on the queue. - $Global::JobQueue->unget($job); - # Count down the number of jobs to run for this SSHLogin. - my $max = $sshlogin->max_jobs_running(); - if($max > 1) { $max--; } else { - ::error("No more processes: cannot run a single job. Something is wrong.\n"); - ::wait_and_exit(255); - } - $sshlogin->set_max_jobs_running($max); - # Sleep up to 300 ms to give other processes time to die - ::usleep(rand()*300); - ::warning("No more processes: ", - "Decreasing number of running jobs to $max. ", - "Raising ulimit -u or /etc/security/limits.conf may help.\n"); - return 0; - } - } - } else { - # No more file handles - $no_more_file_handles_warned++ or - ::warning("No more file handles. ", - "Raising ulimit -n or /etc/security/limits.conf may help.\n"); - return 0; - } -} -} - -$opt::min_progress_interval = 0; - -sub init_progress { - # Uses: - # $opt::bar - # Returns: - # list of computers for progress output - $|=1; - if (not $Global::is_terminal) { - $opt::min_progress_interval = 30; - } - if($opt::bar) { - return("",""); - } - my %progress = progress(); - return ("\nComputers / CPU cores / Max jobs to run\n", - $progress{'workerlist'}); -} - -sub drain_job_queue { - # Uses: - # $opt::progress - # $Global::original_stderr - # $Global::total_running - # $Global::max_jobs_running - # %Global::running - # $Global::JobQueue - # %Global::host - # $Global::start_no_new_jobs - # Returns: N/A - if($opt::progress) { - print $Global::original_stderr init_progress(); - } - my $last_header=""; - my $sleep = 0.2; - my $last_left = 1000000000; - my $last_progress_time = 0; - my $ps_reported = 0; - do { - while($Global::total_running > 0) { - debug($Global::total_running, "==", scalar - keys %Global::running," slots: ", $Global::max_jobs_running); - if($opt::pipe) { - # When using --pipe sometimes file handles are not closed properly - for my $job (values %Global::running) { - close $job->fh(0,"w"); - } - } - # When not connected to terminal, assume CI (e.g. CircleCI). In - # that case we want occasional progress output to prevent abort - # due to timeout with no output, but we also need to stop sending - # progress output if there has been no actual progress, so that - # the job can time out appropriately (CirecleCI: 10m) in case of - # a hung test. But without special output, it is extremely - # annoying to diagnose which test is hung, so we add that using - # `ps` below. 
- if($opt::progress and - ($Global::is_terminal or (time() - $last_progress_time) >= 30)) { - my %progress = progress(); - if($last_header ne $progress{'header'}) { - print $Global::original_stderr "\n", $progress{'header'}, "\n"; - $last_header = $progress{'header'}; - } - if ($Global::is_terminal) { - print $Global::original_stderr "\r",$progress{'status'}; - } - if ($last_left > $Global::left) { - if (not $Global::is_terminal) { - print $Global::original_stderr $progress{'status'},"\n"; - } - $last_progress_time = time(); - $ps_reported = 0; - } elsif (not $ps_reported and (time() - $last_progress_time) >= 60) { - # No progress in at least 60 seconds: run ps - print $Global::original_stderr "\n"; - system("ps", "-wf"); - $ps_reported = 1; - } - $last_left = $Global::left; - flush $Global::original_stderr; - } - if($Global::total_running < $Global::max_jobs_running - and not $Global::JobQueue->empty()) { - # These jobs may not be started because of loadavg - # or too little time between each ssh login. - if(start_more_jobs() > 0) { - # Exponential back-on if jobs were started - $sleep = $sleep/2+0.001; - } - } - # Sometimes SIGCHLD is not registered, so force reaper - $sleep = ::reap_usleep($sleep); - } - if(not $Global::JobQueue->empty()) { - # These jobs may not be started: - # * because there the --filter-hosts has removed all - if(not %Global::host) { - ::error("There are no hosts left to run on.\n"); - ::wait_and_exit(255); - } - # * because of loadavg - # * because of too little time between each ssh login. - start_more_jobs(); - $sleep = ::reap_usleep($sleep); - if($Global::max_jobs_running == 0) { - ::warning("There are no job slots available. Increase --jobs.\n"); - } - } - } while ($Global::total_running > 0 - or - not $Global::start_no_new_jobs and not $Global::JobQueue->empty()); - if($opt::progress) { - my %progress = progress(); - print $Global::original_stderr $opt::progress_sep, $progress{'status'}, "\n"; - flush $Global::original_stderr; - } -} - -sub toggle_progress { - # Turn on/off progress view - # Uses: - # $opt::progress - # $Global::original_stderr - # Returns: N/A - $opt::progress = not $opt::progress; - if($opt::progress) { - print $Global::original_stderr init_progress(); - } -} - -sub progress { - # Uses: - # $opt::bar - # $opt::eta - # %Global::host - # $Global::total_started - # Returns: - # $workerlist = list of workers - # $header = that will fit on the screen - # $status = message that will fit on the screen - if($opt::bar) { - return ("workerlist" => "", "header" => "", "status" => bar()); - } - my $eta = ""; - my ($status,$header)=("",""); - if($opt::eta) { - my($total, $completed, $left, $pctcomplete, $avgtime, $this_eta) = - compute_eta(); - $eta = sprintf("ETA: %ds Left: %d AVG: %.2fs ", - $this_eta, $left, $avgtime); - $Global::left = $left; - } - my $termcols = terminal_columns(); - my @workers = sort keys %Global::host; - my %sshlogin = map { $_ eq ":" ? ($_=>"local") : ($_=>$_) } @workers; - my $workerno = 1; - my %workerno = map { ($_=>$workerno++) } @workers; - my $workerlist = ""; - for my $w (@workers) { - $workerlist .= - $workerno{$w}.":".$sshlogin{$w} ." / ". - ($Global::host{$w}->ncpus() || "-")." / ". - $Global::host{$w}->max_jobs_running()."\n"; - } - $status = "x"x($termcols+1); - if(length $status > $termcols) { - # sshlogin1:XX/XX/XX%/XX.Xs sshlogin2:XX/XX/XX%/XX.Xs sshlogin3:XX/XX/XX%/XX.Xs - $header = "Computer:jobs running/jobs completed/%of started jobs/Average seconds to complete"; - $status = $eta . 
- join(" ",map - { - if($Global::total_started) { - my $completed = ($Global::host{$_}->jobs_completed()||0); - my $running = $Global::host{$_}->jobs_running(); - my $time = $completed ? (time-$^T)/($completed) : "0"; - sprintf("%s:%d/%d/%d%%/%.1fs ", - $sshlogin{$_}, $running, $completed, - ($running+$completed)*100 - / $Global::total_started, $time); - } - } @workers); - } - if(length $status > $termcols) { - # 1:XX/XX/XX%/XX.Xs 2:XX/XX/XX%/XX.Xs 3:XX/XX/XX%/XX.Xs 4:XX/XX/XX%/XX.Xs - $header = "Computer:jobs running/jobs completed/%of started jobs"; - $status = $eta . - join(" ",map - { - my $completed = ($Global::host{$_}->jobs_completed()||0); - my $running = $Global::host{$_}->jobs_running(); - my $time = $completed ? (time-$^T)/($completed) : "0"; - sprintf("%s:%d/%d/%d%%/%.1fs ", - $workerno{$_}, $running, $completed, - ($running+$completed)*100 - / $Global::total_started, $time); - } @workers); - } - if(length $status > $termcols) { - # sshlogin1:XX/XX/XX% sshlogin2:XX/XX/XX% sshlogin3:XX/XX/XX% - $header = "Computer:jobs running/jobs completed/%of started jobs"; - $status = $eta . - join(" ",map - { sprintf("%s:%d/%d/%d%%", - $sshlogin{$_}, - $Global::host{$_}->jobs_running(), - ($Global::host{$_}->jobs_completed()||0), - ($Global::host{$_}->jobs_running()+ - ($Global::host{$_}->jobs_completed()||0))*100 - / $Global::total_started) } - @workers); - } - if(length $status > $termcols) { - # 1:XX/XX/XX% 2:XX/XX/XX% 3:XX/XX/XX% 4:XX/XX/XX% 5:XX/XX/XX% 6:XX/XX/XX% - $header = "Computer:jobs running/jobs completed/%of started jobs"; - $status = $eta . - join(" ",map - { sprintf("%s:%d/%d/%d%%", - $workerno{$_}, - $Global::host{$_}->jobs_running(), - ($Global::host{$_}->jobs_completed()||0), - ($Global::host{$_}->jobs_running()+ - ($Global::host{$_}->jobs_completed()||0))*100 - / $Global::total_started) } - @workers); - } - if(length $status > $termcols) { - # sshlogin1:XX/XX/XX% sshlogin2:XX/XX/XX% sshlogin3:XX/XX sshlogin4:XX/XX - $header = "Computer:jobs running/jobs completed"; - $status = $eta . - join(" ",map - { sprintf("%s:%d/%d", - $sshlogin{$_}, $Global::host{$_}->jobs_running(), - ($Global::host{$_}->jobs_completed()||0)) } - @workers); - } - if(length $status > $termcols) { - # sshlogin1:XX/XX sshlogin2:XX/XX sshlogin3:XX/XX sshlogin4:XX/XX - $header = "Computer:jobs running/jobs completed"; - $status = $eta . - join(" ",map - { sprintf("%s:%d/%d", - $sshlogin{$_}, $Global::host{$_}->jobs_running(), - ($Global::host{$_}->jobs_completed()||0)) } - @workers); - } - if(length $status > $termcols) { - # 1:XX/XX 2:XX/XX 3:XX/XX 4:XX/XX 5:XX/XX 6:XX/XX - $header = "Computer:jobs running/jobs completed"; - $status = $eta . - join(" ",map - { sprintf("%s:%d/%d", - $workerno{$_}, $Global::host{$_}->jobs_running(), - ($Global::host{$_}->jobs_completed()||0)) } - @workers); - } - if(length $status > $termcols) { - # sshlogin1:XX sshlogin2:XX sshlogin3:XX sshlogin4:XX sshlogin5:XX - $header = "Computer:jobs completed"; - $status = $eta . - join(" ",map - { sprintf("%s:%d", - $sshlogin{$_}, - ($Global::host{$_}->jobs_completed()||0)) } - @workers); - } - if(length $status > $termcols) { - # 1:XX 2:XX 3:XX 4:XX 5:XX 6:XX - $header = "Computer:jobs completed"; - $status = $eta . 
- join(" ",map - { sprintf("%s:%d", - $workerno{$_}, - ($Global::host{$_}->jobs_completed()||0)) } - @workers); - } - return ("workerlist" => $workerlist, "header" => $header, "status" => $status); -} - -{ - my ($total, $first_completed, $smoothed_avg_time); - - sub compute_eta { - # Calculate important numbers for ETA - # Returns: - # $total = number of jobs in total - # $completed = number of jobs completed - # $left = number of jobs left - # $pctcomplete = percent of jobs completed - # $avgtime = averaged time - # $eta = smoothed eta - $total ||= $Global::JobQueue->total_jobs(); - my $completed = 0; - for(values %Global::host) { $completed += $_->jobs_completed() } - my $left = $total - $completed; - if(not $completed) { - return($total, $completed, $left, 0, 0, 0); - } - my $pctcomplete = $completed / $total; - $first_completed ||= time; - my $timepassed = (time - $first_completed); - my $avgtime = $timepassed / $completed; - $smoothed_avg_time ||= $avgtime; - # Smooth the eta so it does not jump wildly - $smoothed_avg_time = (1 - $pctcomplete) * $smoothed_avg_time + - $pctcomplete * $avgtime; - my $eta = int($left * $smoothed_avg_time); - return($total, $completed, $left, $pctcomplete, $avgtime, $eta); - } -} - -{ - my ($rev,$reset); - - sub bar { - # Return: - # $status = bar with eta, completed jobs, arg and pct - $rev ||= "\033[7m"; - $reset ||= "\033[0m"; - my($total, $completed, $left, $pctcomplete, $avgtime, $eta) = - compute_eta(); - my $arg = $Global::newest_job ? - $Global::newest_job->{'commandline'}->replace_placeholders(["\257<\257>"],0,0) : ""; - # These chars mess up display in the terminal - $arg =~ tr/[\011-\016\033\302-\365]//d; - my $bar_text = - sprintf("%d%% %d:%d=%ds %s", - $pctcomplete*100, $completed, $left, $eta, $arg); - my $terminal_width = terminal_columns(); - my $s = sprintf("%-${terminal_width}s", - substr($bar_text." "x$terminal_width, - 0,$terminal_width)); - my $width = int($terminal_width * $pctcomplete); - substr($s,$width,0) = $reset; - my $zenity = sprintf("%-${terminal_width}s", - substr("# $eta sec $arg", - 0,$terminal_width)); - $s = "\r" . $zenity . "\r" . $pctcomplete*100 . # Prefix with zenity header - "\r" . $rev . $s . 
$reset; - return $s; - } -} - -{ - my ($columns,$last_column_time); - - sub terminal_columns { - # Get the number of columns of the display - # Returns: - # number of columns of the screen - if(not $columns or $last_column_time < time) { - $last_column_time = time; - $columns = $ENV{'COLUMNS'}; - if(not $columns) { - my $resize = qx{ resize 2>/dev/null }; - $resize =~ /COLUMNS=(\d+);/ and do { $columns = $1; }; - } - $columns ||= 80; - } - return $columns; - } -} - -sub get_job_with_sshlogin { - # Returns: - # next job object for $sshlogin if any available - my $sshlogin = shift; - my $job = undef; - - if ($opt::hostgroups) { - my @other_hostgroup_jobs = (); - - while($job = $Global::JobQueue->get()) { - if($sshlogin->in_hostgroups($job->hostgroups())) { - # Found a job for this hostgroup - last; - } else { - # This job was not in the hostgroups of $sshlogin - push @other_hostgroup_jobs, $job; - } - } - $Global::JobQueue->unget(@other_hostgroup_jobs); - if(not defined $job) { - # No more jobs - return undef; - } - } else { - $job = $Global::JobQueue->get(); - if(not defined $job) { - # No more jobs - ::debug("start", "No more jobs: JobQueue empty\n"); - return undef; - } - } - - my $clean_command = $job->replaced(); - if($clean_command =~ /^\s*$/) { - # Do not run empty lines - if(not $Global::JobQueue->empty()) { - return get_job_with_sshlogin($sshlogin); - } else { - return undef; - } - } - $job->set_sshlogin($sshlogin); - if($opt::retries and $clean_command and - $job->failed_here()) { - # This command with these args failed for this sshlogin - my ($no_of_failed_sshlogins,$min_failures) = $job->min_failed(); - # Only look at the Global::host that have > 0 jobslots - if($no_of_failed_sshlogins == grep { $_->max_jobs_running() > 0 } values %Global::host - and $job->failed_here() == $min_failures) { - # It failed the same or more times on another host: - # run it on this host - } else { - # If it failed fewer times on another host: - # Find another job to run - my $nextjob; - if(not $Global::JobQueue->empty()) { - # This can potentially recurse for all args - no warnings 'recursion'; - $nextjob = get_job_with_sshlogin($sshlogin); - } - # Push the command back on the queue - $Global::JobQueue->unget($job); - return $nextjob; - } - } - return $job; -} - -sub __REMOTE_SSH__ {} - -sub read_sshloginfiles { - # Returns: N/A - for my $s (@_) { - read_sshloginfile(expand_slf_shorthand($s)); - } -} - -sub expand_slf_shorthand { - my $file = shift; - if($file eq "-") { - # skip: It is stdin - } elsif($file eq "..") { - $file = $ENV{'HOME'}."/.parallel/sshloginfile"; - } elsif($file eq ".") { - $file = "/etc/parallel/sshloginfile"; - } elsif(not -r $file) { - if(not -r $ENV{'HOME'}."/.parallel/".$file) { - # Try prepending ~/.parallel - ::error("Cannot open $file.\n"); - ::wait_and_exit(255); - } else { - $file = $ENV{'HOME'}."/.parallel/".$file; - } - } - return $file; -} - -sub read_sshloginfile { - # Returns: N/A - my $file = shift; - my $close = 1; - my $in_fh; - ::debug("init","--slf ",$file); - if($file eq "-") { - $in_fh = *STDIN; - $close = 0; - } else { - if(not open($in_fh, "<", $file)) { - # Try the filename - ::error("Cannot open $file.\n"); - ::wait_and_exit(255); - } - } - while(<$in_fh>) { - chomp; - /^\s*#/ and next; - /^\s*$/ and next; - push @Global::sshlogin, $_; - } - if($close) { - close $in_fh; - } -} - -sub parse_sshlogin { - # Returns: N/A - my @login; - if(not @Global::sshlogin) { @Global::sshlogin = (":"); } - for my $sshlogin (@Global::sshlogin) { - # Split up -S 
sshlogin,sshlogin - for my $s (split /,/, $sshlogin) { - if ($s eq ".." or $s eq "-") { - # This may add to @Global::sshlogin - possibly bug - read_sshloginfile(expand_slf_shorthand($s)); - } else { - push (@login, $s); - } - } - } - $Global::minimal_command_line_length = 8_000_000; - my @allowed_hostgroups; - for my $ncpu_sshlogin_string (::uniq(@login)) { - my $sshlogin = SSHLogin->new($ncpu_sshlogin_string); - my $sshlogin_string = $sshlogin->string(); - if($sshlogin_string eq "") { - # This is an ssh group: -S @webservers - push @allowed_hostgroups, $sshlogin->hostgroups(); - next; - } - if($Global::host{$sshlogin_string}) { - # This sshlogin has already been added: - # It is probably a host that has come back - # Set the max_jobs_running back to the original - debug("run","Already seen $sshlogin_string\n"); - if($sshlogin->{'ncpus'}) { - # If ncpus set by '#/' of the sshlogin, overwrite it: - $Global::host{$sshlogin_string}->set_ncpus($sshlogin->ncpus()); - } - $Global::host{$sshlogin_string}->set_max_jobs_running(undef); - next; - } - if($sshlogin_string eq ":") { - $sshlogin->set_maxlength(Limits::Command::max_length()); - } else { - # If all chars needs to be quoted, every other character will be \ - $sshlogin->set_maxlength(int(Limits::Command::max_length()/2)); - } - $Global::minimal_command_line_length = - ::min($Global::minimal_command_line_length, $sshlogin->maxlength()); - $Global::host{$sshlogin_string} = $sshlogin; - } - if(@allowed_hostgroups) { - # Remove hosts that are not in these groups - while (my ($string, $sshlogin) = each %Global::host) { - if(not $sshlogin->in_hostgroups(@allowed_hostgroups)) { - delete $Global::host{$string}; - } - } - } - - # debug("start", "sshlogin: ", my_dump(%Global::host),"\n"); - if($opt::transfer or @opt::return or $opt::cleanup or @opt::basefile) { - if(not remote_hosts()) { - # There are no remote hosts - if(@opt::trc) { - ::warning("--trc ignored as there are no remote --sshlogin.\n"); - } elsif (defined $opt::transfer) { - ::warning("--transfer ignored as there are no remote --sshlogin.\n"); - } elsif (@opt::return) { - ::warning("--return ignored as there are no remote --sshlogin.\n"); - } elsif (defined $opt::cleanup) { - ::warning("--cleanup ignored as there are no remote --sshlogin.\n"); - } elsif (@opt::basefile) { - ::warning("--basefile ignored as there are no remote --sshlogin.\n"); - } - } - } -} - -sub remote_hosts { - # Return sshlogins that are not ':' - # Returns: - # list of sshlogins with ':' removed - return grep !/^:$/, keys %Global::host; -} - -sub setup_basefile { - # Transfer basefiles to each $sshlogin - # This needs to be done before first jobs on $sshlogin is run - # Returns: N/A - my $cmd = ""; - my $rsync_destdir; - my $workdir; - for my $sshlogin (values %Global::host) { - if($sshlogin->string() eq ":") { next } - for my $file (@opt::basefile) { - if($file !~ m:^/: and $opt::workdir eq "...") { - ::error("Work dir '...' will not work with relative basefiles\n"); - ::wait_and_exit(255); - } - $workdir ||= Job->new("")->workdir(); - $cmd .= $sshlogin->rsync_transfer_cmd($file,$workdir) . 
"&"; - } - } - $cmd .= "wait;"; - debug("init", "basesetup: $cmd\n"); - print `$cmd`; -} - -sub cleanup_basefile { - # Remove the basefiles transferred - # Returns: N/A - my $cmd=""; - my $workdir = Job->new("")->workdir(); - for my $sshlogin (values %Global::host) { - if($sshlogin->string() eq ":") { next } - for my $file (@opt::basefile) { - $cmd .= $sshlogin->cleanup_cmd($file,$workdir)."&"; - } - } - $cmd .= "wait;"; - debug("init", "basecleanup: $cmd\n"); - print `$cmd`; -} - -sub filter_hosts { - my(@cores, @cpus, @maxline, @echo); - my $envvar = ::shell_quote_scalar($Global::envvar); - while (my ($host, $sshlogin) = each %Global::host) { - if($host eq ":") { next } - # The 'true' is used to get the $host out later - my $sshcmd = "true $host;" . $sshlogin->sshcommand()." ".$sshlogin->serverlogin(); - push(@cores, $host."\t".$sshcmd." ".$envvar." parallel --number-of-cores\n\0"); - push(@cpus, $host."\t".$sshcmd." ".$envvar." parallel --number-of-cpus\n\0"); - push(@maxline, $host."\t".$sshcmd." ".$envvar." parallel --max-line-length-allowed\n\0"); - # 'echo' is used to get the best possible value for an ssh login time - push(@echo, $host."\t".$sshcmd." echo\n\0"); - } - my ($fh, $tmpfile) = ::tmpfile(SUFFIX => ".ssh"); - print $fh @cores, @cpus, @maxline, @echo; - close $fh; - # --timeout 5: Setting up an SSH connection and running a simple - # command should never take > 5 sec. - # --delay 0.1: If multiple sshlogins use the same proxy the delay - # will make it less likely to overload the ssh daemon. - # --retries 3: If the ssh daemon it overloaded, try 3 times - # -s 16000: Half of the max line on UnixWare - my $cmd = "cat $tmpfile | $0 -j0 --timeout 5 -s 16000 --joblog - --plain --delay 0.1 --retries 3 --tag --tagstring {1} -0 --colsep '\t' -k eval {2} 2>/dev/null"; - ::debug("init", $cmd, "\n"); - open(my $host_fh, "-|", $cmd) || ::die_bug("parallel host check: $cmd"); - my (%ncores, %ncpus, %time_to_login, %maxlen, %echo, @down_hosts); - my $prepend = ""; - while(<$host_fh>) { - if(/\'$/) { - # if last char = ' then append next line - # This may be due to quoting of $Global::envvar - $prepend .= $_; - next; - } - $_ = $prepend . $_; - $prepend = ""; - chomp; - my @col = split /\t/, $_; - if(defined $col[6]) { - # This is a line from --joblog - # seq host time spent sent received exit signal command - # 2 : 1372607672.654 0.675 0 0 0 0 eval true\ m\;ssh\ m\ parallel\ --number-of-cores - if($col[0] eq "Seq" and $col[1] eq "Host" and - $col[2] eq "Starttime") { - # Header => skip - next; - } - # Get server from: eval true server\; - $col[8] =~ /eval true..([^;]+).;/ or ::die_bug("col8 does not contain host: $col[8]"); - my $host = $1; - $host =~ tr/\\//d; - $Global::host{$host} or next; - if($col[6] eq "255" or $col[7] eq "15") { - # exit == 255 or signal == 15: ssh failed - # Remove sshlogin - ::debug("init", "--filtered $host\n"); - push(@down_hosts, $host); - @down_hosts = uniq(@down_hosts); - } elsif($col[6] eq "127") { - # signal == 127: parallel not installed remote - # Set ncpus and ncores = 1 - ::warning("Could not figure out ", - "number of cpus on $host. 
Using 1.\n"); - $ncores{$host} = 1; - $ncpus{$host} = 1; - $maxlen{$host} = Limits::Command::max_length(); - } elsif($col[0] =~ /^\d+$/ and $Global::host{$host}) { - # Remember how log it took to log in - # 2 : 1372607672.654 0.675 0 0 0 0 eval true\ m\;ssh\ m\ echo - $time_to_login{$host} = ::min($time_to_login{$host},$col[3]); - } else { - ::die_bug("host check unmatched long jobline: $_"); - } - } elsif($Global::host{$col[0]}) { - # This output from --number-of-cores, --number-of-cpus, - # --max-line-length-allowed - # ncores: server 8 - # ncpus: server 2 - # maxlen: server 131071 - if(not $ncores{$col[0]}) { - $ncores{$col[0]} = $col[1]; - } elsif(not $ncpus{$col[0]}) { - $ncpus{$col[0]} = $col[1]; - } elsif(not $maxlen{$col[0]}) { - $maxlen{$col[0]} = $col[1]; - } elsif(not $echo{$col[0]}) { - $echo{$col[0]} = $col[1]; - } elsif(m/perl: warning:|LANGUAGE =|LC_ALL =|LANG =|are supported and installed/) { - # Skip these: - # perl: warning: Setting locale failed. - # perl: warning: Please check that your locale settings: - # LANGUAGE = (unset), - # LC_ALL = (unset), - # LANG = "en_US.UTF-8" - # are supported and installed on your system. - # perl: warning: Falling back to the standard locale ("C"). - } else { - ::die_bug("host check too many col0: $_"); - } - } else { - ::die_bug("host check unmatched short jobline ($col[0]): $_"); - } - } - close $host_fh; - $Global::debug or unlink $tmpfile; - delete @Global::host{@down_hosts}; - @down_hosts and ::warning("Removed @down_hosts\n"); - $Global::minimal_command_line_length = 8_000_000; - while (my ($sshlogin, $obj) = each %Global::host) { - if($sshlogin eq ":") { next } - $ncpus{$sshlogin} or ::die_bug("ncpus missing: ".$obj->serverlogin()); - $ncores{$sshlogin} or ::die_bug("ncores missing: ".$obj->serverlogin()); - $time_to_login{$sshlogin} or ::die_bug("time_to_login missing: ".$obj->serverlogin()); - $maxlen{$sshlogin} or ::die_bug("maxlen missing: ".$obj->serverlogin()); - if($opt::use_cpus_instead_of_cores) { - $obj->set_ncpus($ncpus{$sshlogin}); - } else { - $obj->set_ncpus($ncores{$sshlogin}); - } - $obj->set_time_to_login($time_to_login{$sshlogin}); - $obj->set_maxlength($maxlen{$sshlogin}); - $Global::minimal_command_line_length = - ::min($Global::minimal_command_line_length, - int($maxlen{$sshlogin}/2)); - ::debug("init", "Timing from -S:$sshlogin ncpus:",$ncpus{$sshlogin}, - " ncores:", $ncores{$sshlogin}, - " time_to_login:", $time_to_login{$sshlogin}, - " maxlen:", $maxlen{$sshlogin}, - " min_max_len:", $Global::minimal_command_line_length,"\n"); - } -} - -sub onall { - sub tmp_joblog { - my $joblog = shift; - if(not defined $joblog) { - return undef; - } - my ($fh, $tmpfile) = ::tmpfile(SUFFIX => ".log"); - close $fh; - return $tmpfile; - } - my @command = @_; - if($Global::quoting) { - @command = shell_quote_empty(@command); - } - - # Copy all @fhlist into tempfiles - my @argfiles = (); - for my $fh (@fhlist) { - my ($outfh, $name) = ::tmpfile(SUFFIX => ".all", UNLINK => 1); - print $outfh (<$fh>); - close $outfh; - push @argfiles, $name; - } - if(@opt::basefile) { setup_basefile(); } - # for each sshlogin do: - # parallel -S $sshlogin $command :::: @argfiles - # - # Pass some of the options to the sub-parallels, not all of them as - # -P should only go to the first, and -S should not be copied at all. - my $options = - join(" ", - ((defined $opt::jobs) ? "-P $opt::jobs" : ""), - ((defined $opt::linebuffer) ? "--linebuffer" : ""), - ((defined $opt::ungroup) ? "-u" : ""), - ((defined $opt::group) ? 
"-g" : ""), - ((defined $opt::keeporder) ? "--keeporder" : ""), - ((defined $opt::D) ? "-D $opt::D" : ""), - ((defined $opt::plain) ? "--plain" : ""), - ((defined $opt::max_chars) ? "--max-chars ".$opt::max_chars : ""), - ); - my $suboptions = - join(" ", - ((defined $opt::ungroup) ? "-u" : ""), - ((defined $opt::linebuffer) ? "--linebuffer" : ""), - ((defined $opt::group) ? "-g" : ""), - ((defined $opt::files) ? "--files" : ""), - ((defined $opt::keeporder) ? "--keeporder" : ""), - ((defined $opt::colsep) ? "--colsep ".shell_quote($opt::colsep) : ""), - ((@opt::v) ? "-vv" : ""), - ((defined $opt::D) ? "-D $opt::D" : ""), - ((defined $opt::timeout) ? "--timeout ".$opt::timeout : ""), - ((defined $opt::plain) ? "--plain" : ""), - ((defined $opt::retries) ? "--retries ".$opt::retries : ""), - ((defined $opt::max_chars) ? "--max-chars ".$opt::max_chars : ""), - ((defined $opt::arg_sep) ? "--arg-sep ".$opt::arg_sep : ""), - ((defined $opt::arg_file_sep) ? "--arg-file-sep ".$opt::arg_file_sep : ""), - (@opt::env ? map { "--env ".::shell_quote_scalar($_) } @opt::env : ""), - ); - ::debug("init", "| $0 $options\n"); - open(my $parallel_fh, "|-", "$0 --no-notice -j0 $options") || - ::die_bug("This does not run GNU Parallel: $0 $options"); - my @joblogs; - for my $host (sort keys %Global::host) { - my $sshlogin = $Global::host{$host}; - my $joblog = tmp_joblog($opt::joblog); - if($joblog) { - push @joblogs, $joblog; - $joblog = "--joblog $joblog"; - } - my $quad = $opt::arg_file_sep || "::::"; - ::debug("init", "$0 $suboptions -j1 $joblog ", - ((defined $opt::tag) ? - "--tagstring ".shell_quote_scalar($sshlogin->string()) : ""), - " -S ", shell_quote_scalar($sshlogin->string())," ", - join(" ",shell_quote(@command))," $quad @argfiles\n"); - print $parallel_fh "$0 $suboptions -j1 $joblog ", - ((defined $opt::tag) ? - "--tagstring ".shell_quote_scalar($sshlogin->string()) : ""), - " -S ", shell_quote_scalar($sshlogin->string())," ", - join(" ",shell_quote(@command))," $quad @argfiles\n"; - } - close $parallel_fh; - $Global::exitstatus = $? >> 8; - debug("init", "--onall exitvalue ", $?); - if(@opt::basefile) { cleanup_basefile(); } - $Global::debug or unlink(@argfiles); - my %seen; - for my $joblog (@joblogs) { - # Append to $joblog - open(my $fh, "<", $joblog) || ::die_bug("Cannot open tmp joblog $joblog"); - # Skip first line (header); - <$fh>; - print $Global::joblog (<$fh>); - close $fh; - unlink($joblog); - } -} - -sub __SIGNAL_HANDLING__ {} - -sub save_original_signal_handler { - # Remember the original signal handler - # Returns: N/A - $SIG{TERM} ||= sub { exit 0; }; # $SIG{TERM} is not set on Mac OS X - $SIG{INT} = sub { if($opt::tmux) { qx { tmux kill-session -t p$$ }; } - unlink keys %Global::unlink; exit -1 }; - $SIG{TERM} = sub { if($opt::tmux) { qx { tmux kill-session -t p$$ }; } - unlink keys %Global::unlink; exit -1 }; - %Global::original_sig = %SIG; - $SIG{TERM} = sub {}; # Dummy until jobs really start -} - -sub list_running_jobs { - # Returns: N/A - for my $v (values %Global::running) { - print $Global::original_stderr "$Global::progname: ",$v->replaced(),"\n"; - } -} - -sub start_no_new_jobs { - # Returns: N/A - $SIG{TERM} = $Global::original_sig{TERM}; - print $Global::original_stderr - ("$Global::progname: SIGTERM received. No new jobs will be started.\n", - "$Global::progname: Waiting for these ", scalar(keys %Global::running), - " jobs to finish. 
Send SIGTERM again to stop now.\n"); - list_running_jobs(); - $Global::start_no_new_jobs ||= 1; -} - -sub reaper { - # A job finished. - # Print the output. - # Start another job - # Returns: N/A - my $stiff; - my $children_reaped = 0; - debug("run", "Reaper "); - while (($stiff = waitpid(-1, &WNOHANG)) > 0) { - $children_reaped++; - if($Global::sshmaster{$stiff}) { - # This is one of the ssh -M: ignore - next; - } - my $job = $Global::running{$stiff}; - # '-a <(seq 10)' will give us a pid not in %Global::running - $job or next; - $job->set_exitstatus($? >> 8); - $job->set_exitsignal($? & 127); - debug("run", "died (", $job->exitstatus(), "): ", $job->seq()); - $job->set_endtime(::now()); - if($stiff == $Global::tty_taken) { - # The process that died had the tty => release it - $Global::tty_taken = 0; - } - - if(not $job->should_be_retried()) { - # The job is done - # Free the jobslot - push @Global::slots, $job->slot(); - if($opt::timeout) { - # Update average runtime for timeout - $Global::timeoutq->update_delta_time($job->runtime()); - } - # Force printing now if the job failed and we are going to exit - my $print_now = ($opt::halt_on_error and $opt::halt_on_error == 2 - and $job->exitstatus()); - if($opt::keeporder and not $print_now) { - print_earlier_jobs($job); - } else { - $job->print(); - } - if($job->exitstatus()) { - process_failed_job($job); - } - - } - my $sshlogin = $job->sshlogin(); - $sshlogin->dec_jobs_running(); - $sshlogin->inc_jobs_completed(); - $Global::total_running--; - delete $Global::running{$stiff}; - start_more_jobs(); - } - debug("run", "done "); - return $children_reaped; -} - -sub process_failed_job { - # The jobs had a exit status <> 0, so error - # Returns: N/A - my $job = shift; - $Global::exitstatus++; - $Global::total_failed++; - if($opt::halt_on_error) { - if($opt::halt_on_error == 1 - or - ($opt::halt_on_error < 1 and $Global::total_failed > 3 - and - $Global::total_failed / $Global::total_started > $opt::halt_on_error)) { - # If halt on error == 1 or --halt 10% - # we should gracefully exit - print $Global::original_stderr - ("$Global::progname: Starting no more jobs. ", - "Waiting for ", scalar(keys %Global::running), - " jobs to finish. 
This job failed:\n", - $job->replaced(),"\n"); - $Global::start_no_new_jobs ||= 1; - $Global::halt_on_error_exitstatus = $job->exitstatus(); - } elsif($opt::halt_on_error == 2) { - # If halt on error == 2 we should exit immediately - print $Global::original_stderr - ("$Global::progname: This job failed:\n", - $job->replaced(),"\n"); - exit ($job->exitstatus()); - } - } -} - -{ - my (%print_later,$job_end_sequence); - - sub print_earlier_jobs { - # Print jobs completed earlier - # Returns: N/A - my $job = shift; - $print_later{$job->seq()} = $job; - $job_end_sequence ||= 1; - debug("run", "Looking for: $job_end_sequence ", - "Current: ", $job->seq(), "\n"); - for(my $j = $print_later{$job_end_sequence}; - $j or vec($Global::job_already_run,$job_end_sequence,1); - $job_end_sequence++, - $j = $print_later{$job_end_sequence}) { - debug("run", "Found job end $job_end_sequence"); - if($j) { - $j->print(); - delete $print_later{$job_end_sequence}; - } - } - } -} - -sub __USAGE__ {} - -sub wait_and_exit { - # If we do not wait, we sometimes get segfault - # Returns: N/A - my $error = shift; - if($error) { - # Kill all without printing - for my $job (values %Global::running) { - $job->kill("TERM"); - $job->kill("TERM"); - } - } - for (keys %Global::unkilled_children) { - kill 9, $_; - waitpid($_,0); - delete $Global::unkilled_children{$_}; - } - wait(); - exit($error); -} - -sub die_usage { - # Returns: N/A - usage(); - wait_and_exit(255); -} - -sub usage { - # Returns: N/A - print join - ("\n", - "Usage:", - "", - "$Global::progname [options] [command [arguments]] < list_of_arguments", - "$Global::progname [options] [command [arguments]] (::: arguments|:::: argfile(s))...", - "cat ... | $Global::progname --pipe [options] [command [arguments]]", - "", - "-j n Run n jobs in parallel", - "-k Keep same order", - "-X Multiple arguments with context replace", - "--colsep regexp Split input on regexp for positional replacements", - "{} {.} {/} {/.} {#} {%} {= perl code =} Replacement strings", - "{3} {3.} {3/} {3/.} {=3 perl code =} Positional replacement strings", - "With --plus: {} = {+/}/{/} = {.}.{+.} = {+/}/{/.}.{+.} = {..}.{+..} =", - " {+/}/{/..}.{+..} = {...}.{+...} = {+/}/{/...}.{+...}", - "", - "-S sshlogin Example: foo\@server.example.com", - "--slf .. Use ~/.parallel/sshloginfile as the list of sshlogins", - "--trc {}.bar Shorthand for --transfer --return {}.bar --cleanup", - "--onall Run the given command with argument on all sshlogins", - "--nonall Run the given command with no arguments on all sshlogins", - "", - "--pipe Split stdin (standard input) to multiple jobs.", - "--recend str Record end separator for --pipe.", - "--recstart str Record start separator for --pipe.", - "", - "See 'man $Global::progname' for details", - "", - "When using programs that use GNU Parallel to process data for publication please cite:", - "", - "O. 
Tange (2011): GNU Parallel - The Command-Line Power Tool,", - ";login: The USENIX Magazine, February 2011:42-47.", - "", - "Or you can get GNU Parallel without this requirement by paying 10000 EUR.", - ""); -} - - -sub citation_notice { - # if --no-notice or --plain: do nothing - # if stderr redirected: do nothing - # if ~/.parallel/will-cite: do nothing - # else: print citation notice to stderr - if($opt::no_notice - or - $opt::plain - or - not -t $Global::original_stderr - or - -e $ENV{'HOME'}."/.parallel/will-cite") { - # skip - } else { - print $Global::original_stderr - ("When using programs that use GNU Parallel to process data for publication please cite:\n", - "\n", - " O. Tange (2011): GNU Parallel - The Command-Line Power Tool,\n", - " ;login: The USENIX Magazine, February 2011:42-47.\n", - "\n", - "This helps funding further development; and it won't cost you a cent.\n", - "Or you can get GNU Parallel without this requirement by paying 10000 EUR.\n", - "\n", - "To silence this citation notice run 'parallel --bibtex' once or use '--no-notice'.\n\n", - ); - flush $Global::original_stderr; - } -} - - -sub warning { - my @w = @_; - my $fh = $Global::original_stderr || *STDERR; - my $prog = $Global::progname || "parallel"; - print $fh $prog, ": Warning: ", @w; -} - - -sub error { - my @w = @_; - my $fh = $Global::original_stderr || *STDERR; - my $prog = $Global::progname || "parallel"; - print $fh $prog, ": Error: ", @w; -} - - -sub die_bug { - my $bugid = shift; - print STDERR - ("$Global::progname: This should not happen. You have found a bug.\n", - "Please contact and include:\n", - "* The version number: $Global::version\n", - "* The bugid: $bugid\n", - "* The command line being run\n", - "* The files being read (put the files on a webserver if they are big)\n", - "\n", - "If you get the error on smaller/fewer files, please include those instead.\n"); - ::wait_and_exit(255); -} - -sub version { - # Returns: N/A - if($opt::tollef and not $opt::gnu) { - print "WARNING: YOU ARE USING --tollef. IF THINGS ARE ACTING WEIRD USE --gnu.\n"; - } - print join("\n", - "GNU $Global::progname $Global::version", - "Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014 Ole Tange and Free Software Foundation, Inc.", - "License GPLv3+: GNU GPL version 3 or later ", - "This is free software: you are free to change and redistribute it.", - "GNU $Global::progname comes with no warranty.", - "", - "Web site: http://www.gnu.org/software/${Global::progname}\n", - "When using programs that use GNU Parallel to process data for publication please cite:\n", - "O. Tange (2011): GNU Parallel - The Command-Line Power Tool, ", - ";login: The USENIX Magazine, February 2011:42-47.\n", - "Or you can get GNU Parallel without this requirement by paying 10000 EUR.\n", - ); -} - -sub bibtex { - # Returns: N/A - if($opt::tollef and not $opt::gnu) { - print "WARNING: YOU ARE USING --tollef. IF THINGS ARE ACTING WEIRD USE --gnu.\n"; - } - print join("\n", - "When using programs that use GNU Parallel to process data for publication please cite:", - "", - "\@article{Tange2011a,", - " title = {GNU Parallel - The Command-Line Power Tool},", - " author = {O. 
Tange},", - " address = {Frederiksberg, Denmark},", - " journal = {;login: The USENIX Magazine},", - " month = {Feb},", - " number = {1},", - " volume = {36},", - " url = {http://www.gnu.org/s/parallel},", - " year = {2011},", - " pages = {42-47}", - "}", - "", - "(Feel free to use \\nocite{Tange2011a})", - "", - "This helps funding further development.", - "", - "Or you can get GNU Parallel without this requirement by paying 10000 EUR.", - "" - ); - while(not -e $ENV{'HOME'}."/.parallel/will-cite") { - print "\nType: 'will cite' and press enter.\n> "; - my $input = ; - if($input =~ /will cite/i) { - mkdir $ENV{'HOME'}."/.parallel"; - open (my $fh, ">", $ENV{'HOME'}."/.parallel/will-cite") - || ::die_bug("Cannot write: ".$ENV{'HOME'}."/.parallel/will-cite"); - close $fh; - print "\nThank you for your support. It is much appreciated. The citation\n", - "notice is now silenced.\n"; - } - } -} - -sub show_limits { - # Returns: N/A - print("Maximal size of command: ",Limits::Command::real_max_length(),"\n", - "Maximal used size of command: ",Limits::Command::max_length(),"\n", - "\n", - "Execution of will continue now, and it will try to read its input\n", - "and run commands; if this is not what you wanted to happen, please\n", - "press CTRL-D or CTRL-C\n"); -} - -sub __GENERIC_COMMON_FUNCTION__ {} - -sub uniq { - # Remove duplicates and return unique values - return keys %{{ map { $_ => 1 } @_ }}; -} - -sub min { - # Returns: - # Minimum value of array - my $min; - for (@_) { - # Skip undefs - defined $_ or next; - defined $min or do { $min = $_; next; }; # Set $_ to the first non-undef - $min = ($min < $_) ? $min : $_; - } - return $min; -} - -sub max { - # Returns: - # Maximum value of array - my $max; - for (@_) { - # Skip undefs - defined $_ or next; - defined $max or do { $max = $_; next; }; # Set $_ to the first non-undef - $max = ($max > $_) ? $max : $_; - } - return $max; -} - -sub sum { - # Returns: - # Sum of values of array - my @args = @_; - my $sum = 0; - for (@args) { - # Skip undefs - $_ and do { $sum += $_; } - } - return $sum; -} - -sub undef_as_zero { - my $a = shift; - return $a ? $a : 0; -} - -sub undef_as_empty { - my $a = shift; - return $a ? $a : ""; -} - -{ - my $hostname; - sub hostname { - if(not $hostname) { - $hostname = `hostname`; - chomp($hostname); - $hostname ||= "nohostname"; - } - return $hostname; - } -} - -sub which { - # Input: - # @programs = programs to find the path to - # Returns: - # @full_path = full paths to @programs. Nothing if not found - my @which; - for my $prg (@_) { - push @which, map { $_."/".$prg } grep { -x $_."/".$prg } split(":",$ENV{'PATH'}); - } - return @which; -} - -{ - my ($regexp,%fakename); - - sub parent_shell { - # Input: - # $pid = pid to see if (grand)*parent is a shell - # Returns: - # $shellpath = path to shell - undef if no shell found - my $pid = shift; - if(not $regexp) { - # All shells known to mankind - # - # ash bash csh dash fdsh fish fizsh ksh ksh93 mksh pdksh - # posh rbash rush rzsh sash sh static-sh tcsh yash zsh - my @shells = qw(ash bash csh dash fdsh fish fizsh ksh - ksh93 mksh pdksh posh rbash rush rzsh - sash sh static-sh tcsh yash zsh -sh -csh); - # Can be formatted as: - # [sh] -sh sh busybox sh - # /bin/sh /sbin/sh /opt/csw/sh - # NOT: foo.sh sshd crash flush pdflush scosh fsflush ssh - my $shell = "(?:".join("|",@shells).")"; - $regexp = '^((\[)('. $shell. ')(\])|(|\S+/|busybox )('. $shell. 
'))($| )'; - %fakename = ( - # csh and tcsh disguise themselves as -sh/-csh - "-sh" => ["csh", "tcsh"], - "-csh" => ["tcsh", "csh"], - ); - } - my ($children_of_ref, $parent_of_ref, $name_of_ref) = pid_table(); - my $shellpath; - my $testpid = $pid; - while($testpid) { - ::debug("init", "shell? ". $name_of_ref->{$testpid}."\n"); - if($name_of_ref->{$testpid} =~ /$regexp/o) { - ::debug("init", "which ".($3||$6)." => "); - $shellpath = (which($3 || $6,@{$fakename{$3 || $6}}))[0]; - ::debug("init", "shell path $shellpath\n"); - $shellpath and last; - } - $testpid = $parent_of_ref->{$testpid}; - } - return $shellpath; - } -} - -{ - my %pid_parentpid_cmd; - - sub pid_table { - # Returns: - # %children_of = { pid -> children of pid } - # %parent_of = { pid -> pid of parent } - # %name_of = { pid -> commandname } - - if(not %pid_parentpid_cmd) { - # Filter for SysV-style `ps` - my $sysv = q( ps -ef | perl -ane '1..1 and /^(.*)CO?MM?A?N?D/ and $s=length $1;). - q(s/^.{$s}//; print "@F[1,2] $_"' ); - # BSD-style `ps` - my $bsd = q(ps -o pid,ppid,command -ax); - %pid_parentpid_cmd = - ( - 'aix' => $sysv, - 'cygwin' => $sysv, - 'msys' => $sysv, - 'dec_osf' => $sysv, - 'darwin' => $bsd, - 'dragonfly' => $bsd, - 'freebsd' => $bsd, - 'gnu' => $sysv, - 'hpux' => $sysv, - 'linux' => $sysv, - 'mirbsd' => $bsd, - 'netbsd' => $bsd, - 'nto' => $sysv, - 'openbsd' => $bsd, - 'solaris' => $sysv, - 'svr5' => $sysv, - ); - } - $pid_parentpid_cmd{$^O} or ::die_bug("pid_parentpid_cmd for $^O missing"); - - my (@pidtable,%parent_of,%children_of,%name_of); - # Table with pid -> children of pid - @pidtable = `$pid_parentpid_cmd{$^O}`; - my $p=$$; - for (@pidtable) { - # must match: 24436 21224 busybox ash - /(\S+)\s+(\S+)\s+(\S+.*)/ or ::die_bug("pidtable format: $_"); - $parent_of{$1} = $2; - push @{$children_of{$2}}, $1; - $name_of{$1} = $3; - } - return(\%children_of, \%parent_of, \%name_of); - } -} - -sub reap_usleep { - # Reap dead children. - # If no dead children: Sleep specified amount with exponential backoff - # Input: - # $ms = milliseconds to sleep - # Returns: - # $ms/2+0.001 if children reaped - # $ms*1.1 if no children reaped - my $ms = shift; - if(reaper()) { - # Sleep exponentially shorter (1/2^n) if a job finished - return $ms/2+0.001; - } else { - if($opt::timeout) { - $Global::timeoutq->process_timeouts(); - } - usleep($ms); - Job::exit_if_disk_full(); - if($opt::linebuffer) { - for my $job (values %Global::running) { - $job->print(); - } - } - # Sleep exponentially longer (1.1^n) if a job did not finish - # though at most 1000 ms. - return (($ms < 1000) ? ($ms * 1.1) : ($ms)); - } -} - -sub usleep { - # Sleep this many milliseconds. 
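# [Editorial sketch, not part of the removed script: the reap_usleep() sub
#  above implements an exponential backoff -- the polling interval is halved
#  whenever a child was reaped and grown by 1.1x (capped at roughly 1000 ms)
#  when nothing finished. A minimal standalone illustration; the
#  work_finished() callback is a hypothetical stand-in for reaper():]
use strict;
use warnings;
use Time::HiRes qw(usleep);

sub backoff_wait {
    my ($work_finished, $ms) = @_;
    $ms ||= 10;
    if ($work_finished->()) {
        return $ms / 2 + 0.001;           # a job ended: poll again sooner
    }
    usleep($ms * 1000);                   # Time::HiRes::usleep takes microseconds
    return $ms < 1000 ? $ms * 1.1 : $ms;  # back off, but never beyond ~1 second
}

# Example: my $ms = 10; $ms = backoff_wait(sub { rand() < 0.1 }, $ms) for 1..20;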
- # Input: - # $ms = milliseconds to sleep - my $ms = shift; - ::debug(int($ms),"ms "); - select(undef, undef, undef, $ms/1000); -} - -sub now { - # Returns time since epoch as in seconds with 3 decimals - # Uses: - # @Global::use - # Returns: - # $time = time now with millisecond accuracy - if(not $Global::use{"Time::HiRes"}) { - if(eval "use Time::HiRes qw ( time );") { - eval "sub TimeHiRestime { return Time::HiRes::time };"; - } else { - eval "sub TimeHiRestime { return time() };"; - } - $Global::use{"Time::HiRes"} = 1; - } - - return (int(TimeHiRestime()*1000))/1000; -} - -sub multiply_binary_prefix { - # Evalualte numbers with binary prefix - # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^70, Zi=2^80, Yi=2^80 - # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^70, zi=2^80, yi=2^80 - # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^70, Z =2^80, Y =2^80 - # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24 - # 13G = 13*1024*1024*1024 = 13958643712 - # Input: - # $s = string with prefixes - # Returns: - # $value = int with prefixes multiplied - my $s = shift; - $s =~ s/ki/*1024/gi; - $s =~ s/mi/*1024*1024/gi; - $s =~ s/gi/*1024*1024*1024/gi; - $s =~ s/ti/*1024*1024*1024*1024/gi; - $s =~ s/pi/*1024*1024*1024*1024*1024/gi; - $s =~ s/ei/*1024*1024*1024*1024*1024*1024/gi; - $s =~ s/zi/*1024*1024*1024*1024*1024*1024*1024/gi; - $s =~ s/yi/*1024*1024*1024*1024*1024*1024*1024*1024/gi; - $s =~ s/xi/*1024*1024*1024*1024*1024*1024*1024*1024*1024/gi; - - $s =~ s/K/*1024/g; - $s =~ s/M/*1024*1024/g; - $s =~ s/G/*1024*1024*1024/g; - $s =~ s/T/*1024*1024*1024*1024/g; - $s =~ s/P/*1024*1024*1024*1024*1024/g; - $s =~ s/E/*1024*1024*1024*1024*1024*1024/g; - $s =~ s/Z/*1024*1024*1024*1024*1024*1024*1024/g; - $s =~ s/Y/*1024*1024*1024*1024*1024*1024*1024*1024/g; - $s =~ s/X/*1024*1024*1024*1024*1024*1024*1024*1024*1024/g; - - $s =~ s/k/*1000/g; - $s =~ s/m/*1000*1000/g; - $s =~ s/g/*1000*1000*1000/g; - $s =~ s/t/*1000*1000*1000*1000/g; - $s =~ s/p/*1000*1000*1000*1000*1000/g; - $s =~ s/e/*1000*1000*1000*1000*1000*1000/g; - $s =~ s/z/*1000*1000*1000*1000*1000*1000*1000/g; - $s =~ s/y/*1000*1000*1000*1000*1000*1000*1000*1000/g; - $s =~ s/x/*1000*1000*1000*1000*1000*1000*1000*1000*1000/g; - - $s = eval $s; - ::debug($s); - return $s; -} - -sub tmpfile { - # Create tempfile as $TMPDIR/parXXXXX - # Returns: - # $filename = file name created - return ::tempfile(DIR=>$ENV{'TMPDIR'}, TEMPLATE => 'parXXXXX', @_); -} - -sub __DEBUGGING__ {} - -sub debug { - # Uses: - # $Global::debug - # %Global::fd - # Returns: N/A - $Global::debug or return; - @_ = grep { defined $_ ? 
$_ : "" } @_; - if($Global::debug eq "all" or $Global::debug eq $_[0]) { - if($Global::fd{1}) { - # Original stdout was saved - my $stdout = $Global::fd{1}; - print $stdout @_[1..$#_]; - } else { - print @_[1..$#_]; - } - } -} - -sub my_memory_usage { - # Returns: - # memory usage if found - # 0 otherwise - use strict; - use FileHandle; - - my $pid = $$; - if(-e "/proc/$pid/stat") { - my $fh = FileHandle->new("; - chomp $data; - $fh->close; - - my @procinfo = split(/\s+/,$data); - - return undef_as_zero($procinfo[22]); - } else { - return 0; - } -} - -sub my_size { - # Returns: - # $size = size of object if Devel::Size is installed - # -1 otherwise - my @size_this = (@_); - eval "use Devel::Size qw(size total_size)"; - if ($@) { - return -1; - } else { - return total_size(@_); - } -} - -sub my_dump { - # Returns: - # ascii expression of object if Data::Dump(er) is installed - # error code otherwise - my @dump_this = (@_); - eval "use Data::Dump qw(dump);"; - if ($@) { - # Data::Dump not installed - eval "use Data::Dumper;"; - if ($@) { - my $err = "Neither Data::Dump nor Data::Dumper is installed\n". - "Not dumping output\n"; - print $Global::original_stderr $err; - return $err; - } else { - return Dumper(@dump_this); - } - } else { - # Create a dummy Data::Dump:dump as Hans Schou sometimes has - # it undefined - eval "sub Data::Dump:dump {}"; - eval "use Data::Dump qw(dump);"; - return (Data::Dump::dump(@dump_this)); - } -} - -sub my_croak { - eval "use Carp; 1"; - $Carp::Verbose = 1; - croak(@_); -} - -sub my_carp { - eval "use Carp; 1"; - $Carp::Verbose = 1; - carp(@_); -} - -sub __OBJECT_ORIENTED_PARTS__ {} - -package SSHLogin; - -sub new { - my $class = shift; - my $sshlogin_string = shift; - my $ncpus; - my %hostgroups; - # SSHLogins can have these formats: - # @grp+grp/ncpu//usr/bin/ssh user@server - # ncpu//usr/bin/ssh user@server - # /usr/bin/ssh user@server - # user@server - # ncpu/user@server - # @grp+grp/user@server - if($sshlogin_string =~ s:^\@([^/]+)/?::) { - # Look for SSHLogin hostgroups - %hostgroups = map { $_ => 1 } split(/\+/, $1); - } - if ($sshlogin_string =~ s:^(\d+)/::) { - # Override default autodetected ncpus unless missing - $ncpus = $1; - } - my $string = $sshlogin_string; - # An SSHLogin is always in the hostgroup of its $string-name - $hostgroups{$string} = 1; - @Global::hostgroups{keys %hostgroups} = values %hostgroups; - my @unget = (); - my $no_slash_string = $string; - $no_slash_string =~ s/[^-a-z0-9:]/_/gi; - return bless { - 'string' => $string, - 'jobs_running' => 0, - 'jobs_completed' => 0, - 'maxlength' => undef, - 'max_jobs_running' => undef, - 'orig_max_jobs_running' => undef, - 'ncpus' => $ncpus, - 'hostgroups' => \%hostgroups, - 'sshcommand' => undef, - 'serverlogin' => undef, - 'control_path_dir' => undef, - 'control_path' => undef, - 'time_to_login' => undef, - 'last_login_at' => undef, - 'loadavg_file' => $ENV{'HOME'} . "/.parallel/tmp/loadavg-" . - $no_slash_string, - 'loadavg' => undef, - 'last_loadavg_update' => 0, - 'swap_activity_file' => $ENV{'HOME'} . "/.parallel/tmp/swap_activity-" . - $no_slash_string, - 'swap_activity' => undef, - }, ref($class) || $class; -} - -sub DESTROY { - my $self = shift; - # Remove temporary files if they are created. 
- unlink $self->{'loadavg_file'}; - unlink $self->{'swap_activity_file'}; -} - -sub string { - my $self = shift; - return $self->{'string'}; -} - -sub jobs_running { - my $self = shift; - - return ($self->{'jobs_running'} || "0"); -} - -sub inc_jobs_running { - my $self = shift; - $self->{'jobs_running'}++; -} - -sub dec_jobs_running { - my $self = shift; - $self->{'jobs_running'}--; -} - -sub set_maxlength { - my $self = shift; - $self->{'maxlength'} = shift; -} - -sub maxlength { - my $self = shift; - return $self->{'maxlength'}; -} - -sub jobs_completed { - my $self = shift; - return $self->{'jobs_completed'}; -} - -sub in_hostgroups { - # Input: - # @hostgroups = the hostgroups to look for - # Returns: - # true if intersection of @hostgroups and the hostgroups of this - # SSHLogin is non-empty - my $self = shift; - return grep { defined $self->{'hostgroups'}{$_} } @_; -} - -sub hostgroups { - my $self = shift; - return keys %{$self->{'hostgroups'}}; -} - -sub inc_jobs_completed { - my $self = shift; - $self->{'jobs_completed'}++; -} - -sub set_max_jobs_running { - my $self = shift; - if(defined $self->{'max_jobs_running'}) { - $Global::max_jobs_running -= $self->{'max_jobs_running'}; - } - $self->{'max_jobs_running'} = shift; - if(defined $self->{'max_jobs_running'}) { - # max_jobs_running could be resat if -j is a changed file - $Global::max_jobs_running += $self->{'max_jobs_running'}; - } - # Initialize orig to the first non-zero value that comes around - $self->{'orig_max_jobs_running'} ||= $self->{'max_jobs_running'}; -} - -sub swapping { - my $self = shift; - my $swapping = $self->swap_activity(); - return (not defined $swapping or $swapping) -} - -sub swap_activity { - # If the currently known swap activity is too old: - # Recompute a new one in the background - # Returns: - # last swap activity computed - my $self = shift; - # Should we update the swap_activity file? - my $update_swap_activity_file = 0; - if(-r $self->{'swap_activity_file'}) { - open(my $swap_fh, "<", $self->{'swap_activity_file'}) || ::die_bug("swap_activity_file-r"); - my $swap_out = <$swap_fh>; - close $swap_fh; - if($swap_out =~ /^(\d+)$/) { - $self->{'swap_activity'} = $1; - ::debug("swap", "New swap_activity: ", $self->{'swap_activity'}); - } - ::debug("swap", "Last update: ", $self->{'last_swap_activity_update'}); - if(time - $self->{'last_swap_activity_update'} > 10) { - # last swap activity update was started 10 seconds ago - ::debug("swap", "Older than 10 sec: ", $self->{'swap_activity_file'}); - $update_swap_activity_file = 1; - } - } else { - ::debug("swap", "No swap_activity file: ", $self->{'swap_activity_file'}); - $self->{'swap_activity'} = undef; - $update_swap_activity_file = 1; - } - if($update_swap_activity_file) { - ::debug("swap", "Updating swap_activity file ", $self->{'swap_activity_file'}); - $self->{'last_swap_activity_update'} = time; - -e $ENV{'HOME'}."/.parallel" or mkdir $ENV{'HOME'}."/.parallel"; - -e $ENV{'HOME'}."/.parallel/tmp" or mkdir $ENV{'HOME'}."/.parallel/tmp"; - my $swap_activity; - $swap_activity = swapactivityscript(); - if($self->{'string'} ne ":") { - $swap_activity = $self->sshcommand() . " " . $self->serverlogin() . " " . - ::shell_quote_scalar($swap_activity); - } - # Run swap_activity measuring. 
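# [Editorial sketch, not part of the removed script: swap_activity() above
#  keeps the last measurement in a small cache file and only re-runs the
#  (possibly slow, possibly remote) measurement when the cache is more than
#  10 seconds old; the refresh writes to a temp file first and renames it,
#  so readers never see a half-written cache. Reduced to the bare pattern,
#  with placeholder command and file names:]
use strict;
use warnings;
use File::Temp qw(tempfile);

sub cached_measurement {
    my ($cmd, $cache_file, $last_update_ref) = @_;
    my $value;
    if (open my $fh, '<', $cache_file) {
        chomp($value = <$fh> // '');
        close $fh;
    }
    if (time - $$last_update_ref > 10) {    # stale: refresh in the background
        $$last_update_ref = time;
        my (undef, $tmp) = tempfile(SUFFIX => '.tmp');
        system("( $cmd > $tmp && mv $tmp $cache_file || rm -f $tmp ) &");
    }
    return $value;                          # may be undef on the first call
}

# my $last = 0;
# my $swap = cached_measurement("vmstat 1 2 | tail -n1", "/tmp/swap.cache", \$last);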
- # As the command can take long to run if run remote - # save it to a tmp file before moving it to the correct file - my $file = $self->{'swap_activity_file'}; - my ($dummy_fh, $tmpfile) = ::tmpfile(SUFFIX => ".swp"); - ::debug("swap", "\n", $swap_activity, "\n"); - qx{ ($swap_activity > $tmpfile && mv $tmpfile $file || rm $tmpfile) & }; - } - return $self->{'swap_activity'}; -} - -{ - my $script; - - sub swapactivityscript { - # Returns: - # shellscript for detecting swap activity - # - # arguments for vmstat are OS dependant - # swap_in and swap_out are in different columns depending on OS - # - if(not $script) { - my %vmstat = ( - # linux: $7*$8 - # $ vmstat 1 2 - # procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- - # r b swpd free buff cache si so bi bo in cs us sy id wa - # 5 0 51208 1701096 198012 18857888 0 0 37 153 28 19 56 11 33 1 - # 3 0 51208 1701288 198012 18857972 0 0 0 0 3638 10412 15 3 82 0 - 'linux' => ['vmstat 1 2 | tail -n1', '$7*$8'], - - # solaris: $6*$7 - # $ vmstat -S 1 2 - # kthr memory page disk faults cpu - # r b w swap free si so pi po fr de sr s3 s4 -- -- in sy cs us sy id - # 0 0 0 4628952 3208408 0 0 3 1 1 0 0 -0 2 0 0 263 613 246 1 2 97 - # 0 0 0 4552504 3166360 0 0 0 0 0 0 0 0 0 0 0 246 213 240 1 1 98 - 'solaris' => ['vmstat -S 1 2 | tail -1', '$6*$7'], - - # darwin (macosx): $21*$22 - # $ vm_stat -c 2 1 - # Mach Virtual Memory Statistics: (page size of 4096 bytes) - # free active specul inactive throttle wired prgable faults copy 0fill reactive purged file-backed anonymous cmprssed cmprssor dcomprs comprs pageins pageout swapins swapouts - # 346306 829050 74871 606027 0 240231 90367 544858K 62343596 270837K 14178 415070 570102 939846 356 370 116 922 4019813 4 0 0 - # 345740 830383 74875 606031 0 239234 90369 2696 359 553 0 0 570110 941179 356 370 0 0 0 0 0 0 - 'darwin' => ['vm_stat -c 2 1 | tail -n1', '$21*$22'], - - # ultrix: $12*$13 - # $ vmstat -S 1 2 - # procs faults cpu memory page disk - # r b w in sy cs us sy id avm fre si so pi po fr de sr s0 - # 1 0 0 4 23 2 3 0 97 7743 217k 0 0 0 0 0 0 0 0 - # 1 0 0 6 40 8 0 1 99 7743 217k 0 0 3 0 0 0 0 0 - 'ultrix' => ['vmstat -S 1 2 | tail -1', '$12*$13'], - - # aix: $6*$7 - # $ vmstat 1 2 - # System configuration: lcpu=1 mem=2048MB - # - # kthr memory page faults cpu - # ----- ----------- ------------------------ ------------ ----------- - # r b avm fre re pi po fr sr cy in sy cs us sy id wa - # 0 0 333933 241803 0 0 0 0 0 0 10 143 90 0 0 99 0 - # 0 0 334125 241569 0 0 0 0 0 0 37 5368 184 0 9 86 5 - 'aix' => ['vmstat 1 2 | tail -n1', '$6*$7'], - - # freebsd: $8*$9 - # $ vmstat -H 1 2 - # procs memory page disks faults cpu - # r b w avm fre flt re pi po fr sr ad0 ad1 in sy cs us sy id - # 1 0 0 596716 19560 32 0 0 0 33 8 0 0 11 220 277 0 0 99 - # 0 0 0 596716 19560 2 0 0 0 0 0 0 0 11 144 263 0 1 99 - 'freebsd' => ['vmstat -H 1 2 | tail -n1', '$8*$9'], - - # mirbsd: $8*$9 - # $ vmstat 1 2 - # procs memory page disks traps cpu - # r b w avm fre flt re pi po fr sr wd0 cd0 int sys cs us sy id - # 0 0 0 25776 164968 34 0 0 0 0 0 0 0 230 259 38 4 0 96 - # 0 0 0 25776 164968 24 0 0 0 0 0 0 0 237 275 37 0 0 100 - 'mirbsd' => ['vmstat 1 2 | tail -n1', '$8*$9'], - - # netbsd: $7*$8 - # $ vmstat 1 2 - # procs memory page disks faults cpu - # r b avm fre flt re pi po fr sr w0 w1 in sy cs us sy id - # 0 0 138452 6012 54 0 0 0 1 2 3 0 4 100 23 0 0 100 - # 0 0 138456 6008 1 0 0 0 0 0 0 0 7 26 19 0 0 100 - 'netbsd' => ['vmstat 1 2 | tail -n1', '$7*$8'], - - # openbsd: $8*$9 - # $ vmstat 1 2 - # 
procs memory page disks traps cpu - # r b w avm fre flt re pi po fr sr wd0 wd1 int sys cs us sy id - # 0 0 0 76596 109944 73 0 0 0 0 0 0 1 5 259 22 0 1 99 - # 0 0 0 76604 109936 24 0 0 0 0 0 0 0 7 114 20 0 1 99 - 'openbsd' => ['vmstat 1 2 | tail -n1', '$8*$9'], - - # hpux: $8*$9 - # $ vmstat 1 2 - # procs memory page faults cpu - # r b w avm free re at pi po fr de sr in sy cs us sy id - # 1 0 0 247211 216476 4 1 0 0 0 0 0 102 73005 54 6 11 83 - # 1 0 0 247211 216421 43 9 0 0 0 0 0 144 1675 96 25269512791222387000 25269512791222387000 105 - 'hpux' => ['vmstat 1 2 | tail -n1', '$8*$9'], - - # dec_osf (tru64): $11*$12 - # $ vmstat 1 2 - # Virtual Memory Statistics: (pagesize = 8192) - # procs memory pages intr cpu - # r w u act free wire fault cow zero react pin pout in sy cs us sy id - # 3 181 36 51K 1895 8696 348M 59M 122M 259 79M 0 5 218 302 4 1 94 - # 3 181 36 51K 1893 8696 3 15 21 0 28 0 4 81 321 1 1 98 - 'dec_osf' => ['vmstat 1 2 | tail -n1', '$11*$12'], - - # gnu (hurd): $7*$8 - # $ vmstat -k 1 2 - # (pagesize: 4, size: 512288, swap size: 894972) - # free actv inact wired zeroed react pgins pgouts pfaults cowpfs hrat caobj cache swfree - # 371940 30844 89228 20276 298348 0 48192 19016 756105 99808 98% 876 20628 894972 - # 371940 30844 89228 20276 +0 +0 +0 +0 +42 +2 98% 876 20628 894972 - 'gnu' => ['vmstat -k 1 2 | tail -n1', '$7*$8'], - - # -nto (qnx has no swap) - #-irix - #-svr5 (scosysv) - ); - my $perlscript = ""; - for my $os (keys %vmstat) { - #q[ { vmstat 1 2 2> /dev/null || vmstat -c 1 2; } | ]. - # q[ awk 'NR!=4{next} NF==17||NF==16{print $7*$8} NF==22{print $21*$22} {exit}' ]; - $vmstat{$os}[1] =~ s/\$/\\\\\\\$/g; # $ => \\\$ - $perlscript .= 'if($^O eq "'.$os.'") { print `'.$vmstat{$os}[0].' | awk "{print ' . - $vmstat{$os}[1] . '}"` }'; - } - $perlscript = "perl -e " . ::shell_quote_scalar($perlscript); - $script = $Global::envvar. " " .$perlscript; - } - return $script; - } -} - -sub too_fast_remote_login { - my $self = shift; - if($self->{'last_login_at'} and $self->{'time_to_login'}) { - # sshd normally allows 10 simultaneous logins - # A login takes time_to_login - # So time_to_login/5 should be safe - # If now <= last_login + time_to_login/5: Then it is too soon. - my $too_fast = (::now() <= $self->{'last_login_at'} - + $self->{'time_to_login'}/5); - ::debug("run", "Too fast? $too_fast "); - return $too_fast; - } else { - # No logins so far (or time_to_login not computed): it is not too fast - return 0; - } -} - -sub last_login_at { - my $self = shift; - return $self->{'last_login_at'}; -} - -sub set_last_login_at { - my $self = shift; - $self->{'last_login_at'} = shift; -} - -sub loadavg_too_high { - my $self = shift; - my $loadavg = $self->loadavg(); - return (not defined $loadavg or - $loadavg > $self->max_loadavg()); -} - -sub loadavg { - # If the currently know loadavg is too old: - # Recompute a new one in the background - # The load average is computed as the number of processes waiting for disk - # or CPU right now. So it is the server load this instant and not averaged over - # several minutes. This is needed so GNU Parallel will at most start one job - # that will push the load over the limit. - # - # Returns: - # $last_loadavg = last load average computed (undef if none) - my $self = shift; - # Should we update the loadavg file? 
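# [Editorial sketch, not part of the removed script: loadavg() above derives
#  an "instantaneous load" by counting processes currently in state D (disk
#  wait) or R (running) in 'ps ax -o state,command' output, minus one for the
#  ps invocation itself, instead of using the kernel's 1-minute average.
#  Roughly, with a simplified state regex:]
use strict;
use warnings;

sub instant_load {
    my $ps = `ps ax -o state,command`;
    # count lines whose state starts with D or R, skipping kernel threads,
    # whose command is shown as [name]
    my $n = () = $ps =~ /^[DR]\S*\s+[^\[]/mg;
    return $n > 0 ? $n - 1 : 0;             # the ps process itself is one of them
}

# printf "instantaneous load: %d\n", instant_load();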
- my $update_loadavg_file = 0; - if(open(my $load_fh, "<", $self->{'loadavg_file'})) { - local $/ = undef; - my $load_out = <$load_fh>; - close $load_fh; - my $load =()= ($load_out=~/(^[DR]....[^\[])/gm); - if($load > 0) { - # load is overestimated by 1 - $self->{'loadavg'} = $load - 1; - ::debug("load", "New loadavg: ", $self->{'loadavg'}); - } else { - ::die_bug("loadavg_invalid_content: $load_out"); - } - ::debug("load", "Last update: ", $self->{'last_loadavg_update'}); - if(time - $self->{'last_loadavg_update'} > 10) { - # last loadavg was started 10 seconds ago - ::debug("load", time - $self->{'last_loadavg_update'}, " secs old: ", - $self->{'loadavg_file'}); - $update_loadavg_file = 1; - } - } else { - ::debug("load", "No loadavg file: ", $self->{'loadavg_file'}); - $self->{'loadavg'} = undef; - $update_loadavg_file = 1; - } - if($update_loadavg_file) { - ::debug("load", "Updating loadavg file", $self->{'loadavg_file'}, "\n"); - $self->{'last_loadavg_update'} = time; - -e $ENV{'HOME'}."/.parallel" or mkdir $ENV{'HOME'}."/.parallel"; - -e $ENV{'HOME'}."/.parallel/tmp" or mkdir $ENV{'HOME'}."/.parallel/tmp"; - my $cmd = ""; - if($self->{'string'} ne ":") { - $cmd = $self->sshcommand() . " " . $self->serverlogin() . " "; - } - # TODO Is is called 'ps ax -o state,command' on other platforms? - $cmd .= "ps ax -o state,command"; - # As the command can take long to run if run remote - # save it to a tmp file before moving it to the correct file - my $file = $self->{'loadavg_file'}; - my ($dummy_fh, $tmpfile) = ::tmpfile(SUFFIX => ".loa"); - qx{ ($cmd > $tmpfile && mv $tmpfile $file || rm $tmpfile) & }; - } - return $self->{'loadavg'}; -} - -sub max_loadavg { - my $self = shift; - # If --load is a file it might be changed - if($Global::max_load_file) { - my $mtime = (stat($Global::max_load_file))[9]; - if($mtime > $Global::max_load_file_last_mod) { - $Global::max_load_file_last_mod = $mtime; - for my $sshlogin (values %Global::host) { - $sshlogin->set_max_loadavg(undef); - } - } - } - if(not defined $self->{'max_loadavg'}) { - $self->{'max_loadavg'} = - $self->compute_max_loadavg($opt::load); - } - ::debug("load", "max_loadavg: ", $self->string(), " ", $self->{'max_loadavg'}); - return $self->{'max_loadavg'}; -} - -sub set_max_loadavg { - my $self = shift; - $self->{'max_loadavg'} = shift; -} - -sub compute_max_loadavg { - # Parse the max loadaverage that the user asked for using --load - # Returns: - # max loadaverage - my $self = shift; - my $loadspec = shift; - my $load; - if(defined $loadspec) { - if($loadspec =~ /^\+(\d+)$/) { - # E.g. --load +2 - my $j = $1; - $load = - $self->ncpus() + $j; - } elsif ($loadspec =~ /^-(\d+)$/) { - # E.g. 
--load -2 - my $j = $1; - $load = - $self->ncpus() - $j; - } elsif ($loadspec =~ /^(\d+)\%$/) { - my $j = $1; - $load = - $self->ncpus() * $j / 100; - } elsif ($loadspec =~ /^(\d+(\.\d+)?)$/) { - $load = $1; - } elsif (-f $loadspec) { - $Global::max_load_file = $loadspec; - $Global::max_load_file_last_mod = (stat($Global::max_load_file))[9]; - if(open(my $in_fh, "<", $Global::max_load_file)) { - my $opt_load_file = join("",<$in_fh>); - close $in_fh; - $load = $self->compute_max_loadavg($opt_load_file); - } else { - print $Global::original_stderr "Cannot open $loadspec\n"; - ::wait_and_exit(255); - } - } else { - print $Global::original_stderr "Parsing of --load failed\n"; - ::die_usage(); - } - if($load < 0.01) { - $load = 0.01; - } - } - return $load; -} - -sub time_to_login { - my $self = shift; - return $self->{'time_to_login'}; -} - -sub set_time_to_login { - my $self = shift; - $self->{'time_to_login'} = shift; -} - -sub max_jobs_running { - my $self = shift; - if(not defined $self->{'max_jobs_running'}) { - my $nproc = $self->compute_number_of_processes($opt::jobs); - $self->set_max_jobs_running($nproc); - } - return $self->{'max_jobs_running'}; -} - -sub orig_max_jobs_running { - my $self = shift; - return $self->{'orig_max_jobs_running'}; -} - -sub compute_number_of_processes { - # Number of processes wanted and limited by system resources - # Returns: - # Number of processes - my $self = shift; - my $opt_P = shift; - my $wanted_processes = $self->user_requested_processes($opt_P); - if(not defined $wanted_processes) { - $wanted_processes = $Global::default_simultaneous_sshlogins; - } - ::debug("load", "Wanted procs: $wanted_processes\n"); - my $system_limit = - $self->processes_available_by_system_limit($wanted_processes); - ::debug("load", "Limited to procs: $system_limit\n"); - return $system_limit; -} - -sub processes_available_by_system_limit { - # If the wanted number of processes is bigger than the system limits: - # Limit them to the system limits - # Limits are: File handles, number of input lines, processes, - # and taking > 1 second to spawn 10 extra processes - # Returns: - # Number of processes - my $self = shift; - my $wanted_processes = shift; - - my $system_limit = 0; - my @jobs = (); - my $job; - my @args = (); - my $arg; - my $more_filehandles = 1; - my $max_system_proc_reached = 0; - my $slow_spawining_warning_printed = 0; - my $time = time; - my %fh; - my @children; - - # Reserve filehandles - # perl uses 7 filehandles for something? - # parallel uses 1 for memory_usage - # parallel uses 4 for ? 
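# [Editorial sketch, not part of the removed script: compute_max_loadavg()
#  and user_requested_processes() above both accept values relative to the
#  CPU count -- '+2', '-2', '50%', or a plain number. A condensed version of
#  that parsing, with ncpus passed in explicitly:]
use strict;
use warnings;

sub resolve_relative_spec {
    my ($spec, $ncpus) = @_;
    return $ncpus + $1       if $spec =~ /^\+(\d+)$/;   # e.g. +2
    return $ncpus - $1       if $spec =~ /^-(\d+)$/;    # e.g. -2
    return $ncpus * $1 / 100 if $spec =~ /^(\d+)%$/;    # e.g. 50%
    return $1                if $spec =~ /^(\d+(\.\d+)?)$/;
    die "Cannot parse spec '$spec'\n";
}

# resolve_relative_spec('+2', 8)  => 10
# resolve_relative_spec('50%', 8) => 4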
- for my $i (1..12) { - open($fh{"init-$i"}, "<", "/dev/null"); - } - - for(1..2) { - # System process limit - my $child; - if($child = fork()) { - push (@children,$child); - $Global::unkilled_children{$child} = 1; - } elsif(defined $child) { - # The child takes one process slot - # It will be killed later - $SIG{TERM} = $Global::original_sig{TERM}; - sleep 10000000; - exit(0); - } else { - $max_system_proc_reached = 1; - } - } - my $count_jobs_already_read = $Global::JobQueue->next_seq(); - my $wait_time_for_getting_args = 0; - my $start_time = time; - while(1) { - $system_limit >= $wanted_processes and last; - not $more_filehandles and last; - $max_system_proc_reached and last; - my $before_getting_arg = time; - if($Global::semaphore or $opt::pipe) { - # Skip: No need to get args - } elsif(defined $opt::retries and $count_jobs_already_read) { - # For retries we may need to run all jobs on this sshlogin - # so include the already read jobs for this sshlogin - $count_jobs_already_read--; - } else { - if($opt::X or $opt::m) { - # The arguments may have to be re-spread over several jobslots - # So pessimistically only read one arg per jobslot - # instead of a full commandline - if($Global::JobQueue->{'commandlinequeue'}->{'arg_queue'}->empty()) { - if($Global::JobQueue->empty()) { - last; - } else { - ($job) = $Global::JobQueue->get(); - push(@jobs, $job); - } - } else { - ($arg) = $Global::JobQueue->{'commandlinequeue'}->{'arg_queue'}->get(); - push(@args, $arg); - } - } else { - # If there are no more command lines, then we have a process - # per command line, so no need to go further - $Global::JobQueue->empty() and last; - ($job) = $Global::JobQueue->get(); - push(@jobs, $job); - } - } - $wait_time_for_getting_args += time - $before_getting_arg; - $system_limit++; - - # Every simultaneous process uses 2 filehandles when grouping - # Every simultaneous process uses 2 filehandles when compressing - $more_filehandles = open($fh{$system_limit*10}, "<", "/dev/null") - && open($fh{$system_limit*10+2}, "<", "/dev/null") - && open($fh{$system_limit*10+3}, "<", "/dev/null") - && open($fh{$system_limit*10+4}, "<", "/dev/null"); - - # System process limit - my $child; - if($child = fork()) { - push (@children,$child); - $Global::unkilled_children{$child} = 1; - } elsif(defined $child) { - # The child takes one process slot - # It will be killed later - $SIG{TERM} = $Global::original_sig{TERM}; - sleep 10000000; - exit(0); - } else { - $max_system_proc_reached = 1; - } - my $forktime = time - $time - $wait_time_for_getting_args; - ::debug("run", "Time to fork $system_limit procs: $wait_time_for_getting_args ", - $forktime, - " (processes so far: ", $system_limit,")\n"); - if($system_limit > 10 and - $forktime > 1 and - $forktime > $system_limit * 0.01 - and not $slow_spawining_warning_printed) { - # It took more than 0.01 second to fork a processes on avg. - # Give the user a warning. He can press Ctrl-C if this - # sucks. - print $Global::original_stderr - ("parallel: Warning: Starting $system_limit processes took > $forktime sec.\n", - "Consider adjusting -j. 
Press CTRL-C to stop.\n"); - $slow_spawining_warning_printed = 1; - } - } - # Cleanup: Close the files - for (values %fh) { close $_ } - # Cleanup: Kill the children - for my $pid (@children) { - kill 9, $pid; - waitpid($pid,0); - delete $Global::unkilled_children{$pid}; - } - # Cleanup: Unget the command_lines or the @args - $Global::JobQueue->{'commandlinequeue'}->{'arg_queue'}->unget(@args); - $Global::JobQueue->unget(@jobs); - if($system_limit < $wanted_processes) { - # The system_limit is less than the wanted_processes - if($system_limit < 1 and not $Global::JobQueue->empty()) { - ::warning("Cannot spawn any jobs. Raising ulimit -u or /etc/security/limits.conf\n", - "or /proc/sys/kernel/pid_max may help.\n"); - ::wait_and_exit(255); - } - if(not $more_filehandles) { - ::warning("Only enough file handles to run ", $system_limit, " jobs in parallel.\n", - "Running 'parallel -j0 -N", $system_limit, " --pipe parallel -j0' or ", - "raising ulimit -n or /etc/security/limits.conf may help.\n"); - } - if($max_system_proc_reached) { - ::warning("Only enough available processes to run ", $system_limit, - " jobs in parallel. Raising ulimit -u or /etc/security/limits.conf\n", - "or /proc/sys/kernel/pid_max may help.\n"); - } - } - if($] == 5.008008 and $system_limit > 1000) { - # https://savannah.gnu.org/bugs/?36942 - $system_limit = 1000; - } - if($Global::JobQueue->empty()) { - $system_limit ||= 1; - } - if($self->string() ne ":" and - $system_limit > $Global::default_simultaneous_sshlogins) { - $system_limit = - $self->simultaneous_sshlogin_limit($system_limit); - } - return $system_limit; -} - -sub simultaneous_sshlogin_limit { - # Test by logging in wanted number of times simultaneously - # Returns: - # min($wanted_processes,$working_simultaneous_ssh_logins-1) - my $self = shift; - my $wanted_processes = shift; - if($self->{'time_to_login'}) { - return $wanted_processes; - } - - # Try twice because it guesses wrong sometimes - # Choose the minimal - my $ssh_limit = - ::min($self->simultaneous_sshlogin($wanted_processes), - $self->simultaneous_sshlogin($wanted_processes)); - if($ssh_limit < $wanted_processes) { - my $serverlogin = $self->serverlogin(); - ::warning("ssh to $serverlogin only allows ", - "for $ssh_limit simultaneous logins.\n", - "You may raise this by changing ", - "/etc/ssh/sshd_config:MaxStartups and MaxSessions on $serverlogin.\n", - "Using only ",$ssh_limit-1," connections ", - "to avoid race conditions.\n"); - } - # Race condition can cause problem if using all sshs. - if($ssh_limit > 1) { $ssh_limit -= 1; } - return $ssh_limit; -} - -sub simultaneous_sshlogin { - # Using $sshlogin try to see if we can do $wanted_processes - # simultaneous logins - # (ssh host echo simultaneouslogin & ssh host echo simultaneouslogin & ...)|grep simul|wc -l - # Returns: - # Number of succesful logins - my $self = shift; - my $wanted_processes = shift; - my $sshcmd = $self->sshcommand(); - my $serverlogin = $self->serverlogin(); - my $sshdelay = $opt::sshdelay ? 
"sleep $opt::sshdelay;" : ""; - my $cmd = "$sshdelay$sshcmd $serverlogin echo simultaneouslogin &1 &"x$wanted_processes; - ::debug("init", "Trying $wanted_processes logins at $serverlogin\n"); - open (my $simul_fh, "-|", "($cmd)|grep simultaneouslogin | wc -l") or - ::die_bug("simultaneouslogin"); - my $ssh_limit = <$simul_fh>; - close $simul_fh; - chomp $ssh_limit; - return $ssh_limit; -} - -sub set_ncpus { - my $self = shift; - $self->{'ncpus'} = shift; -} - -sub user_requested_processes { - # Parse the number of processes that the user asked for using -j - # Returns: - # the number of processes to run on this sshlogin - my $self = shift; - my $opt_P = shift; - my $processes; - if(defined $opt_P) { - if($opt_P =~ /^\+(\d+)$/) { - # E.g. -P +2 - my $j = $1; - $processes = - $self->ncpus() + $j; - } elsif ($opt_P =~ /^-(\d+)$/) { - # E.g. -P -2 - my $j = $1; - $processes = - $self->ncpus() - $j; - } elsif ($opt_P =~ /^(\d+(\.\d+)?)\%$/) { - # E.g. -P 10.5% - my $j = $1; - $processes = - $self->ncpus() * $j / 100; - } elsif ($opt_P =~ /^(\d+)$/) { - $processes = $1; - if($processes == 0) { - # -P 0 = infinity (or at least close) - $processes = $Global::infinity; - } - } elsif (-f $opt_P) { - $Global::max_procs_file = $opt_P; - $Global::max_procs_file_last_mod = (stat($Global::max_procs_file))[9]; - if(open(my $in_fh, "<", $Global::max_procs_file)) { - my $opt_P_file = join("",<$in_fh>); - close $in_fh; - $processes = $self->user_requested_processes($opt_P_file); - } else { - ::error("Cannot open $opt_P.\n"); - ::wait_and_exit(255); - } - } else { - ::error("Parsing of --jobs/-j/--max-procs/-P failed.\n"); - ::die_usage(); - } - $processes = ::ceil($processes); - } - return $processes; -} - -sub ncpus { - my $self = shift; - if(not defined $self->{'ncpus'}) { - my $sshcmd = $self->sshcommand(); - my $serverlogin = $self->serverlogin(); - if($serverlogin eq ":") { - if($opt::use_cpus_instead_of_cores) { - $self->{'ncpus'} = no_of_cpus(); - } else { - $self->{'ncpus'} = no_of_cores(); - } - } else { - my $ncpu; - my $sqe = ::shell_quote_scalar($Global::envvar); - if($opt::use_cpus_instead_of_cores) { - $ncpu = qx(echo|$sshcmd $serverlogin $sqe parallel --number-of-cpus); - } else { - ::debug("init",qq(echo|$sshcmd $serverlogin $sqe parallel --number-of-cores\n)); - $ncpu = qx(echo|$sshcmd $serverlogin $sqe parallel --number-of-cores); - } - chomp $ncpu; - if($ncpu =~ /^\s*[0-9]+\s*$/s) { - $self->{'ncpus'} = $ncpu; - } else { - ::warning("Could not figure out ", - "number of cpus on $serverlogin ($ncpu). 
Using 1.\n"); - $self->{'ncpus'} = 1; - } - } - } - return $self->{'ncpus'}; -} - -sub no_of_cpus { - # Returns: - # Number of physical CPUs - local $/="\n"; # If delimiter is set, then $/ will be wrong - my $no_of_cpus; - if ($^O eq 'linux') { - $no_of_cpus = no_of_cpus_gnu_linux() || no_of_cores_gnu_linux(); - } elsif ($^O eq 'freebsd') { - $no_of_cpus = no_of_cpus_freebsd(); - } elsif ($^O eq 'netbsd') { - $no_of_cpus = no_of_cpus_netbsd(); - } elsif ($^O eq 'openbsd') { - $no_of_cpus = no_of_cpus_openbsd(); - } elsif ($^O eq 'gnu') { - $no_of_cpus = no_of_cpus_hurd(); - } elsif ($^O eq 'darwin') { - $no_of_cpus = no_of_cpus_darwin(); - } elsif ($^O eq 'solaris') { - $no_of_cpus = no_of_cpus_solaris(); - } elsif ($^O eq 'aix') { - $no_of_cpus = no_of_cpus_aix(); - } elsif ($^O eq 'hpux') { - $no_of_cpus = no_of_cpus_hpux(); - } elsif ($^O eq 'nto') { - $no_of_cpus = no_of_cpus_qnx(); - } elsif ($^O eq 'svr5') { - $no_of_cpus = no_of_cpus_openserver(); - } elsif ($^O eq 'irix') { - $no_of_cpus = no_of_cpus_irix(); - } elsif ($^O eq 'dec_osf') { - $no_of_cpus = no_of_cpus_tru64(); - } else { - $no_of_cpus = (no_of_cpus_gnu_linux() - || no_of_cpus_freebsd() - || no_of_cpus_netbsd() - || no_of_cpus_openbsd() - || no_of_cpus_hurd() - || no_of_cpus_darwin() - || no_of_cpus_solaris() - || no_of_cpus_aix() - || no_of_cpus_hpux() - || no_of_cpus_qnx() - || no_of_cpus_openserver() - || no_of_cpus_irix() - || no_of_cpus_tru64() - # Number of cores is better than no guess for #CPUs - || nproc() - ); - } - if($no_of_cpus) { - chomp $no_of_cpus; - return $no_of_cpus; - } else { - ::warning("Cannot figure out number of cpus. Using 1.\n"); - return 1; - } -} - -sub no_of_cores { - # Returns: - # Number of CPU cores - local $/="\n"; # If delimiter is set, then $/ will be wrong - my $no_of_cores; - if ($^O eq 'linux') { - $no_of_cores = no_of_cores_gnu_linux(); - } elsif ($^O eq 'freebsd') { - $no_of_cores = no_of_cores_freebsd(); - } elsif ($^O eq 'netbsd') { - $no_of_cores = no_of_cores_netbsd(); - } elsif ($^O eq 'openbsd') { - $no_of_cores = no_of_cores_openbsd(); - } elsif ($^O eq 'gnu') { - $no_of_cores = no_of_cores_hurd(); - } elsif ($^O eq 'darwin') { - $no_of_cores = no_of_cores_darwin(); - } elsif ($^O eq 'solaris') { - $no_of_cores = no_of_cores_solaris(); - } elsif ($^O eq 'aix') { - $no_of_cores = no_of_cores_aix(); - } elsif ($^O eq 'hpux') { - $no_of_cores = no_of_cores_hpux(); - } elsif ($^O eq 'nto') { - $no_of_cores = no_of_cores_qnx(); - } elsif ($^O eq 'svr5') { - $no_of_cores = no_of_cores_openserver(); - } elsif ($^O eq 'irix') { - $no_of_cores = no_of_cores_irix(); - } elsif ($^O eq 'dec_osf') { - $no_of_cores = no_of_cores_tru64(); - } else { - $no_of_cores = (no_of_cores_gnu_linux() - || no_of_cores_freebsd() - || no_of_cores_netbsd() - || no_of_cores_openbsd() - || no_of_cores_hurd() - || no_of_cores_darwin() - || no_of_cores_solaris() - || no_of_cores_aix() - || no_of_cores_hpux() - || no_of_cores_qnx() - || no_of_cores_openserver() - || no_of_cores_irix() - || no_of_cores_tru64() - || nproc() - ); - } - if($no_of_cores) { - chomp $no_of_cores; - return $no_of_cores; - } else { - ::warning("Cannot figure out number of CPU cores. 
Using 1.\n"); - return 1; - } -} - -sub nproc { - # Returns: - # Number of cores using `nproc` - my $no_of_cores = `nproc 2>/dev/null`; - return $no_of_cores; -} - -sub no_of_cpus_gnu_linux { - # Returns: - # Number of physical CPUs on GNU/Linux - # undef if not GNU/Linux - my $no_of_cpus; - my $no_of_cores; - if(-e "/proc/cpuinfo") { - $no_of_cpus = 0; - $no_of_cores = 0; - my %seen; - open(my $in_fh, "<", "/proc/cpuinfo") || return undef; - while(<$in_fh>) { - if(/^physical id.*[:](.*)/ and not $seen{$1}++) { - $no_of_cpus++; - } - /^processor.*[:]/i and $no_of_cores++; - } - close $in_fh; - } - return ($no_of_cpus||$no_of_cores); -} - -sub no_of_cores_gnu_linux { - # Returns: - # Number of CPU cores on GNU/Linux - # undef if not GNU/Linux - my $no_of_cores; - if(-e "/proc/cpuinfo") { - $no_of_cores = 0; - open(my $in_fh, "<", "/proc/cpuinfo") || return undef; - while(<$in_fh>) { - /^processor.*[:]/i and $no_of_cores++; - } - close $in_fh; - } - return $no_of_cores; -} - -sub no_of_cpus_freebsd { - # Returns: - # Number of physical CPUs on FreeBSD - # undef if not FreeBSD - my $no_of_cpus = - (`sysctl -a dev.cpu 2>/dev/null | grep \%parent | awk '{ print \$2 }' | uniq | wc -l | awk '{ print \$1 }'` - or - `sysctl hw.ncpu 2>/dev/null | awk '{ print \$2 }'`); - chomp $no_of_cpus; - return $no_of_cpus; -} - -sub no_of_cores_freebsd { - # Returns: - # Number of CPU cores on FreeBSD - # undef if not FreeBSD - my $no_of_cores = - (`sysctl hw.ncpu 2>/dev/null | awk '{ print \$2 }'` - or - `sysctl -a hw 2>/dev/null | grep [^a-z]logicalcpu[^a-z] | awk '{ print \$2 }'`); - chomp $no_of_cores; - return $no_of_cores; -} - -sub no_of_cpus_netbsd { - # Returns: - # Number of physical CPUs on NetBSD - # undef if not NetBSD - my $no_of_cpus = `sysctl -n hw.ncpu 2>/dev/null`; - chomp $no_of_cpus; - return $no_of_cpus; -} - -sub no_of_cores_netbsd { - # Returns: - # Number of CPU cores on NetBSD - # undef if not NetBSD - my $no_of_cores = `sysctl -n hw.ncpu 2>/dev/null`; - chomp $no_of_cores; - return $no_of_cores; -} - -sub no_of_cpus_openbsd { - # Returns: - # Number of physical CPUs on OpenBSD - # undef if not OpenBSD - my $no_of_cpus = `sysctl -n hw.ncpu 2>/dev/null`; - chomp $no_of_cpus; - return $no_of_cpus; -} - -sub no_of_cores_openbsd { - # Returns: - # Number of CPU cores on OpenBSD - # undef if not OpenBSD - my $no_of_cores = `sysctl -n hw.ncpu 2>/dev/null`; - chomp $no_of_cores; - return $no_of_cores; -} - -sub no_of_cpus_hurd { - # Returns: - # Number of physical CPUs on HURD - # undef if not HURD - my $no_of_cpus = `nproc`; - chomp $no_of_cpus; - return $no_of_cpus; -} - -sub no_of_cores_hurd { - # Returns: - # Number of physical CPUs on HURD - # undef if not HURD - my $no_of_cores = `nproc`; - chomp $no_of_cores; - return $no_of_cores; -} - -sub no_of_cpus_darwin { - # Returns: - # Number of physical CPUs on Mac Darwin - # undef if not Mac Darwin - my $no_of_cpus = - (`sysctl -n hw.physicalcpu 2>/dev/null` - or - `sysctl -a hw 2>/dev/null | grep [^a-z]physicalcpu[^a-z] | awk '{ print \$2 }'`); - return $no_of_cpus; -} - -sub no_of_cores_darwin { - # Returns: - # Number of CPU cores on Mac Darwin - # undef if not Mac Darwin - my $no_of_cores = - (`sysctl -n hw.logicalcpu 2>/dev/null` - or - `sysctl -a hw 2>/dev/null | grep [^a-z]logicalcpu[^a-z] | awk '{ print \$2 }'`); - return $no_of_cores; -} - -sub no_of_cpus_solaris { - # Returns: - # Number of physical CPUs on Solaris - # undef if not Solaris - if(-x "/usr/sbin/psrinfo") { - my @psrinfo = `/usr/sbin/psrinfo`; - if($#psrinfo >= 0) { - 
return $#psrinfo +1; - } - } - if(-x "/usr/sbin/prtconf") { - my @prtconf = `/usr/sbin/prtconf | grep cpu..instance`; - if($#prtconf >= 0) { - return $#prtconf +1; - } - } - return undef; -} - -sub no_of_cores_solaris { - # Returns: - # Number of CPU cores on Solaris - # undef if not Solaris - if(-x "/usr/sbin/psrinfo") { - my @psrinfo = `/usr/sbin/psrinfo`; - if($#psrinfo >= 0) { - return $#psrinfo +1; - } - } - if(-x "/usr/sbin/prtconf") { - my @prtconf = `/usr/sbin/prtconf | grep cpu..instance`; - if($#prtconf >= 0) { - return $#prtconf +1; - } - } - return undef; -} - -sub no_of_cpus_aix { - # Returns: - # Number of physical CPUs on AIX - # undef if not AIX - my $no_of_cpus = 0; - if(-x "/usr/sbin/lscfg") { - open(my $in_fh, "-|", "/usr/sbin/lscfg -vs |grep proc | wc -l|tr -d ' '") - || return undef; - $no_of_cpus = <$in_fh>; - chomp ($no_of_cpus); - close $in_fh; - } - return $no_of_cpus; -} - -sub no_of_cores_aix { - # Returns: - # Number of CPU cores on AIX - # undef if not AIX - my $no_of_cores; - if(-x "/usr/bin/vmstat") { - open(my $in_fh, "-|", "/usr/bin/vmstat 1 1") || return undef; - while(<$in_fh>) { - /lcpu=([0-9]*) / and $no_of_cores = $1; - } - close $in_fh; - } - return $no_of_cores; -} - -sub no_of_cpus_hpux { - # Returns: - # Number of physical CPUs on HP-UX - # undef if not HP-UX - my $no_of_cpus = - (`/usr/bin/mpsched -s 2>&1 | grep 'Locality Domain Count' | awk '{ print \$4 }'`); - return $no_of_cpus; -} - -sub no_of_cores_hpux { - # Returns: - # Number of CPU cores on HP-UX - # undef if not HP-UX - my $no_of_cores = - (`/usr/bin/mpsched -s 2>&1 | grep 'Processor Count' | awk '{ print \$3 }'`); - return $no_of_cores; -} - -sub no_of_cpus_qnx { - # Returns: - # Number of physical CPUs on QNX - # undef if not QNX - # BUG: It is now known how to calculate this. - my $no_of_cpus = 0; - return $no_of_cpus; -} - -sub no_of_cores_qnx { - # Returns: - # Number of CPU cores on QNX - # undef if not QNX - # BUG: It is now known how to calculate this. 
- my $no_of_cores = 0; - return $no_of_cores; -} - -sub no_of_cpus_openserver { - # Returns: - # Number of physical CPUs on SCO OpenServer - # undef if not SCO OpenServer - my $no_of_cpus = 0; - if(-x "/usr/sbin/psrinfo") { - my @psrinfo = `/usr/sbin/psrinfo`; - if($#psrinfo >= 0) { - return $#psrinfo +1; - } - } - return $no_of_cpus; -} - -sub no_of_cores_openserver { - # Returns: - # Number of CPU cores on SCO OpenServer - # undef if not SCO OpenServer - my $no_of_cores = 0; - if(-x "/usr/sbin/psrinfo") { - my @psrinfo = `/usr/sbin/psrinfo`; - if($#psrinfo >= 0) { - return $#psrinfo +1; - } - } - return $no_of_cores; -} - -sub no_of_cpus_irix { - # Returns: - # Number of physical CPUs on IRIX - # undef if not IRIX - my $no_of_cpus = `hinv | grep HZ | grep Processor | awk '{print \$1}'`; - return $no_of_cpus; -} - -sub no_of_cores_irix { - # Returns: - # Number of CPU cores on IRIX - # undef if not IRIX - my $no_of_cores = `hinv | grep HZ | grep Processor | awk '{print \$1}'`; - return $no_of_cores; -} - -sub no_of_cpus_tru64 { - # Returns: - # Number of physical CPUs on Tru64 - # undef if not Tru64 - my $no_of_cpus = `sizer -pr`; - return $no_of_cpus; -} - -sub no_of_cores_tru64 { - # Returns: - # Number of CPU cores on Tru64 - # undef if not Tru64 - my $no_of_cores = `sizer -pr`; - return $no_of_cores; -} - -sub sshcommand { - my $self = shift; - if (not defined $self->{'sshcommand'}) { - $self->sshcommand_of_sshlogin(); - } - return $self->{'sshcommand'}; -} - -sub serverlogin { - my $self = shift; - if (not defined $self->{'serverlogin'}) { - $self->sshcommand_of_sshlogin(); - } - return $self->{'serverlogin'}; -} - -sub sshcommand_of_sshlogin { - # 'server' -> ('ssh -S /tmp/parallel-ssh-RANDOM/host-','server') - # 'user@server' -> ('ssh','user@server') - # 'myssh user@server' -> ('myssh','user@server') - # 'myssh -l user server' -> ('myssh -l user','server') - # '/usr/bin/myssh -l user server' -> ('/usr/bin/myssh -l user','server') - # Returns: - # sshcommand - defaults to 'ssh' - # login@host - my $self = shift; - my ($sshcmd, $serverlogin); - if($self->{'string'} =~ /(.+) (\S+)$/) { - # Own ssh command - $sshcmd = $1; $serverlogin = $2; - } else { - # Normal ssh - if($opt::controlmaster) { - # Use control_path to make ssh faster - my $control_path = $self->control_path_dir()."/ssh-%r@%h:%p"; - $sshcmd = "ssh -S ".$control_path; - $serverlogin = $self->{'string'}; - if(not $self->{'control_path'}{$control_path}++) { - # Master is not running for this control_path - # Start it - my $pid = fork(); - if($pid) { - $Global::sshmaster{$pid} ||= 1; - } else { - $SIG{'TERM'} = undef; - # Ignore the 'foo' being printed - open(STDOUT,">","/dev/null"); - # OpenSSH_3.6.1p2 gives 'tcgetattr: Invalid argument' with -tt - # STDERR >/dev/null to ignore "process_mux_new_session: tcgetattr: Invalid argument" - open(STDERR,">","/dev/null"); - open(STDIN,"<","/dev/null"); - # Run a sleep that outputs data, so it will discover if the ssh connection closes. 
- my $sleep = ::shell_quote_scalar('$|=1;while(1){sleep 1;print "foo\n"}'); - my @master = ("ssh", "-tt", "-MTS", $control_path, $serverlogin, "perl", "-e", $sleep); - exec(@master); - } - } - } else { - $sshcmd = "ssh"; $serverlogin = $self->{'string'}; - } - } - $self->{'sshcommand'} = $sshcmd; - $self->{'serverlogin'} = $serverlogin; -} - -sub control_path_dir { - # Returns: - # path to directory - my $self = shift; - if(not defined $self->{'control_path_dir'}) { - -e $ENV{'HOME'}."/.parallel" or mkdir $ENV{'HOME'}."/.parallel"; - -e $ENV{'HOME'}."/.parallel/tmp" or mkdir $ENV{'HOME'}."/.parallel/tmp"; - $self->{'control_path_dir'} = - File::Temp::tempdir($ENV{'HOME'} - . "/.parallel/tmp/control_path_dir-XXXX", - CLEANUP => 1); - } - return $self->{'control_path_dir'}; -} - -sub rsync_transfer_cmd { - # Command to run to transfer a file - # Input: - # $file = filename of file to transfer - # $workdir = destination dir - # Returns: - # $cmd = rsync command to run to transfer $file ("" if unreadable) - my $self = shift; - my $file = shift; - my $workdir = shift; - if(not -r $file) { - ::warning($file, " is not readable and will not be transferred.\n"); - return "true"; - } - my $rsync_destdir; - if($file =~ m:^/:) { - # rsync /foo/bar / - $rsync_destdir = "/"; - } else { - $rsync_destdir = ::shell_quote_file($workdir); - } - $file = ::shell_quote_file($file); - my $sshcmd = $self->sshcommand(); - my $rsync_opt = "-rlDzR -e" . ::shell_quote_scalar($sshcmd); - my $serverlogin = $self->serverlogin(); - # Make dir if it does not exist - return "( $sshcmd $serverlogin mkdir -p $rsync_destdir;" . - rsync()." $rsync_opt $file $serverlogin:$rsync_destdir )"; -} - -sub cleanup_cmd { - # Command to run to remove the remote file - # Input: - # $file = filename to remove - # $workdir = destination dir - # Returns: - # $cmd = ssh command to run to remove $file and empty parent dirs - my $self = shift; - my $file = shift; - my $workdir = shift; - my $f = $file; - if($f =~ m:/\./:) { - # foo/bar/./baz/quux => workdir/baz/quux - # /foo/bar/./baz/quux => workdir/baz/quux - $f =~ s:.*/\./:$workdir/:; - } elsif($f =~ m:^[^/]:) { - # foo/bar => workdir/foo/bar - $f = $workdir."/".$f; - } - my @subdirs = split m:/:, ::dirname($f); - my @rmdir; - my $dir = ""; - for(@subdirs) { - $dir .= $_."/"; - unshift @rmdir, ::shell_quote_file($dir); - } - my $rmdir = @rmdir ? "rmdir @rmdir 2>/dev/null;" : ""; - if(defined $opt::workdir and $opt::workdir eq "...") { - $rmdir .= "rm -rf " . ::shell_quote_file($workdir).';'; - } - - $f = ::shell_quote_file($f); - my $sshcmd = $self->sshcommand(); - my $serverlogin = $self->serverlogin(); - return "$sshcmd $serverlogin ".::shell_quote_scalar("(rm -f $f; $rmdir)"); -} - -{ - my $rsync; - - sub rsync { - # rsync 3.1.x uses protocol 31 which is unsupported by 2.5.7. 
- # If the version >= 3.1.0: downgrade to protocol 30 - if(not $rsync) { - my @out = `rsync --version`; - for (@out) { - if(/version (\d+.\d+)(.\d+)?/) { - if($1 >= 3.1) { - # Version 3.1.0 or later: Downgrade to protocol 30 - $rsync = "rsync --protocol 30"; - } else { - $rsync = "rsync"; - } - } - } - $rsync or ::die_bug("Cannot figure out version of rsync: @out"); - } - return $rsync; - } -} - - -package JobQueue; - -sub new { - my $class = shift; - my $commandref = shift; - my $read_from = shift; - my $context_replace = shift; - my $max_number_of_args = shift; - my $return_files = shift; - my $commandlinequeue = CommandLineQueue->new - ($commandref, $read_from, $context_replace, $max_number_of_args, - $return_files); - my @unget = (); - return bless { - 'unget' => \@unget, - 'commandlinequeue' => $commandlinequeue, - 'total_jobs' => undef, - }, ref($class) || $class; -} - -sub get { - my $self = shift; - - if(@{$self->{'unget'}}) { - my $job = shift @{$self->{'unget'}}; - return ($job); - } else { - my $commandline = $self->{'commandlinequeue'}->get(); - if(defined $commandline) { - my $job = Job->new($commandline); - return $job; - } else { - return undef; - } - } -} - -sub unget { - my $self = shift; - unshift @{$self->{'unget'}}, @_; -} - -sub empty { - my $self = shift; - my $empty = (not @{$self->{'unget'}}) - && $self->{'commandlinequeue'}->empty(); - ::debug("run", "JobQueue->empty $empty "); - return $empty; -} - -sub total_jobs { - my $self = shift; - if(not defined $self->{'total_jobs'}) { - my $job; - my @queue; - my $start = time; - while($job = $self->get()) { - if(time - $start > 10) { - ::warning("Reading all arguments takes longer than 10 seconds.\n"); - $opt::eta && ::warning("Consider removing --eta.\n"); - $opt::bar && ::warning("Consider removing --bar.\n"); - last; - } - push @queue, $job; - } - while($job = $self->get()) { - push @queue, $job; - } - - $self->unget(@queue); - $self->{'total_jobs'} = $#queue+1; - } - return $self->{'total_jobs'}; -} - -sub next_seq { - my $self = shift; - - return $self->{'commandlinequeue'}->seq(); -} - -sub quote_args { - my $self = shift; - return $self->{'commandlinequeue'}->quote_args(); -} - - -package Job; - -sub new { - my $class = shift; - my $commandlineref = shift; - return bless { - 'commandline' => $commandlineref, # CommandLine object - 'workdir' => undef, # --workdir - 'stdin' => undef, # filehandle for stdin (used for --pipe) - # filename for writing stdout to (used for --files) - 'remaining' => "", # remaining data not sent to stdin (used for --pipe) - 'datawritten' => 0, # amount of data sent via stdin (used for --pipe) - 'transfersize' => 0, # size of files using --transfer - 'returnsize' => 0, # size of files using --return - 'pid' => undef, - # hash of { SSHLogins => number of times the command failed there } - 'failed' => undef, - 'sshlogin' => undef, - # The commandline wrapped with rsync and ssh - 'sshlogin_wrap' => undef, - 'exitstatus' => undef, - 'exitsignal' => undef, - # Timestamp for timeout if any - 'timeout' => undef, - 'virgin' => 1, - }, ref($class) || $class; -} - -sub replaced { - my $self = shift; - $self->{'commandline'} or ::die_bug("commandline empty"); - return $self->{'commandline'}->replaced(); -} - -sub seq { - my $self = shift; - return $self->{'commandline'}->seq(); -} - -sub slot { - my $self = shift; - return $self->{'commandline'}->slot(); -} - -{ - my($cattail); - - sub cattail { - # Returns: - # $cattail = perl program for: cattail "decompress program" writerpid [file_to_decompress 
or stdin] [file_to_unlink] - if(not $cattail) { - $cattail = q{ - # cat followed by tail. - # If $writerpid dead: finish after this round - use Fcntl; - - $|=1; - - my ($cmd, $writerpid, $read_file, $unlink_file) = @ARGV; - if($read_file) { - open(IN,"<",$read_file) || die("cattail: Cannot open $read_file"); - } else { - *IN = *STDIN; - } - - my $flags; - fcntl(IN, F_GETFL, $flags) || die $!; # Get the current flags on the filehandle - $flags |= O_NONBLOCK; # Add non-blocking to the flags - fcntl(IN, F_SETFL, $flags) || die $!; # Set the flags on the filehandle - open(OUT,"|-",$cmd) || die("cattail: Cannot run $cmd"); - - while(1) { - # clear EOF - seek(IN,0,1); - my $writer_running = kill 0, $writerpid; - $read = sysread(IN,$buf,32768); - if($read) { - # We can unlink the file now: The writer has written something - -e $unlink_file and unlink $unlink_file; - # Blocking print - while($buf) { - my $bytes_written = syswrite(OUT,$buf); - # syswrite may be interrupted by SIGHUP - substr($buf,0,$bytes_written) = ""; - } - # Something printed: Wait less next time - $sleep /= 2; - } else { - if(eof(IN) and not $writer_running) { - # Writer dead: There will never be more to read => exit - exit; - } - # TODO This could probably be done more efficiently using select(2) - # Nothing read: Wait longer before next read - # Up to 30 milliseconds - $sleep = ($sleep < 30) ? ($sleep * 1.001 + 0.01) : ($sleep); - usleep($sleep); - } - } - - sub usleep { - # Sleep this many milliseconds. - my $secs = shift; - select(undef, undef, undef, $secs/1000); - } - }; - $cattail =~ s/#.*//mg; - $cattail =~ s/\s+/ /g; - } - return $cattail; - } -} - -sub openoutputfiles { - # Open files for STDOUT and STDERR - # Set file handles in $self->fh - my $self = shift; - my ($outfhw, $errfhw, $outname, $errname); - if($opt::results) { - my $args_as_dirname = $self->{'commandline'}->args_as_dirname(); - # Output in: prefix/name1/val1/name2/val2/stdout - my $dir = $opt::results."/".$args_as_dirname; - if(eval{ File::Path::mkpath($dir); }) { - # OK - } else { - # mkpath failed: Argument probably too long. 
- # Set $Global::max_file_length, which will keep the individual - # dir names shorter than the max length - max_file_name_length($opt::results); - $args_as_dirname = $self->{'commandline'}->args_as_dirname(); - # prefix/name1/val1/name2/val2/ - $dir = $opt::results."/".$args_as_dirname; - File::Path::mkpath($dir); - } - # prefix/name1/val1/name2/val2/stdout - $outname = "$dir/stdout"; - if(not open($outfhw, "+>", $outname)) { - ::error("Cannot write to `$outname'.\n"); - ::wait_and_exit(255); - } - # prefix/name1/val1/name2/val2/stderr - $errname = "$dir/stderr"; - if(not open($errfhw, "+>", $errname)) { - ::error("Cannot write to `$errname'.\n"); - ::wait_and_exit(255); - } - $self->set_fh(1,"unlink",""); - $self->set_fh(2,"unlink",""); - } elsif(not $opt::ungroup) { - # To group we create temporary files for STDOUT and STDERR - # To avoid the cleanup unlink the files immediately (but keep them open) - if(@Global::tee_jobs) { - # files must be removed when the tee is done - } elsif($opt::files) { - ($outfhw, $outname) = ::tmpfile(SUFFIX => ".par"); - ($errfhw, $errname) = ::tmpfile(SUFFIX => ".par"); - # --files => only remove stderr - $self->set_fh(1,"unlink",""); - $self->set_fh(2,"unlink",$errname); - } else { - ($outfhw, $outname) = ::tmpfile(SUFFIX => ".par"); - ($errfhw, $errname) = ::tmpfile(SUFFIX => ".par"); - $self->set_fh(1,"unlink",$outname); - $self->set_fh(2,"unlink",$errname); - } - } else { - # --ungroup - open($outfhw,">&",$Global::fd{1}) || die; - open($errfhw,">&",$Global::fd{2}) || die; - # File name must be empty as it will otherwise be printed - $outname = ""; - $errname = ""; - $self->set_fh(1,"unlink",$outname); - $self->set_fh(2,"unlink",$errname); - } - # Set writing FD - $self->set_fh(1,'w',$outfhw); - $self->set_fh(2,'w',$errfhw); - $self->set_fh(1,'name',$outname); - $self->set_fh(2,'name',$errname); - if($opt::compress) { - # Send stdout to stdin for $opt::compress_program(1) - # Send stderr to stdin for $opt::compress_program(2) - # cattail get pid: $pid = $self->fh($fdno,'rpid'); - my $cattail = cattail(); - for my $fdno (1,2) { - my $wpid = open(my $fdw,"|-","$opt::compress_program >>". 
- $self->fh($fdno,'name')) || die $?; - $self->set_fh($fdno,'w',$fdw); - $self->set_fh($fdno,'wpid',$wpid); - my $rpid = open(my $fdr, "-|", "perl", "-e", $cattail, - $opt::decompress_program, $wpid, - $self->fh($fdno,'name'),$self->fh($fdno,'unlink')) || die $?; - $self->set_fh($fdno,'r',$fdr); - $self->set_fh($fdno,'rpid',$rpid); - } - } elsif(not $opt::ungroup) { - # Set reading FD if using --group (--ungroup does not need) - for my $fdno (1,2) { - # Re-open the file for reading - # so fdw can be closed separately - # and fdr can be seeked separately (for --line-buffer) - open(my $fdr,"<", $self->fh($fdno,'name')) || - ::die_bug("fdr: Cannot open ".$self->fh($fdno,'name')); - $self->set_fh($fdno,'r',$fdr); - # Unlink if required - $Global::debug or unlink $self->fh($fdno,"unlink"); - } - } - if($opt::linebuffer) { - # Set non-blocking when using --linebuffer - $Global::use{"Fcntl"} ||= eval "use Fcntl qw(:DEFAULT :flock); 1;"; - for my $fdno (1,2) { - my $fdr = $self->fh($fdno,'r'); - my $flags; - fcntl($fdr, &F_GETFL, $flags) || die $!; # Get the current flags on the filehandle - $flags |= &O_NONBLOCK; # Add non-blocking to the flags - fcntl($fdr, &F_SETFL, $flags) || die $!; # Set the flags on the filehandle - } - } -} - -sub max_file_name_length { - # Figure out the max length of a subdir - # TODO and the max total length - # Ext4 = 255,130816 - my $testdir = shift; - - my $upper = 8_000_000; - my $len = 8; - my $dir="x"x$len; - do { - rmdir($testdir."/".$dir); - $len *= 16; - $dir="x"x$len; - } while (mkdir $testdir."/".$dir); - # Then search for the actual max length between $len/16 and $len - my $min = $len/16; - my $max = $len; - while($max-$min > 5) { - # If we are within 5 chars of the exact value: - # it is not worth the extra time to find the exact value - my $test = int(($min+$max)/2); - $dir="x"x$test; - if(mkdir $testdir."/".$dir) { - rmdir($testdir."/".$dir); - $min = $test; - } else { - $max = $test; - } - } - $Global::max_file_length = $min; - return $min; -} - -sub set_fh { - # Set file handle - my ($self, $fd_no, $key, $fh) = @_; - $self->{'fd'}{$fd_no,$key} = $fh; -} - -sub fh { - # Get file handle - my ($self, $fd_no, $key) = @_; - return $self->{'fd'}{$fd_no,$key}; -} - -sub write { - my $self = shift; - my $remaining_ref = shift; - my $stdin_fh = $self->fh(0,"w"); - syswrite($stdin_fh,$$remaining_ref); -} - -sub set_stdin_buffer { - # Copy stdin buffer from $block_ref up to $endpos - # Prepend with $header_ref - # Remove $recstart and $recend if needed - # Input: - # $header_ref = ref to $header to prepend - # $block_ref = ref to $block to pass on - # $endpos = length of $block to pass on - # $recstart = --recstart regexp - # $recend = --recend regexp - # Returns: - # N/A - my $self = shift; - my ($header_ref,$block_ref,$endpos,$recstart,$recend) = @_; - $self->{'stdin_buffer'} = ($self->virgin() ? 
$$header_ref : "").substr($$block_ref,0,$endpos); - if($opt::remove_rec_sep) { - remove_rec_sep(\$self->{'stdin_buffer'},$recstart,$recend); - } - $self->{'stdin_buffer_length'} = length $self->{'stdin_buffer'}; - $self->{'stdin_buffer_pos'} = 0; -} - -sub stdin_buffer_length { - my $self = shift; - return $self->{'stdin_buffer_length'}; -} - -sub remove_rec_sep { - my ($block_ref,$recstart,$recend) = @_; - # Remove record separator - $$block_ref =~ s/$recend$recstart//gos; - $$block_ref =~ s/^$recstart//os; - $$block_ref =~ s/$recend$//os; -} - -sub non_block_write { - my $self = shift; - my $something_written = 0; - use POSIX qw(:errno_h); -# use Fcntl; -# my $flags = ''; - for my $buf (substr($self->{'stdin_buffer'},$self->{'stdin_buffer_pos'})) { - my $in = $self->fh(0,"w"); -# fcntl($in, F_GETFL, $flags) -# or die "Couldn't get flags for HANDLE : $!\n"; -# $flags |= O_NONBLOCK; -# fcntl($in, F_SETFL, $flags) -# or die "Couldn't set flags for HANDLE: $!\n"; - my $rv = syswrite($in, $buf); - if (!defined($rv) && $! == EAGAIN) { - # would block - $something_written = 0; - } elsif ($self->{'stdin_buffer_pos'}+$rv != $self->{'stdin_buffer_length'}) { - # incomplete write - # Remove the written part - $self->{'stdin_buffer_pos'} += $rv; - $something_written = $rv; - } else { - # successfully wrote everything - my $a=""; - $self->set_stdin_buffer(\$a,\$a,"",""); - $something_written = $rv; - } - } - - ::debug("pipe", "Non-block: ", $something_written); - return $something_written; -} - - -sub virgin { - my $self = shift; - return $self->{'virgin'}; -} - -sub set_virgin { - my $self = shift; - $self->{'virgin'} = shift; -} - -sub pid { - my $self = shift; - return $self->{'pid'}; -} - -sub set_pid { - my $self = shift; - $self->{'pid'} = shift; -} - -sub starttime { - # Returns: - # UNIX-timestamp this job started - my $self = shift; - return sprintf("%.3f",$self->{'starttime'}); -} - -sub set_starttime { - my $self = shift; - my $starttime = shift || ::now(); - $self->{'starttime'} = $starttime; -} - -sub runtime { - # Returns: - # Run time in seconds - my $self = shift; - return sprintf("%.3f",int(($self->endtime() - $self->starttime())*1000)/1000); -} - -sub endtime { - # Returns: - # UNIX-timestamp this job ended - # 0 if not ended yet - my $self = shift; - return ($self->{'endtime'} || 0); -} - -sub set_endtime { - my $self = shift; - my $endtime = shift; - $self->{'endtime'} = $endtime; -} - -sub timedout { - # Is the job timedout? - # Input: - # $delta_time = time that the job may run - # Returns: - # True or false - my $self = shift; - my $delta_time = shift; - return time > $self->{'starttime'} + $delta_time; -} - -sub kill { - # Kill the job. - # Send the signals to (grand)*children and pid. - # If no signals: TERM TERM KILL - # Wait 200 ms after each TERM. 
- # Input: - # @signals = signals to send - my $self = shift; - my @signals = @_; - my @family_pids = $self->family_pids(); - # Record this jobs as failed - $self->set_exitstatus(-1); - # Send two TERMs to give time to clean up - ::debug("run", "Kill seq ", $self->seq(), "\n"); - my @send_signals = @signals || ("TERM", "TERM", "KILL"); - for my $signal (@send_signals) { - my $alive = 0; - for my $pid (@family_pids) { - if(kill 0, $pid) { - # The job still running - kill $signal, $pid; - $alive = 1; - } - } - # If a signal was given as input, do not do the sleep below - @signals and next; - - if($signal eq "TERM" and $alive) { - # Wait up to 200 ms between TERMs - but only if any pids are alive - my $sleep = 1; - for (my $sleepsum = 0; kill 0, $family_pids[0] and $sleepsum < 200; - $sleepsum += $sleep) { - $sleep = ::reap_usleep($sleep); - } - } - } -} - -sub family_pids { - # Find the pids with this->pid as (grand)*parent - # Returns: - # @pids = pids of (grand)*children - my $self = shift; - my $pid = $self->pid(); - my @pids; - - my ($children_of_ref, $parent_of_ref, $name_of_ref) = ::pid_table(); - - my @more = ($pid); - # While more (grand)*children - while(@more) { - my @m; - push @pids, @more; - for my $parent (@more) { - if($children_of_ref->{$parent}) { - # add the children of this parent - push @m, @{$children_of_ref->{$parent}}; - } - } - @more = @m; - } - return (@pids); -} - -sub failed { - # return number of times failed for this $sshlogin - # Input: - # $sshlogin - # Returns: - # Number of times failed for $sshlogin - my $self = shift; - my $sshlogin = shift; - return $self->{'failed'}{$sshlogin}; -} - -sub failed_here { - # return number of times failed for the current $sshlogin - # Returns: - # Number of times failed for this sshlogin - my $self = shift; - return $self->{'failed'}{$self->sshlogin()}; -} - -sub add_failed { - # increase the number of times failed for this $sshlogin - my $self = shift; - my $sshlogin = shift; - $self->{'failed'}{$sshlogin}++; -} - -sub add_failed_here { - # increase the number of times failed for the current $sshlogin - my $self = shift; - $self->{'failed'}{$self->sshlogin()}++; -} - -sub reset_failed { - # increase the number of times failed for this $sshlogin - my $self = shift; - my $sshlogin = shift; - delete $self->{'failed'}{$sshlogin}; -} - -sub reset_failed_here { - # increase the number of times failed for this $sshlogin - my $self = shift; - delete $self->{'failed'}{$self->sshlogin()}; -} - -sub min_failed { - # Returns: - # the number of sshlogins this command has failed on - # the minimal number of times this command has failed - my $self = shift; - my $min_failures = - ::min(map { $self->{'failed'}{$_} } keys %{$self->{'failed'}}); - my $number_of_sshlogins_failed_on = scalar keys %{$self->{'failed'}}; - return ($number_of_sshlogins_failed_on,$min_failures); -} - -sub total_failed { - # Returns: - # $total_failures = the number of times this command has failed - my $self = shift; - my $total_failures = 0; - for (values %{$self->{'failed'}}) { - $total_failures += $_; - } - return $total_failures; -} - -sub wrapped { - # Wrap command with: - # * --shellquote - # * --nice - # * --cat - # * --fifo - # * --sshlogin - # * --pipepart (@Global::cat_partials) - # * --pipe - # * --tmux - # The ordering of the wrapping is important: - # * --nice/--cat/--fifo should be done on the remote machine - # * --pipepart/--pipe should be done on the local machine inside --tmux - # Uses: - # $Global::envvar - # $opt::shellquote - # $opt::nice - # 
$Global::shell - # $opt::cat - # $opt::fifo - # @Global::cat_partials - # $opt::pipe - # $opt::tmux - # Returns: - # $self->{'wrapped'} = the command wrapped with the above - my $self = shift; - if(not defined $self->{'wrapped'}) { - my $command = $Global::envvar.$self->replaced(); - if($opt::shellquote) { - # Prepend echo - # and quote twice - $command = "echo " . - ::shell_quote_scalar(::shell_quote_scalar($command)); - } - if($opt::nice) { - # Prepend \nice -n19 $SHELL -c - # and quote. - # The '\' before nice is needed to avoid tcsh's built-in - $command = '\nice'. " -n". $opt::nice. " ". - $Global::shell. " -c ". - ::shell_quote_scalar($command); - } - if($opt::cat) { - # Prepend 'cat > {};' - # Append '_EXIT=$?;(rm {};exit $_EXIT)' - $command = - $self->{'commandline'}->replace_placeholders(["cat > \257<\257>; "], 0, 0). - $command. - $self->{'commandline'}->replace_placeholders( - ["; _EXIT=\$?; rm \257<\257>; exit \$_EXIT"], 0, 0); - } elsif($opt::fifo) { - # Prepend 'mkfifo {}; (' - # Append ') & _PID=$!; cat > {}; wait $_PID; _EXIT=$?;(rm {};exit $_EXIT)' - $command = - $self->{'commandline'}->replace_placeholders(["mkfifo \257<\257>; ("], 0, 0). - $command. - $self->{'commandline'}->replace_placeholders([") & _PID=\$!; cat > \257<\257>; ", - "wait \$_PID; _EXIT=\$?; ", - "rm \257<\257>; exit \$_EXIT"], - 0,0); - } - # Wrap with ssh + tranferring of files - $command = $self->sshlogin_wrap($command); - if(@Global::cat_partials) { - # Prepend: - # < /tmp/foo perl -e 'while(@ARGV) { sysseek(STDIN,shift,0) || die; $left = shift; while($read = sysread(STDIN,$buf, ($left > 32768 ? 32768 : $left))){ $left -= $read; syswrite(STDOUT,$buf); } }' 0 0 0 11 | - $command = (shift @Global::cat_partials). "|". "(". $command. ")"; - } elsif($opt::pipe) { - # Prepend EOF-detector to avoid starting $command if EOF. - # The $tmpfile might exist if run on a remote system - we accept that risk - my ($dummy_fh, $tmpfile) = ::tmpfile(SUFFIX => ".chr"); - # Unlink to avoid leaving files if --dry-run or --sshlogin - unlink $tmpfile; - $command = - # Exit value: - # empty input = true - # some input = exit val from command - qq{ sh -c 'dd bs=1 count=1 of=$tmpfile 2>/dev/null'; }. - qq{ test \! -s "$tmpfile" && rm -f "$tmpfile" && exec true; }. - qq{ (cat $tmpfile; rm $tmpfile; cat - ) | }. - "($command);"; - } - if($opt::tmux) { - # Wrap command with 'tmux' - $command = $self->tmux_wrap($command); - } - $self->{'wrapped'} = $command; - } - return $self->{'wrapped'}; -} - -sub set_sshlogin { - my $self = shift; - my $sshlogin = shift; - $self->{'sshlogin'} = $sshlogin; - delete $self->{'sshlogin_wrap'}; # If sshlogin is changed the wrap is wrong - delete $self->{'wrapped'}; -} - -sub sshlogin { - my $self = shift; - return $self->{'sshlogin'}; -} - -sub sshlogin_wrap { - # Wrap the command with the commands needed to run remotely - # Returns: - # $self->{'sshlogin_wrap'} = command wrapped with ssh+transfer commands - my $self = shift; - my $command = shift; - if(not defined $self->{'sshlogin_wrap'}) { - my $sshlogin = $self->sshlogin(); - my $sshcmd = $sshlogin->sshcommand(); - my $serverlogin = $sshlogin->serverlogin(); - my ($pre,$post,$cleanup)=("","",""); - - if($serverlogin eq ":") { - # No transfer neeeded - $self->{'sshlogin_wrap'} = $command; - } else { - # --transfer - $pre .= $self->sshtransfer(); - # --return - $post .= $self->sshreturn(); - # --cleanup - $post .= $self->sshcleanup(); - if($post) { - # We need to save the exit status of the job - $post = '_EXIT_status=$?; ' . $post . 
' exit $_EXIT_status;'; - } - # If the remote login shell is (t)csh then use 'setenv' - # otherwise use 'export' - # We cannot use parse_env_var(), as PARALLEL_SEQ changes - # for each command - my $parallel_env = - ($Global::envwarn - . q{ 'eval `echo $SHELL | grep "/t\\{0,1\\}csh" > /dev/null } - . q{ && echo setenv PARALLEL_SEQ '$PARALLEL_SEQ'\; } - . q{ setenv PARALLEL_PID '$PARALLEL_PID' } - . q{ || echo PARALLEL_SEQ='$PARALLEL_SEQ'\;export PARALLEL_SEQ\; } - . q{ PARALLEL_PID='$PARALLEL_PID'\;export PARALLEL_PID` ;' }); - my $remote_pre = ""; - my $ssh_options = ""; - if(($opt::pipe or $opt::pipepart) and $opt::ctrlc - or - not ($opt::pipe or $opt::pipepart) and not $opt::noctrlc) { - # TODO Determine if this is needed - # Propagating CTRL-C to kill remote jobs requires - # remote jobs to be run with a terminal. - $ssh_options = "-tt -oLogLevel=quiet"; -# $ssh_options = ""; - # tty - check if we have a tty. - # stty: - # -onlcr - make output 8-bit clean - # isig - pass CTRL-C as signal - # -echo - do not echo input - $remote_pre .= ::shell_quote_scalar('tty >/dev/null && stty isig -onlcr -echo;'); - } - if($opt::workdir) { - my $wd = ::shell_quote_file($self->workdir()); - $remote_pre .= ::shell_quote_scalar("mkdir -p ") . $wd . - ::shell_quote_scalar("; cd ") . $wd . - # exit 255 (instead of exec false) would be the correct thing, - # but that fails on tcsh - ::shell_quote_scalar(qq{ || exec false;}); - } - # This script is to solve the problem of - # * not mixing STDERR and STDOUT - # * terminating with ctrl-c - # It works on Linux but not Solaris - # Finishes on Solaris, but wrong exit code: - # $SIG{CHLD} = sub {exit ($?&127 ? 128+($?&127) : 1+$?>>8)}; - # Hangs on Solaris, but correct exit code on Linux: - # $SIG{CHLD} = sub { $done = 1 }; - # $p->poll; - my $signal_script = "perl -e '". - q{ - use IO::Poll; - $SIG{CHLD} = sub { $done = 1 }; - $p = IO::Poll->new; - $p->mask(STDOUT, POLLHUP); - $pid=fork; unless($pid) {setpgrp; exec $ENV{SHELL}, "-c", @ARGV; die "exec: $!\n"} - $p->poll; - kill SIGHUP, -${pid} unless $done; - wait; exit ($?&127 ? 128+($?&127) : 1+$?>>8) - } . "' "; - $signal_script =~ s/\s+/ /g; - - $self->{'sshlogin_wrap'} = - ($pre - . "$sshcmd $ssh_options $serverlogin $parallel_env " - . $remote_pre -# . ::shell_quote_scalar($signal_script . ::shell_quote_scalar($command)) - . ::shell_quote_scalar($command) - . ";" - . 
$post); - } - } - return $self->{'sshlogin_wrap'}; -} - -sub transfer { - # Files to transfer - # Returns: - # @transfer - File names of files to transfer - my $self = shift; - my @transfer = (); - $self->{'transfersize'} = 0; - if($opt::transfer) { - for my $record (@{$self->{'commandline'}{'arg_list'}}) { - # Merge arguments from records into args - for my $arg (@$record) { - CORE::push @transfer, $arg->orig(); - # filesize - if(-e $arg->orig()) { - $self->{'transfersize'} += (stat($arg->orig()))[7]; - } - } - } - } - return @transfer; -} - -sub transfersize { - my $self = shift; - return $self->{'transfersize'}; -} - -sub sshtransfer { - # Returns for each transfer file: - # rsync $file remote:$workdir - my $self = shift; - my @pre; - my $sshlogin = $self->sshlogin(); - my $workdir = $self->workdir(); - for my $file ($self->transfer()) { - push @pre, $sshlogin->rsync_transfer_cmd($file,$workdir).";"; - } - return join("",@pre); -} - -sub return { - # Files to return - # Non-quoted and with {...} substituted - # Returns: - # @non_quoted_filenames - my $self = shift; - return $self->{'commandline'}-> - replace_placeholders($self->{'commandline'}{'return_files'},0,0); -} - -sub returnsize { - # This is called after the job has finished - # Returns: - # $number_of_bytes transferred in return - my $self = shift; - for my $file ($self->return()) { - if(-e $file) { - $self->{'returnsize'} += (stat($file))[7]; - } - } - return $self->{'returnsize'}; -} - -sub sshreturn { - # Returns for each return-file: - # rsync remote:$workdir/$file . - my $self = shift; - my $sshlogin = $self->sshlogin(); - my $sshcmd = $sshlogin->sshcommand(); - my $serverlogin = $sshlogin->serverlogin(); - my $rsync_opt = "-rlDzR -e".::shell_quote_scalar($sshcmd); - my $pre = ""; - for my $file ($self->return()) { - $file =~ s:^\./::g; # Remove ./ if any - my $relpath = ($file !~ m:^/:); # Is the path relative? - my $cd = ""; - my $wd = ""; - if($relpath) { - # rsync -avR /foo/./bar/baz.c remote:/tmp/ - # == (on old systems) - # rsync -avR --rsync-path="cd /foo; rsync" remote:bar/baz.c /tmp/ - $wd = ::shell_quote_file($self->workdir()."/"); - } - # Only load File::Basename if actually needed - $Global::use{"File::Basename"} ||= eval "use File::Basename; 1;"; - # dir/./file means relative to dir, so remove dir on remote - $file =~ m:(.*)/\./:; - my $basedir = $1 ? ::shell_quote_file($1."/") : ""; - my $nobasedir = $file; - $nobasedir =~ s:.*/\./::; - $cd = ::shell_quote_file(::dirname($nobasedir)); - my $rsync_cd = '--rsync-path='.::shell_quote_scalar("cd $wd$cd; rsync"); - my $basename = ::shell_quote_scalar(::shell_quote_file(basename($file))); - # --return - # mkdir -p /home/tange/dir/subdir/; - # rsync (--protocol 30) -rlDzR --rsync-path="cd /home/tange/dir/subdir/; rsync" - # server:file.gz /home/tange/dir/subdir/ - $pre .= "mkdir -p $basedir$cd; ".$sshlogin->rsync()." $rsync_cd $rsync_opt $serverlogin:". - $basename . 
" ".$basedir.$cd.";"; - } - return $pre; -} - -sub sshcleanup { - # Return the sshcommand needed to remove the file - # Returns: - # ssh command needed to remove files from sshlogin - my $self = shift; - my $sshlogin = $self->sshlogin(); - my $sshcmd = $sshlogin->sshcommand(); - my $serverlogin = $sshlogin->serverlogin(); - my $workdir = $self->workdir(); - my $cleancmd = ""; - - for my $file ($self->cleanup()) { - my @subworkdirs = parentdirs_of($file); - $cleancmd .= $sshlogin->cleanup_cmd($file,$workdir).";"; - } - if(defined $opt::workdir and $opt::workdir eq "...") { - $cleancmd .= "$sshcmd $serverlogin rm -rf " . ::shell_quote_scalar($workdir).';'; - } - return $cleancmd; -} - -sub cleanup { - # Returns: - # Files to remove at cleanup - my $self = shift; - if($opt::cleanup) { - my @transfer = $self->transfer(); - my @return = $self->return(); - return (@transfer,@return); - } else { - return (); - } -} - -sub workdir { - # Returns: - # the workdir on a remote machine - my $self = shift; - if(not defined $self->{'workdir'}) { - my $workdir; - if(defined $opt::workdir) { - if($opt::workdir eq ".") { - # . means current dir - my $home = $ENV{'HOME'}; - eval 'use Cwd'; - my $cwd = cwd(); - $workdir = $cwd; - if($home) { - # If homedir exists: remove the homedir from - # workdir if cwd starts with homedir - # E.g. /home/foo/my/dir => my/dir - # E.g. /tmp/my/dir => /tmp/my/dir - my ($home_dev, $home_ino) = (stat($home))[0,1]; - my $parent = ""; - my @dir_parts = split(m:/:,$cwd); - my $part; - while(defined ($part = shift @dir_parts)) { - $part eq "" and next; - $parent .= "/".$part; - my ($parent_dev, $parent_ino) = (stat($parent))[0,1]; - if($parent_dev == $home_dev and $parent_ino == $home_ino) { - # dev and ino is the same: We found the homedir. - $workdir = join("/",@dir_parts); - last; - } - } - } - if($workdir eq "") { - $workdir = "."; - } - } elsif($opt::workdir eq "...") { - $workdir = ".parallel/tmp/" . ::hostname() . "-" . $$ - . "-" . $self->seq(); - } else { - $workdir = $opt::workdir; - # Rsync treats /./ special. We don't want that - $workdir =~ s:/\./:/:g; # Remove /./ - $workdir =~ s:/+$::; # Remove ending / if any - $workdir =~ s:^\./::g; # Remove starting ./ if any - } - } else { - $workdir = "."; - } - $self->{'workdir'} = ::shell_quote_scalar($workdir); - } - return $self->{'workdir'}; -} - -sub parentdirs_of { - # Return: - # all parentdirs except . of this dir or file - sorted desc by length - my $d = shift; - my @parents = (); - while($d =~ s:/[^/]+$::) { - if($d ne ".") { - push @parents, $d; - } - } - return @parents; -} - -sub start { - # Setup STDOUT and STDERR for a job and start it. - # Returns: - # job-object or undef if job not to run - my $job = shift; - # Get the shell command to be executed (possibly with ssh infront). 
- my $command = $job->wrapped(); - - if($Global::interactive or $Global::stderr_verbose) { - if($Global::interactive) { - print $Global::original_stderr "$command ?..."; - open(my $tty_fh, "<", "/dev/tty") || ::die_bug("interactive-tty"); - my $answer = <$tty_fh>; - close $tty_fh; - my $run_yes = ($answer =~ /^\s*y/i); - if (not $run_yes) { - $command = "true"; # Run the command 'true' - } - } else { - print $Global::original_stderr "$command\n"; - } - } - - my $pid; - $job->openoutputfiles(); - my($stdout_fh,$stderr_fh) = ($job->fh(1,"w"),$job->fh(2,"w")); - local (*IN,*OUT,*ERR); - open OUT, '>&', $stdout_fh or ::die_bug("Can't redirect STDOUT: $!"); - open ERR, '>&', $stderr_fh or ::die_bug("Can't dup STDOUT: $!"); - - if(($opt::dryrun or $Global::verbose) and $opt::ungroup) { - if($Global::verbose <= 1) { - print $stdout_fh $job->replaced(),"\n"; - } else { - # Verbose level > 1: Print the rsync and stuff - print $stdout_fh $command,"\n"; - } - } - if($opt::dryrun) { - $command = "true"; - } - $ENV{'PARALLEL_SEQ'} = $job->seq(); - $ENV{'PARALLEL_PID'} = $$; - ::debug("run", $Global::total_running, " processes . Starting (", - $job->seq(), "): $command\n"); - if($opt::pipe) { - my ($stdin_fh); - # The eval is needed to catch exception from open3 - eval { - $pid = ::open3($stdin_fh, ">&OUT", ">&ERR", $Global::shell, "-c", $command) || - ::die_bug("open3-pipe"); - 1; - }; - $job->set_fh(0,"w",$stdin_fh); - } elsif(@opt::a and not $Global::stdin_in_opt_a and $job->seq() == 1 - and $job->sshlogin()->string() eq ":") { - # Give STDIN to the first job if using -a (but only if running - # locally - otherwise CTRL-C does not work for other jobs Bug#36585) - *IN = *STDIN; - # The eval is needed to catch exception from open3 - eval { - $pid = ::open3("<&IN", ">&OUT", ">&ERR", $Global::shell, "-c", $command) || - ::die_bug("open3-a"); - 1; - }; - # Re-open to avoid complaining - open(STDIN, "<&", $Global::original_stdin) - or ::die_bug("dup-\$Global::original_stdin: $!"); - } elsif ($opt::tty and not $Global::tty_taken and -c "/dev/tty" and - open(my $devtty_fh, "<", "/dev/tty")) { - # Give /dev/tty to the command if no one else is using it - *IN = $devtty_fh; - # The eval is needed to catch exception from open3 - eval { - $pid = ::open3("<&IN", ">&OUT", ">&ERR", $Global::shell, "-c", $command) || - ::die_bug("open3-/dev/tty"); - $Global::tty_taken = $pid; - close $devtty_fh; - 1; - }; - } else { - # The eval is needed to catch exception from open3 - eval { - $pid = ::open3(::gensym, ">&OUT", ">&ERR", $Global::shell, "-c", $command) || - ::die_bug("open3-gensym"); - 1; - }; - } - if($pid) { - # A job was started - $Global::total_running++; - $Global::total_started++; - $job->set_pid($pid); - $job->set_starttime(); - $Global::running{$job->pid()} = $job; - if($opt::timeout) { - $Global::timeoutq->insert($job); - } - $Global::newest_job = $job; - $Global::newest_starttime = ::now(); - return $job; - } else { - # No more processes - ::debug("run", "Cannot spawn more jobs.\n"); - return undef; - } -} - -sub tmux_wrap { - # Wrap command with tmux for session pPID - # Input: - # $actual_command = the actual command being run (incl ssh wrap) - my $self = shift; - my $actual_command = shift; - # Temporary file name. 
Used for fifo to communicate exit val - my ($fh, $tmpfile) = ::tmpfile(SUFFIX => ".tmx"); - $Global::unlink{$tmpfile}=1; - close $fh; - unlink $tmpfile; - my $visual_command = $self->replaced(); - my $title = $visual_command; - # ; causes problems - # ascii 194-245 annoys tmux - $title =~ tr/[\011-\016;\302-\365]//d; - - my $tmux; - if($Global::total_running == 0) { - $tmux = "tmux new-session -s p$$ -d -n ". - ::shell_quote_scalar($title); - print $Global::original_stderr "See output with: tmux attach -t p$$\n"; - } else { - $tmux = "tmux new-window -t p$$ -n ".::shell_quote_scalar($title); - } - return "mkfifo $tmpfile; $tmux ". - # Run in tmux - ::shell_quote_scalar( - "(".$actual_command.');(echo $?$status;echo 255) >'.$tmpfile."&". - "echo ".::shell_quote_scalar($visual_command).";". - "echo \007Job finished at: `date`;sleep 10"). - # Run outside tmux - # Read the first line from the fifo and use that as status code - "; exit `perl -ne 'unlink \$ARGV; 1..1 and print' $tmpfile` "; -} - -sub is_already_in_results { - # Do we already have results for this job? - # Returns: - # $job_already_run = bool whether there is output for this or not - my $job = $_[0]; - my $args_as_dirname = $job->{'commandline'}->args_as_dirname(); - # prefix/name1/val1/name2/val2/ - my $dir = $opt::results."/".$args_as_dirname; - ::debug("run", "Test $dir/stdout", -e "$dir/stdout", "\n"); - return -e "$dir/stdout"; -} - -sub is_already_in_joblog { - my $job = shift; - return vec($Global::job_already_run,$job->seq(),1); -} - -sub set_job_in_joblog { - my $job = shift; - vec($Global::job_already_run,$job->seq(),1) = 1; -} - -sub should_be_retried { - # Should this job be retried? - # Returns - # 0 - do not retry - # 1 - job queued for retry - my $self = shift; - if (not $opt::retries) { - return 0; - } - if(not $self->exitstatus()) { - # Completed with success. If there is a recorded failure: forget it - $self->reset_failed_here(); - return 0 - } else { - # The job failed. Should it be retried? - $self->add_failed_here(); - if($self->total_failed() == $opt::retries) { - # This has been retried enough - return 0; - } else { - # This command should be retried - $self->set_endtime(undef); - $Global::JobQueue->unget($self); - ::debug("run", "Retry ", $self->seq(), "\n"); - return 1; - } - } -} - -sub print { - # Print the output of the jobs - # Returns: N/A - - my $self = shift; - ::debug("print", ">>joboutput ", $self->replaced(), "\n"); - if($opt::dryrun) { - # Nothing was printed to this job: - # cleanup tmp files if --files was set - unlink $self->fh(1,"name"); - } - if($opt::pipe and $self->virgin()) { - # Skip --joblog, --dryrun, --verbose - } else { - if($Global::joblog and defined $self->{'exitstatus'}) { - # Add to joblog when finished - $self->print_joblog(); - } - - # Printing is only relevant for grouped/--line-buffer output. 
- $opt::ungroup and return; - # Check for disk full - exit_if_disk_full(); - - if(($opt::dryrun or $Global::verbose) - and - not $self->{'verbose_printed'}) { - $self->{'verbose_printed'}++; - if($Global::verbose <= 1) { - print STDOUT $self->replaced(),"\n"; - } else { - # Verbose level > 1: Print the rsync and stuff - print STDOUT $self->wrapped(),"\n"; - } - # If STDOUT and STDERR are merged, - # we want the command to be printed first - # so flush to avoid STDOUT being buffered - flush STDOUT; - } - } - for my $fdno (sort { $a <=> $b } keys %Global::fd) { - # Sort by file descriptor numerically: 1,2,3,..,9,10,11 - $fdno == 0 and next; - my $out_fd = $Global::fd{$fdno}; - my $in_fh = $self->fh($fdno,"r"); - if(not $in_fh) { - if(not $Job::file_descriptor_warning_printed{$fdno}++) { - # ::warning("File descriptor $fdno not defined\n"); - } - next; - } - ::debug("print", "File descriptor $fdno (", $self->fh($fdno,"name"), "):"); - if($opt::files) { - # If --compress: $in_fh must be closed first. - close $self->fh($fdno,"w"); - close $in_fh; - if($opt::pipe and $self->virgin()) { - # Nothing was printed to this job: - # cleanup unused tmp files if --files was set - for my $fdno (1,2) { - unlink $self->fh($fdno,"name"); - unlink $self->fh($fdno,"unlink"); - } - } elsif($fdno == 1 and $self->fh($fdno,"name")) { - print $out_fd $self->fh($fdno,"name"),"\n"; - } - } elsif($opt::linebuffer) { - # Line buffered print out - $self->linebuffer_print($fdno,$in_fh,$out_fd); - } else { - my $buf; - close $self->fh($fdno,"w"); - seek $in_fh, 0, 0; - # $in_fh is now ready for reading at position 0 - if($opt::tag or defined $opt::tagstring) { - my $tag = $self->tag(); - if($fdno == 2) { - # OpenSSH_3.6.1p2 gives 'tcgetattr: Invalid argument' with -tt - # This is a crappy way of ignoring it. - while(<$in_fh>) { - if(/^(client_process_control: )?tcgetattr: Invalid argument\n/) { - # Skip - } else { - print $out_fd $tag,$_; - } - # At most run the loop once - last; - } - } - while(<$in_fh>) { - print $out_fd $tag,$_; - } - } else { - my $buf; - if($fdno == 2) { - # OpenSSH_3.6.1p2 gives 'tcgetattr: Invalid argument' with -tt - # This is a crappy way of ignoring it. - sysread($in_fh,$buf,1_000); - $buf =~ s/^(client_process_control: )?tcgetattr: Invalid argument\n//; - print $out_fd $buf; - } - while(sysread($in_fh,$buf,32768)) { - print $out_fd $buf; - } - } - close $in_fh; - } - flush $out_fd; - } - ::debug("print", "<{'partial_line',$fdno}; - - if(defined $self->{'exitstatus'}) { - # If the job is dead: close printing fh. Needed for --compress - close $self->fh($fdno,"w"); - if($opt::compress) { - # Blocked reading in final round - $Global::use{"Fcntl"} ||= eval "use Fcntl qw(:DEFAULT :flock); 1;"; - for my $fdno (1,2) { - my $fdr = $self->fh($fdno,'r'); - my $flags; - fcntl($fdr, &F_GETFL, $flags) || die $!; # Get the current flags on the filehandle - $flags &= ~&O_NONBLOCK; # Remove non-blocking to the flags - fcntl($fdr, &F_SETFL, $flags) || die $!; # Set the flags on the filehandle - } - } - } - # This seek will clear EOF - seek $in_fh, tell($in_fh), 0; - # The read is non-blocking: The $in_fh is set to non-blocking. 
- # 32768 --tag = 5.1s - # 327680 --tag = 4.4s - # 1024000 --tag = 4.4s - # 3276800 --tag = 4.3s - # 32768000 --tag = 4.7s - # 10240000 --tag = 4.3s - while(read($in_fh,substr($$partial,length $$partial),3276800)) { - # Append to $$partial - # Find the last \n - my $i = rindex($$partial,"\n"); - if($i != -1) { - # One or more complete lines were found - if($fdno == 2 and not $self->{'printed_first_line',$fdno}++) { - # OpenSSH_3.6.1p2 gives 'tcgetattr: Invalid argument' with -tt - # This is a crappy way of ignoring it. - $$partial =~ s/^(client_process_control: )?tcgetattr: Invalid argument\n//; - # Length of partial line has changed: Find the last \n again - $i = rindex($$partial,"\n"); - } - if($opt::tag or defined $opt::tagstring) { - # Replace ^ with $tag within the full line - my $tag = $self->tag(); - substr($$partial,0,$i+1) =~ s/^/$tag/gm; - # Length of partial line has changed: Find the last \n again - $i = rindex($$partial,"\n"); - } - # Print up to and including the last \n - print $out_fd substr($$partial,0,$i+1); - # Remove the printed part - substr($$partial,0,$i+1)=""; - } - } - if(defined $self->{'exitstatus'}) { - # If the job is dead: print the remaining partial line - # read remaining - if($$partial and ($opt::tag or defined $opt::tagstring)) { - my $tag = $self->tag(); - $$partial =~ s/^/$tag/gm; - } - print $out_fd $$partial; - # Release the memory - $$partial = undef; - if($self->fh($fdno,"rpid") and CORE::kill 0, $self->fh($fdno,"rpid")) { - # decompress still running - } else { - # decompress done: close fh - close $in_fh; - } - } -} - -sub print_joblog { - my $self = shift; - my $cmd; - if($Global::verbose <= 1) { - $cmd = $self->replaced(); - } else { - # Verbose level > 1: Print the rsync and stuff - $cmd = "@command"; - } - print $Global::joblog - join("\t", $self->seq(), $self->sshlogin()->string(), - $self->starttime(), sprintf("%10.3f",$self->runtime()), - $self->transfersize(), $self->returnsize(), - $self->exitstatus(), $self->exitsignal(), $cmd - ). "\n"; - flush $Global::joblog; - $self->set_job_in_joblog(); -} - -sub tag { - my $self = shift; - if(not defined $self->{'tag'}) { - $self->{'tag'} = $self->{'commandline'}-> - replace_placeholders([$opt::tagstring],0,0)."\t"; - } - return $self->{'tag'}; -} - -sub hostgroups { - my $self = shift; - if(not defined $self->{'hostgroups'}) { - $self->{'hostgroups'} = $self->{'commandline'}->{'arg_list'}[0][0]->{'hostgroups'}; - } - return @{$self->{'hostgroups'}}; -} - -sub exitstatus { - my $self = shift; - return $self->{'exitstatus'}; -} - -sub set_exitstatus { - my $self = shift; - my $exitstatus = shift; - if($exitstatus) { - # Overwrite status if non-zero - $self->{'exitstatus'} = $exitstatus; - } else { - # Set status but do not overwrite - # Status may have been set by --timeout - $self->{'exitstatus'} ||= $exitstatus; - } -} - -sub exitsignal { - my $self = shift; - return $self->{'exitsignal'}; -} - -sub set_exitsignal { - my $self = shift; - my $exitsignal = shift; - $self->{'exitsignal'} = $exitsignal; -} - -{ - my ($disk_full_fh, $b8193, $name); - sub exit_if_disk_full { - # Checks if $TMPDIR is full by writing 8kb to a tmpfile - # If the disk is full: Exit immediately. 
- # Returns: - # N/A - if(not $disk_full_fh) { - ($disk_full_fh, $name) = ::tmpfile(SUFFIX => ".df"); - unlink $name; - $b8193 = "x"x8193; - } - # Linux does not discover if a disk is full if writing <= 8192 - # Tested on: - # bfs btrfs cramfs ext2 ext3 ext4 ext4dev jffs2 jfs minix msdos - # ntfs reiserfs tmpfs ubifs vfat xfs - # TODO this should be tested on different OS similar to this: - # - # doit() { - # sudo mount /dev/ram0 /mnt/loop; sudo chmod 1777 /mnt/loop - # seq 100000 | parallel --tmpdir /mnt/loop/ true & - # seq 6900000 > /mnt/loop/i && echo seq OK - # seq 6980868 > /mnt/loop/i - # seq 10000 > /mnt/loop/ii - # sleep 3 - # sudo umount /mnt/loop/ || sudo umount -l /mnt/loop/ - # echo >&2 - # } - print $disk_full_fh $b8193; - if(not $disk_full_fh - or - tell $disk_full_fh == 0) { - ::error("Output is incomplete. Cannot append to buffer file in $ENV{'TMPDIR'}. Is the disk full?\n"); - ::error("Change \$TMPDIR with --tmpdir or use --compress.\n"); - ::wait_and_exit(255); - } - truncate $disk_full_fh, 0; - seek($disk_full_fh, 0, 0) || die; - } -} - - -package CommandLine; - -sub new { - my $class = shift; - my $seq = shift; - my $commandref = shift; - $commandref || die; - my $arg_queue = shift; - my $context_replace = shift; - my $max_number_of_args = shift; # for -N and normal (-n1) - my $return_files = shift; - my $replacecount_ref = shift; - my $len_ref = shift; - my %replacecount = %$replacecount_ref; - my %len = %$len_ref; - for (keys %$replacecount_ref) { - # Total length of this replacement string {} replaced with all args - $len{$_} = 0; - } - return bless { - 'command' => $commandref, - 'seq' => $seq, - 'len' => \%len, - 'arg_list' => [], - 'arg_queue' => $arg_queue, - 'max_number_of_args' => $max_number_of_args, - 'replacecount' => \%replacecount, - 'context_replace' => $context_replace, - 'return_files' => $return_files, - 'replaced' => undef, - }, ref($class) || $class; -} - -sub seq { - my $self = shift; - return $self->{'seq'}; -} - -{ - my $max_slot_number; - - sub slot { - # Find the number of a free job slot and return it - # Uses: - # @Global::slots - # Returns: - # $jobslot = number of jobslot - my $self = shift; - if(not $self->{'slot'}) { - if(not @Global::slots) { - # $Global::max_slot_number will typically be $Global::max_jobs_running - push @Global::slots, ++$max_slot_number; - } - $self->{'slot'} = shift @Global::slots; - } - return $self->{'slot'}; - } -} - -sub populate { - # Add arguments from arg_queue until the number of arguments or - # max line length is reached - # Uses: - # $Global::minimal_command_line_length - # $opt::cat - # $opt::fifo - # $Global::JobQueue - # $opt::m - # $opt::X - # $CommandLine::already_spread - # $Global::max_jobs_running - # Returns: N/A - my $self = shift; - my $next_arg; - my $max_len = $Global::minimal_command_line_length || Limits::Command::max_length(); - - if($opt::cat or $opt::fifo) { - # Generate a tempfile name that will be used as {} - my($outfh,$name) = ::tmpfile(SUFFIX => ".pip"); - close $outfh; - # Unlink is needed if: ssh otheruser@localhost - unlink $name; - $Global::JobQueue->{'commandlinequeue'}->{'arg_queue'}->unget([Arg->new($name)]); - } - - while (not $self->{'arg_queue'}->empty()) { - $next_arg = $self->{'arg_queue'}->get(); - if(not defined $next_arg) { - next; - } - $self->push($next_arg); - if($self->len() >= $max_len) { - # Command length is now > max_length - # If there are arguments: remove the last - # If there are no arguments: Error - # TODO stuff about -x opt_x - if($self->number_of_args() 
> 1) { - # There is something to work on - $self->{'arg_queue'}->unget($self->pop()); - last; - } else { - my $args = join(" ", map { $_->orig() } @$next_arg); - ::error("Command line too long (", - $self->len(), " >= ", - $max_len, - ") at number ", - $self->{'arg_queue'}->arg_number(), - ": ". - (substr($args,0,50))."...\n"); - $self->{'arg_queue'}->unget($self->pop()); - ::wait_and_exit(255); - } - } - - if(defined $self->{'max_number_of_args'}) { - if($self->number_of_args() >= $self->{'max_number_of_args'}) { - last; - } - } - } - if(($opt::m or $opt::X) and not $CommandLine::already_spread - and $self->{'arg_queue'}->empty() and $Global::max_jobs_running) { - # -m or -X and EOF => Spread the arguments over all jobslots - # (unless they are already spread) - $CommandLine::already_spread ||= 1; - if($self->number_of_args() > 1) { - $self->{'max_number_of_args'} = - ::ceil($self->number_of_args()/$Global::max_jobs_running); - $Global::JobQueue->{'commandlinequeue'}->{'max_number_of_args'} = - $self->{'max_number_of_args'}; - $self->{'arg_queue'}->unget($self->pop_all()); - while($self->number_of_args() < $self->{'max_number_of_args'}) { - $self->push($self->{'arg_queue'}->get()); - } - } - } -} - -sub push { - # Add one or more records as arguments - # Returns: N/A - my $self = shift; - my $record = shift; - push @{$self->{'arg_list'}}, $record; - - my $quote_arg = $Global::noquote ? 0 : not $Global::quoting; - my $rep; - for my $arg (@$record) { - if(defined $arg) { - for my $perlexpr (keys %{$self->{'replacecount'}}) { - # 50% faster than below - $self->{'len'}{$perlexpr} += length $arg->replace($perlexpr,$quote_arg,$self); - # $rep = $arg->replace($perlexpr,$quote_arg,$self); - # $self->{'len'}{$perlexpr} += length $rep; - # ::debug("length", "Length: ", length $rep, - # "(", $perlexpr, "=>", $rep, ")\n"); - } - } - } -} - -sub pop { - # Remove last argument - # Returns: - # the last record - my $self = shift; - my $record = pop @{$self->{'arg_list'}}; - my $quote_arg = $Global::noquote ? 0 : not $Global::quoting; - for my $arg (@$record) { - if(defined $arg) { - for my $perlexpr (keys %{$self->{'replacecount'}}) { - $self->{'len'}{$perlexpr} -= - length $arg->replace($perlexpr,$quote_arg,$self); - } - } - } - return $record; -} - -sub pop_all { - # Remove all arguments and zeros the length of replacement strings - # Returns: - # all records - my $self = shift; - my @popped = @{$self->{'arg_list'}}; - for my $replacement_string (keys %{$self->{'replacecount'}}) { - $self->{'len'}{$replacement_string} = 0; - } - $self->{'arg_list'} = []; - return @popped; -} - -sub number_of_args { - # The number of records - # Returns: - # number of records - my $self = shift; - # Ftq rudef oaawuq ime dqxqmeqp az 2011-01-24 mzp ime iaz nk MQhmd - # Mdzrvadp Nvmdymeaz az 2011-04-10. Ftue oaawuq dqxqmeqp az - # 2013-08-18 ue m nuf tmdpqd me kag tmhq fa geq daf14. Bxqmeq - # qymux oaawuq@fmzsq.pw itqz kag dqmp ftue. - # - # U my ftq ymuzfmuzqd ar m buqoq ar rdqq earfimdq omxxqp SZG - # Bmdmxxqx. Rdqq earfimdq sgmdmzfqqe kag mooqee fa ftq eagdoq - # oapq, ngf U tmhq nqqz iazpqduzs tai ymzk mofgmxxk _dqmp_ ftq - # eagdoq oapq. - # - # Fa fqef ftue U bgf uz m oayyqzf fqxxuzs bqabxq fa qymux yq itqz - # ftqk dqmp ftue. Ftq oayyqzf ime bgf uz m eqofuaz ar ftq oapq - # ftmf za azq iagxp xaaw fa ruj ad uybdahq ftq earfimdq - ea ftq - # eagdoq oapq qcguhmxqzf fa m pgefk oadzqd. 
Fa ymwq egdq ftq - # oayyqzf iagxp zaf etai gb ur eayq azq vgef sdqbbqp ftdagst ftq - # eagdoq oapq U daf13'qp ftq eagdoq oapq - # tffb://qz.iuwubqpum.ads/iuwu/DAF13 - # - # 2.5 yazfte xmfqd U dqoquhqp mz qymux rday eayqazq ita zaf azxk - # ymzmsqp fa ruzp ftq oayyqzf, ngf mxea ymzmsqp fa sgqee ftq oapq - # tmp fa nq daf13'qp. - # - # Ftue nduzse yq fa ftq oazoxgeuaz ftmf ftqdq _mdq_ bqabxq, ita - # mdq zaf mrruxumfqp iuft ftq bdavqof, ftmf iuxx dqmp ftq eagdoq - # oapq - ftagst uf ymk zaf tmbbqz hqdk arfqz. - # - # This is really the number of records - return $#{$self->{'arg_list'}}+1; -} - -sub number_of_recargs { - # The number of args in records - # Returns: - # number of args records - my $self = shift; - my $sum = 0; - my $nrec = scalar @{$self->{'arg_list'}}; - if($nrec) { - $sum = $nrec * (scalar @{$self->{'arg_list'}[0]}); - } - return $sum; -} - -sub args_as_string { - # Returns: - # all unmodified arguments joined with ' ' (similar to {}) - my $self = shift; - return (join " ", map { $_->orig() } - map { @$_ } @{$self->{'arg_list'}}); -} - -sub args_as_dirname { - # Returns: - # all unmodified arguments joined with '/' (similar to {}) - # \t \0 \\ and / are quoted as: \t \0 \\ \_ - # If $Global::max_file_length: Keep subdirs < $Global::max_file_length - my $self = shift; - my @res = (); - - for my $rec_ref (@{$self->{'arg_list'}}) { - # If headers are used, sort by them. - # Otherwise keep the order from the command line. - my @header_indexes_sorted = header_indexes_sorted($#$rec_ref+1); - for my $n (@header_indexes_sorted) { - CORE::push(@res, - $Global::input_source_header{$n}, - map { my $s = $_; - # \t \0 \\ and / are quoted as: \t \0 \\ \_ - $s =~ s/\\/\\\\/g; - $s =~ s/\t/\\t/g; - $s =~ s/\0/\\0/g; - $s =~ s:/:\\_:g; - if($Global::max_file_length) { - # Keep each subdir shorter than the longest - # allowed file name - $s = substr($s,0,$Global::max_file_length); - } - $s; } - $rec_ref->[$n-1]->orig()); - } - } - return join "/", @res; -} - -sub header_indexes_sorted { - # Sort headers first by number then by name. - # E.g.: 1a 1b 11a 11b - # Returns: - # Indexes of %Global::input_source_header sorted - my $max_col = shift; - - no warnings 'numeric'; - for my $col (1 .. $max_col) { - # Make sure the header is defined. If it is not: use column number - if(not defined $Global::input_source_header{$col}) { - $Global::input_source_header{$col} = $col; - } - } - my @header_indexes_sorted = sort { - # Sort headers numerically then asciibetically - $Global::input_source_header{$a} <=> $Global::input_source_header{$b} - or - $Global::input_source_header{$a} cmp $Global::input_source_header{$b} - } 1 .. 
$max_col; - return @header_indexes_sorted; -} - -sub len { - # Uses: - # $opt::shellquote - # The length of the command line with args substituted - my $self = shift; - my $len = 0; - # Add length of the original command with no args - # Length of command w/ all replacement args removed - $len += $self->{'len'}{'noncontext'} + @{$self->{'command'}} -1; - ::debug("length", "noncontext + command: $len\n"); - my $recargs = $self->number_of_recargs(); - if($self->{'context_replace'}) { - # Context is duplicated for each arg - $len += $recargs * $self->{'len'}{'context'}; - for my $replstring (keys %{$self->{'replacecount'}}) { - # If the replacements string is more than once: mulitply its length - $len += $self->{'len'}{$replstring} * - $self->{'replacecount'}{$replstring}; - ::debug("length", $replstring, " ", $self->{'len'}{$replstring}, "*", - $self->{'replacecount'}{$replstring}, "\n"); - } - # echo 11 22 33 44 55 66 77 88 99 1010 - # echo 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10 - # 5 + ctxgrp*arg - ::debug("length", "Ctxgrp: ", $self->{'len'}{'contextgroups'}, - " Groups: ", $self->{'len'}{'noncontextgroups'}, "\n"); - # Add space between context groups - $len += ($recargs-1) * ($self->{'len'}{'contextgroups'}); - } else { - # Each replacement string may occur several times - # Add the length for each time - $len += 1*$self->{'len'}{'context'}; - ::debug("length", "context+noncontext + command: $len\n"); - for my $replstring (keys %{$self->{'replacecount'}}) { - # (space between regargs + length of replacement) - # * number this replacement is used - $len += ($recargs -1 + $self->{'len'}{$replstring}) * - $self->{'replacecount'}{$replstring}; - } - } - if($opt::nice) { - # Pessimistic length if --nice is set - # Worse than worst case: every char needs to be quoted with \ - $len *= 2; - } - if($Global::quoting) { - # Pessimistic length if -q is set - # Worse than worst case: every char needs to be quoted with \ - $len *= 2; - } - if($opt::shellquote) { - # Pessimistic length if --shellquote is set - # Worse than worst case: every char needs to be quoted with \ twice - $len *= 4; - } - # If we are using --env, add the prefix for that, too. - $len += $Global::envvarlen; - - return $len; -} - -sub replaced { - # Uses: - # $Global::noquote - # $Global::quoting - # Returns: - # $replaced = command with place holders replaced and prepended - my $self = shift; - if(not defined $self->{'replaced'}) { - # Don't quote arguments if the input is the full command line - my $quote_arg = $Global::noquote ? 0 : not $Global::quoting; - $self->{'replaced'} = $self->replace_placeholders($self->{'command'},$Global::quoting,$quote_arg); - my $len = length $self->{'replaced'}; - if ($len != $self->len()) { - ::debug("length", $len, " != ", $self->len(), " ", $self->{'replaced'}, "\n"); - } else { - ::debug("length", $len, " == ", $self->len(), " ", $self->{'replaced'}, "\n"); - } - } - return $self->{'replaced'}; -} - -sub replace_placeholders { - # Replace foo{}bar with fooargbar - # Input: - # $targetref = command as shell words - # $quote = should everything be quoted? - # $quote_arg = should replaced arguments be quoted? 
- # Returns: - # @target with placeholders replaced - my $self = shift; - my $targetref = shift; - my $quote = shift; - my $quote_arg = shift; - my $context_replace = $self->{'context_replace'}; - my @target = @$targetref; - ::debug("replace", "Replace @target\n"); - # -X = context replace - # maybe multiple input sources - # maybe --xapply - if(not @target) { - # @target is empty: Return empty array - return @target; - } - # Fish out the words that have replacement strings in them - my %word; - for (@target) { - my $tt = $_; - ::debug("replace", "Target: $tt"); - # a{1}b{}c{}d - # a{=1 $_=$_ =}b{= $_=$_ =}c{= $_=$_ =}d - # a\257<1 $_=$_ \257>b\257< $_=$_ \257>c\257< $_=$_ \257>d - # A B C => aAbA B CcA B Cd - # -X A B C => aAbAcAd aAbBcBd aAbCcCd - - if($context_replace) { - while($tt =~ s/([^\s\257]* # before {= - (?: - \257< # {= - [^\257]*? # The perl expression - \257> # =} - [^\s\257]* # after =} - )+)/ /x) { - # $1 = pre \257 perlexpr \257 post - $word{"$1"} ||= 1; - } - } else { - while($tt =~ s/( (?: \257<([^\257]*?)\257>) )//x) { - # $f = \257 perlexpr \257 - $word{$1} ||= 1; - } - } - } - my @word = keys %word; - - my %replace; - my @arg; - for my $record (@{$self->{'arg_list'}}) { - # $self->{'arg_list'} = [ [Arg11, Arg12], [Arg21, Arg22], [Arg31, Arg32] ] - # Merge arg-objects from records into @arg for easy access - CORE::push @arg, @$record; - } - # Add one arg if empty to allow {#} and {%} to be computed only once - if(not @arg) { @arg = (Arg->new("")); } - # Number of arguments - used for positional arguments - my $n = $#_+1; - - # This is actually a CommandLine-object, - # but it looks nice to be able to say {= $job->slot() =} - my $job = $self; - for my $word (@word) { - # word = AB \257< perlexpr \257> CD \257< perlexpr \257> EF - my $w = $word; - ::debug("replace", "Replacing in $w\n"); - - # Replace positional arguments - $w =~ s< ([^\s\257]*) # before {= - \257< # {= - (-?\d+) # Position (eg. -2 or 3) - ([^\257]*?) # The perl expression - \257> # =} - ([^\s\257]*) # after =} - > - { $1. # Context (pre) - ( - $arg[$2 > 0 ? $2-1 : $n+$2] ? # If defined: replace - $arg[$2 > 0 ? $2-1 : $n+$2]->replace($3,$quote_arg,$self) - : "") - .$4 }egx;# Context (post) - ::debug("replace", "Positional replaced $word with: $w\n"); - - if($w !~ /\257/) { - # No more replacement strings in $w: No need to do more - if($quote) { - CORE::push(@{$replace{::shell_quote($word)}}, $w); - } else { - CORE::push(@{$replace{$word}}, $w); - } - next; - } - # for each arg: - # compute replacement for each string - # replace replacement strings with replacement in the word value - # push to replace word value - ::debug("replace", "Positional done: $w\n"); - for my $arg (@arg) { - my $val = $w; - my $number_of_replacements = 0; - for my $perlexpr (keys %{$self->{'replacecount'}}) { - # Replace {= perl expr =} with value for each arg - $number_of_replacements += - $val =~ s{\257<\Q$perlexpr\E\257>} - {$arg ? 
$arg->replace($perlexpr,$quote_arg,$self) : ""}eg; - } - my $ww = $word; - if($quote) { - $ww = ::shell_quote_scalar($word); - $val = ::shell_quote_scalar($val); - } - if($number_of_replacements) { - CORE::push(@{$replace{$ww}}, $val); - } - } - } - - if($quote) { - @target = ::shell_quote(@target); - } - # ::debug("replace", "%replace=",::my_dump(%replace),"\n"); - if(%replace) { - # Substitute the replace strings with the replacement values - # Must be sorted by length if a short word is a substring of a long word - my $regexp = join('|', map { my $s = $_; $s =~ s/(\W)/\\$1/g; $s } - sort { length $b <=> length $a } keys %replace); - for(@target) { - s/($regexp)/join(" ",@{$replace{$1}})/ge; - } - } - ::debug("replace", "Return @target\n"); - return wantarray ? @target : "@target"; -} - - -package CommandLineQueue; - -sub new { - my $class = shift; - my $commandref = shift; - my $read_from = shift; - my $context_replace = shift; - my $max_number_of_args = shift; - my $return_files = shift; - my @unget = (); - my ($count,%replacecount,$posrpl,$perlexpr,%len); - my @command = @$commandref; - # If the first command start with '-' it is probably an option - if($command[0] =~ /^\s*(-\S+)/) { - # Is this really a command in $PATH starting with '-'? - my $cmd = $1; - if(not ::which($cmd)) { - ::error("Command ($cmd) starts with '-'. Is this a wrong option?\n"); - ::wait_and_exit(255); - } - } - # Replace replacement strings with {= perl expr =} - # Protect matching inside {= perl expr =} - # by replacing {= and =} with \257< and \257> - for(@command) { - if(/\257/) { - ::error("Command cannot contain the character \257. Use a function for that.\n"); - ::wait_and_exit(255); - } - s/\Q$Global::parensleft\E(.*?)\Q$Global::parensright\E/\257<$1\257>/gx; - } - for my $rpl (keys %Global::rpl) { - # Replace the short hand string with the {= perl expr =} in $command and $opt::tagstring - # Avoid replacing inside existing {= perl expr =} - for(@command,@Global::ret_files) { - while(s/((^|\257>)[^\257]*?) # Don't replace after \257 unless \257> - \Q$rpl\E/$1\257<$Global::rpl{$rpl}\257>/xg) { - } - } - if(defined $opt::tagstring) { - for($opt::tagstring) { - while(s/((^|\257>)[^\257]*?) 
# Don't replace after \257 unless \257> - \Q$rpl\E/$1\257<$Global::rpl{$rpl}\257>/x) {} - } - } - # Do the same for the positional replacement strings - # A bit harder as we have to put in the position number - $posrpl = $rpl; - if($posrpl =~ s/^\{//) { - # Only do this if the shorthand start with { - for(@command,@Global::ret_files) { - s/\{(-?\d+)\Q$posrpl\E/\257<$1 $Global::rpl{$rpl}\257>/g; - } - if(defined $opt::tagstring) { - $opt::tagstring =~ s/\{(-?\d+)\Q$posrpl\E/\257<$1 $perlexpr\257>/g; - } - } - } - my $sum = 0; - while($sum == 0) { - # Count how many times each replacement string is used - my @cmd = @command; - my $contextlen = 0; - my $noncontextlen = 0; - my $contextgroups = 0; - for my $c (@cmd) { - while($c =~ s/ \257<([^\257]*?)\257> /\000/x) { - # %replacecount = { "perlexpr" => number of times seen } - # e.g { "$_++" => 2 } - $replacecount{$1} ++; - $sum++; - } - # Measure the length of the context around the {= perl expr =} - # Use that {=...=} has been replaced with \000 above - # So there is no need to deal with \257< - while($c =~ s/ (\S*\000\S*) //x) { - my $w = $1; - $w =~ tr/\000//d; # Remove all \000's - $contextlen += length($w); - $contextgroups++; - } - # All {= perl expr =} have been removed: The rest is non-context - $noncontextlen += length $c; - } - if($opt::tagstring) { - my $t = $opt::tagstring; - while($t =~ s/ \257<([^\257]*)\257> //x) { - # %replacecount = { "perlexpr" => number of times seen } - # e.g { "$_++" => 2 } - # But for tagstring we just need to mark it as seen - $replacecount{$1}||=1; - } - } - - $len{'context'} = 0+$contextlen; - $len{'noncontext'} = $noncontextlen; - $len{'contextgroups'} = $contextgroups; - $len{'noncontextgroups'} = @cmd-$contextgroups; - ::debug("length", "@command Context: ", $len{'context'}, - " Non: ", $len{'noncontext'}, " Ctxgrp: ", $len{'contextgroups'}, - " NonCtxGrp: ", $len{'noncontextgroups'}, "\n"); - if($sum == 0) { - # Default command = {} - # If not replacement string: append {} - if(not @command) { - @command = ("\257<\257>"); - $Global::noquote = 1; - } elsif(($opt::pipe or $opt::pipepart) - and not $opt::fifo and not $opt::cat) { - # With --pipe / --pipe-part you can have no replacement - last; - } else { - # Append {} to the command if there are no {...}'s and no {=...=} - push @command, ("\257<\257>"); - } - } - } - - return bless { - 'unget' => \@unget, - 'command' => \@command, - 'replacecount' => \%replacecount, - 'arg_queue' => RecordQueue->new($read_from,$opt::colsep), - 'context_replace' => $context_replace, - 'len' => \%len, - 'max_number_of_args' => $max_number_of_args, - 'size' => undef, - 'return_files' => $return_files, - 'seq' => 1, - }, ref($class) || $class; -} - -sub get { - my $self = shift; - if(@{$self->{'unget'}}) { - my $cmd_line = shift @{$self->{'unget'}}; - return ($cmd_line); - } else { - my $cmd_line; - $cmd_line = CommandLine->new($self->seq(), - $self->{'command'}, - $self->{'arg_queue'}, - $self->{'context_replace'}, - $self->{'max_number_of_args'}, - $self->{'return_files'}, - $self->{'replacecount'}, - $self->{'len'}, - ); - $cmd_line->populate(); - ::debug("init","cmd_line->number_of_args ", - $cmd_line->number_of_args(), "\n"); - if($opt::pipe or $opt::pipepart) { - if($cmd_line->replaced() eq "") { - # Empty command - pipe requires a command - ::error("--pipe must have a command to pipe into (e.g. 'cat').\n"); - ::wait_and_exit(255); - } - } else { - if($cmd_line->number_of_args() == 0) { - # We did not get more args - maybe at EOF string? 
- return undef; - } elsif($cmd_line->replaced() eq "") { - # Empty command - get the next instead - return $self->get(); - } - } - $self->set_seq($self->seq()+1); - return $cmd_line; - } -} - -sub unget { - my $self = shift; - unshift @{$self->{'unget'}}, @_; -} - -sub empty { - my $self = shift; - my $empty = (not @{$self->{'unget'}}) && $self->{'arg_queue'}->empty(); - ::debug("run", "CommandLineQueue->empty $empty"); - return $empty; -} - -sub seq { - my $self = shift; - return $self->{'seq'}; -} - -sub set_seq { - my $self = shift; - $self->{'seq'} = shift; -} - -sub quote_args { - my $self = shift; - # If there is not command emulate |bash - return $self->{'command'}; -} - -sub size { - my $self = shift; - if(not $self->{'size'}) { - my @all_lines = (); - while(not $self->{'arg_queue'}->empty()) { - push @all_lines, CommandLine->new($self->{'command'}, - $self->{'arg_queue'}, - $self->{'context_replace'}, - $self->{'max_number_of_args'}); - } - $self->{'size'} = @all_lines; - $self->unget(@all_lines); - } - return $self->{'size'}; -} - - -package Limits::Command; - -# Maximal command line length (for -m and -X) -sub max_length { - # Find the max_length of a command line and cache it - # Returns: - # number of chars on the longest command line allowed - if(not $Limits::Command::line_max_len) { - # Disk cache of max command line length - my $len_cache = $ENV{'HOME'} . "/.parallel/tmp/linelen-" . ::hostname(); - my $cached_limit; - if(-e $len_cache) { - open(my $fh, "<", $len_cache) || ::die_bug("Cannot read $len_cache"); - $cached_limit = <$fh>; - close $fh; - } else { - $cached_limit = real_max_length(); - # If $HOME is write protected: Do not fail - mkdir($ENV{'HOME'} . "/.parallel"); - mkdir($ENV{'HOME'} . "/.parallel/tmp"); - open(my $fh, ">", $len_cache); - print $fh $cached_limit; - close $fh; - } - $Limits::Command::line_max_len = $cached_limit; - if($opt::max_chars) { - if($opt::max_chars <= $cached_limit) { - $Limits::Command::line_max_len = $opt::max_chars; - } else { - ::warning("Value for -s option ", - "should be < $cached_limit.\n"); - } - } - } - return $Limits::Command::line_max_len; -} - -sub real_max_length { - # Find the max_length of a command line - # Returns: - # The maximal command line length - # Use an upper bound of 8 MB if the shell allows for for infinite long lengths - my $upper = 8_000_000; - my $len = 8; - do { - if($len > $upper) { return $len }; - $len *= 16; - } while (is_acceptable_command_line_length($len)); - # Then search for the actual max length between 0 and upper bound - return binary_find_max_length(int($len/16),$len); -} - -sub binary_find_max_length { - # Given a lower and upper bound find the max_length of a command line - # Returns: - # number of chars on the longest command line allowed - my ($lower, $upper) = (@_); - if($lower == $upper or $lower == $upper-1) { return $lower; } - my $middle = int (($upper-$lower)/2 + $lower); - ::debug("init", "Maxlen: $lower,$upper,$middle : "); - if (is_acceptable_command_line_length($middle)) { - return binary_find_max_length($middle,$upper); - } else { - return binary_find_max_length($lower,$middle); - } -} - -sub is_acceptable_command_line_length { - # Test if a command line of this length can run - # Returns: - # 0 if the command line length is too long - # 1 otherwise - my $len = shift; - - local *STDERR; - open (STDERR, ">", "/dev/null"); - system "true "."x"x$len; - close STDERR; - ::debug("init", "$len=$? 
"); - return not $?; -} - - -package RecordQueue; - -sub new { - my $class = shift; - my $fhs = shift; - my $colsep = shift; - my @unget = (); - my $arg_sub_queue; - if($colsep) { - # Open one file with colsep - $arg_sub_queue = RecordColQueue->new($fhs); - } else { - # Open one or more files if multiple -a - $arg_sub_queue = MultifileQueue->new($fhs); - } - return bless { - 'unget' => \@unget, - 'arg_number' => 0, - 'arg_sub_queue' => $arg_sub_queue, - }, ref($class) || $class; -} - -sub get { - # Returns: - # reference to array of Arg-objects - my $self = shift; - if(@{$self->{'unget'}}) { - $self->{'arg_number'}++; - return shift @{$self->{'unget'}}; - } - my $ret = $self->{'arg_sub_queue'}->get(); - if(defined $Global::max_number_of_args - and $Global::max_number_of_args == 0) { - ::debug("run", "Read 1 but return 0 args\n"); - return [Arg->new("")]; - } else { - return $ret; - } -} - -sub unget { - my $self = shift; - ::debug("run", "RecordQueue-unget '@_'\n"); - $self->{'arg_number'} -= @_; - unshift @{$self->{'unget'}}, @_; -} - -sub empty { - my $self = shift; - my $empty = not @{$self->{'unget'}}; - $empty &&= $self->{'arg_sub_queue'}->empty(); - ::debug("run", "RecordQueue->empty $empty"); - return $empty; -} - -sub arg_number { - my $self = shift; - return $self->{'arg_number'}; -} - - -package RecordColQueue; - -sub new { - my $class = shift; - my $fhs = shift; - my @unget = (); - my $arg_sub_queue = MultifileQueue->new($fhs); - return bless { - 'unget' => \@unget, - 'arg_sub_queue' => $arg_sub_queue, - }, ref($class) || $class; -} - -sub get { - # Returns: - # reference to array of Arg-objects - my $self = shift; - if(@{$self->{'unget'}}) { - return shift @{$self->{'unget'}}; - } - my $unget_ref=$self->{'unget'}; - if($self->{'arg_sub_queue'}->empty()) { - return undef; - } - my $in_record = $self->{'arg_sub_queue'}->get(); - if(defined $in_record) { - my @out_record = (); - for my $arg (@$in_record) { - ::debug("run", "RecordColQueue::arg $arg\n"); - my $line = $arg->orig(); - ::debug("run", "line='$line'\n"); - if($line ne "") { - for my $s (split /$opt::colsep/o, $line, -1) { - push @out_record, Arg->new($s); - } - } else { - push @out_record, Arg->new(""); - } - } - return \@out_record; - } else { - return undef; - } -} - -sub unget { - my $self = shift; - ::debug("run", "RecordColQueue-unget '@_'\n"); - unshift @{$self->{'unget'}}, @_; -} - -sub empty { - my $self = shift; - my $empty = (not @{$self->{'unget'}} and $self->{'arg_sub_queue'}->empty()); - ::debug("run", "RecordColQueue->empty $empty"); - return $empty; -} - - -package MultifileQueue; - -@Global::unget_argv=(); - -sub new { - my $class = shift; - my $fhs = shift; - for my $fh (@$fhs) { - if(-t $fh) { - ::warning("Input is read from the terminal. ". - "Only experts do this on purpose. ". 
- "Press CTRL-D to exit.\n"); - } - } - return bless { - 'unget' => \@Global::unget_argv, - 'fhs' => $fhs, - 'arg_matrix' => undef, - }, ref($class) || $class; -} - -sub get { - my $self = shift; - if($opt::xapply) { - return $self->xapply_get(); - } else { - return $self->nest_get(); - } -} - -sub unget { - my $self = shift; - ::debug("run", "MultifileQueue-unget '@_'\n"); - unshift @{$self->{'unget'}}, @_; -} - -sub empty { - my $self = shift; - my $empty = (not @Global::unget_argv - and not @{$self->{'unget'}}); - for my $fh (@{$self->{'fhs'}}) { - $empty &&= eof($fh); - } - ::debug("run", "MultifileQueue->empty $empty "); - return $empty; -} - -sub xapply_get { - my $self = shift; - if(@{$self->{'unget'}}) { - return shift @{$self->{'unget'}}; - } - my @record = (); - my $prepend = undef; - my $empty = 1; - for my $fh (@{$self->{'fhs'}}) { - my $arg = read_arg_from_fh($fh); - if(defined $arg) { - # Record $arg for recycling at end of file - push @{$self->{'arg_matrix'}{$fh}}, $arg; - push @record, $arg; - $empty = 0; - } else { - ::debug("run", "EOA "); - # End of file: Recycle arguments - push @{$self->{'arg_matrix'}{$fh}}, shift @{$self->{'arg_matrix'}{$fh}}; - # return last @{$args->{'args'}{$fh}}; - push @record, @{$self->{'arg_matrix'}{$fh}}[-1]; - } - } - if($empty) { - return undef; - } else { - return \@record; - } -} - -sub nest_get { - my $self = shift; - if(@{$self->{'unget'}}) { - return shift @{$self->{'unget'}}; - } - my @record = (); - my $prepend = undef; - my $empty = 1; - my $no_of_inputsources = $#{$self->{'fhs'}} + 1; - if(not $self->{'arg_matrix'}) { - # Initialize @arg_matrix with one arg from each file - # read one line from each file - my @first_arg_set; - my $all_empty = 1; - for (my $fhno = 0; $fhno < $no_of_inputsources ; $fhno++) { - my $arg = read_arg_from_fh($self->{'fhs'}[$fhno]); - if(defined $arg) { - $all_empty = 0; - } - $self->{'arg_matrix'}[$fhno][0] = $arg || Arg->new(""); - push @first_arg_set, $self->{'arg_matrix'}[$fhno][0]; - } - if($all_empty) { - # All filehandles were at eof or eof-string - return undef; - } - return [@first_arg_set]; - } - - # Treat the case with one input source special. For multiple - # input sources we need to remember all previously read values to - # generate all combinations. But for one input source we can - # forget the value after first use. - if($no_of_inputsources == 1) { - my $arg = read_arg_from_fh($self->{'fhs'}[0]); - if(defined($arg)) { - return [$arg]; - } - return undef; - } - for (my $fhno = $no_of_inputsources - 1; $fhno >= 0; $fhno--) { - if(eof($self->{'fhs'}[$fhno])) { - next; - } else { - # read one - my $arg = read_arg_from_fh($self->{'fhs'}[$fhno]); - defined($arg) || next; # If we just read an EOF string: Treat this as EOF - my $len = $#{$self->{'arg_matrix'}[$fhno]} + 1; - $self->{'arg_matrix'}[$fhno][$len] = $arg; - # make all new combinations - my @combarg = (); - for (my $fhn = 0; $fhn < $no_of_inputsources; $fhn++) { - push @combarg, [0, $#{$self->{'arg_matrix'}[$fhn]}]; - } - $combarg[$fhno] = [$len,$len]; # Find only combinations with this new entry - # map combinations - # [ 1, 3, 7 ], [ 2, 4, 1 ] - # => - # [ m[0][1], m[1][3], m[3][7] ], [ m[0][2], m[1][4], m[2][1] ] - my @mapped; - for my $c (expand_combinations(@combarg)) { - my @a; - for my $n (0 .. 
$no_of_inputsources - 1 ) { - push @a, $self->{'arg_matrix'}[$n][$$c[$n]]; - } - push @mapped, \@a; - } - # append the mapped to the ungotten arguments - push @{$self->{'unget'}}, @mapped; - # get the first - return shift @{$self->{'unget'}}; - } - } - # all are eof or at EOF string; return from the unget queue - return shift @{$self->{'unget'}}; -} - -sub read_arg_from_fh { - # Read one Arg from filehandle - # Returns: - # Arg-object with one read line - # undef if end of file - my $fh = shift; - my $prepend = undef; - my $arg; - do {{ - # This makes 10% faster - if(not ($arg = <$fh>)) { - if(defined $prepend) { - return Arg->new($prepend); - } else { - return undef; - } - } -# ::debug("run", "read $arg\n"); - # Remove delimiter - $arg =~ s:$/$::; - if($Global::end_of_file_string and - $arg eq $Global::end_of_file_string) { - # Ignore the rest of input file - close $fh; - ::debug("run", "EOF-string ($arg) met\n"); - if(defined $prepend) { - return Arg->new($prepend); - } else { - return undef; - } - } - if(defined $prepend) { - $arg = $prepend.$arg; # For line continuation - $prepend = undef; #undef; - } - if($Global::ignore_empty) { - if($arg =~ /^\s*$/) { - redo; # Try the next line - } - } - if($Global::max_lines) { - if($arg =~ /\s$/) { - # Trailing space => continued on next line - $prepend = $arg; - redo; - } - } - }} while (1 == 0); # Dummy loop {{}} for redo - if(defined $arg) { - return Arg->new($arg); - } else { - ::die_bug("multiread arg undefined"); - } -} - -sub expand_combinations { - # Input: - # ([xmin,xmax], [ymin,ymax], ...) - # Returns: ([x,y,...],[x,y,...]) - # where xmin <= x <= xmax and ymin <= y <= ymax - my $minmax_ref = shift; - my $xmin = $$minmax_ref[0]; - my $xmax = $$minmax_ref[1]; - my @p; - if(@_) { - # If there are more columns: Compute those recursively - my @rest = expand_combinations(@_); - for(my $x = $xmin; $x <= $xmax; $x++) { - push @p, map { [$x, @$_] } @rest; - } - } else { - for(my $x = $xmin; $x <= $xmax; $x++) { - push @p, [$x]; - } - } - return @p; -} - - -package Arg; - -sub new { - my $class = shift; - my $orig = shift; - my @hostgroups; - if($opt::hostgroups) { - if($orig =~ s:@(.+)::) { - # We found hostgroups on the arg - @hostgroups = split(/\+/, $1); - if(not grep { defined $Global::hostgroups{$_} } @hostgroups) { - ::warning("No such hostgroup (@hostgroups)\n"); - @hostgroups = (keys %Global::hostgroups); - } - } else { - @hostgroups = (keys %Global::hostgroups); - } - } - return bless { - 'orig' => $orig, - 'hostgroups' => \@hostgroups, - }, ref($class) || $class; -} - -sub replace { - # Calculates the corresponding value for a given perl expression - # Returns: - # The calculated string (quoted if asked for) - my $self = shift; - my $perlexpr = shift; # E.g. $_=$_ or s/.gz// - my $quote = (shift) ? 1 : 0; # should the string be quoted? - # This is actually a CommandLine-object, - # but it looks nice to be able to say {= $job->slot() =} - my $job = shift; - $perlexpr =~ s/^-?\d+ //; # Positional replace treated as normal replace - if(not defined $self->{"rpl",0,$perlexpr}) { - local $_; - if($Global::trim eq "n") { - $_ = $self->{'orig'}; - } else { - $_ = trim_of($self->{'orig'}); - } - ::debug("replace", "eval ", $perlexpr, " ", $_, "\n"); - if(not $Global::perleval{$perlexpr}) { - # Make an anonymous function of the $perlexpr - # And more importantly: Compile it only once - if($Global::perleval{$perlexpr} = - eval('sub { no strict; no warnings; my $job = shift; '. - $perlexpr.' }')) { - # All is good - } else { - # The eval failed. 
Maybe $perlexpr is invalid perl? - ::error("Cannot use $perlexpr: $@\n"); - ::wait_and_exit(255); - } - } - # Execute the function - $Global::perleval{$perlexpr}->($job); - $self->{"rpl",0,$perlexpr} = $_; - } - if(not defined $self->{"rpl",$quote,$perlexpr}) { - $self->{"rpl",1,$perlexpr} = - ::shell_quote_scalar($self->{"rpl",0,$perlexpr}); - } - return $self->{"rpl",$quote,$perlexpr}; -} - -sub orig { - my $self = shift; - return $self->{'orig'}; -} - -sub trim_of { - # Removes white space as specifed by --trim: - # n = nothing - # l = start - # r = end - # lr|rl = both - # Returns: - # string with white space removed as needed - my @strings = map { defined $_ ? $_ : "" } (@_); - my $arg; - if($Global::trim eq "n") { - # skip - } elsif($Global::trim eq "l") { - for my $arg (@strings) { $arg =~ s/^\s+//; } - } elsif($Global::trim eq "r") { - for my $arg (@strings) { $arg =~ s/\s+$//; } - } elsif($Global::trim eq "rl" or $Global::trim eq "lr") { - for my $arg (@strings) { $arg =~ s/^\s+//; $arg =~ s/\s+$//; } - } else { - ::error("--trim must be one of: r l rl lr.\n"); - ::wait_and_exit(255); - } - return wantarray ? @strings : "@strings"; -} - - -package TimeoutQueue; - -sub new { - my $class = shift; - my $delta_time = shift; - my ($pct); - if($delta_time =~ /(\d+(\.\d+)?)%/) { - # Timeout in percent - $pct = $1/100; - $delta_time = 1_000_000; - } - return bless { - 'queue' => [], - 'delta_time' => $delta_time, - 'pct' => $pct, - 'remedian_idx' => 0, - 'remedian_arr' => [], - 'remedian' => undef, - }, ref($class) || $class; -} - -sub delta_time { - my $self = shift; - return $self->{'delta_time'}; -} - -sub set_delta_time { - my $self = shift; - $self->{'delta_time'} = shift; -} - -sub remedian { - my $self = shift; - return $self->{'remedian'}; -} - -sub set_remedian { - # Set median of the last 999^3 (=997002999) values using Remedian - # - # Rousseeuw, Peter J., and Gilbert W. Bassett Jr. "The remedian: A - # robust averaging method for large data sets." Journal of the - # American Statistical Association 85.409 (1990): 97-104. - my $self = shift; - my $val = shift; - my $i = $self->{'remedian_idx'}++; - my $rref = $self->{'remedian_arr'}; - $rref->[0][$i%999] = $val; - $rref->[1][$i/999%999] = (sort @{$rref->[0]})[$#{$rref->[0]}/2]; - $rref->[2][$i/999/999%999] = (sort @{$rref->[1]})[$#{$rref->[1]}/2]; - $self->{'remedian'} = (sort @{$rref->[2]})[$#{$rref->[2]}/2]; -} - -sub update_delta_time { - # Update delta_time based on runtime of finished job if timeout is - # a percentage - my $self = shift; - my $runtime = shift; - if($self->{'pct'}) { - $self->set_remedian($runtime); - $self->{'delta_time'} = $self->{'pct'} * $self->remedian(); - ::debug("run", "Timeout: $self->{'delta_time'}s "); - } -} - -sub process_timeouts { - # Check if there was a timeout - my $self = shift; - # $self->{'queue'} is sorted by start time - while (@{$self->{'queue'}}) { - my $job = $self->{'queue'}[0]; - if($job->endtime()) { - # Job already finished. 
No need to timeout the job - # This could be because of --keep-order - shift @{$self->{'queue'}}; - } elsif($job->timedout($self->{'delta_time'})) { - # Need to shift off queue before kill - # because kill calls usleep that calls process_timeouts - shift @{$self->{'queue'}}; - $job->kill(); - } else { - # Because they are sorted by start time the rest are later - last; - } - } -} - -sub insert { - my $self = shift; - my $in = shift; - push @{$self->{'queue'}}, $in; -} - - -package Semaphore; - -# This package provides a counting semaphore -# -# If a process dies without releasing the semaphore the next process -# that needs that entry will clean up dead semaphores -# -# The semaphores are stored in ~/.parallel/semaphores/id- Each -# file in ~/.parallel/semaphores/id-/ is the process ID of the -# process holding the entry. If the process dies, the entry can be -# taken by another process. - -sub new { - my $class = shift; - my $id = shift; - my $count = shift; - $id=~s/([^-_a-z0-9])/unpack("H*",$1)/ige; # Convert non-word chars to hex - $id="id-".$id; # To distinguish it from a process id - my $parallel_dir = $ENV{'HOME'}."/.parallel"; - -d $parallel_dir or mkdir_or_die($parallel_dir); - my $parallel_locks = $parallel_dir."/semaphores"; - -d $parallel_locks or mkdir_or_die($parallel_locks); - my $lockdir = "$parallel_locks/$id"; - my $lockfile = $lockdir.".lock"; - if($count < 1) { ::die_bug("semaphore-count: $count"); } - return bless { - 'lockfile' => $lockfile, - 'lockfh' => Symbol::gensym(), - 'lockdir' => $lockdir, - 'id' => $id, - 'idfile' => $lockdir."/".$id, - 'pid' => $$, - 'pidfile' => $lockdir."/".$$.'@'.::hostname(), - 'count' => $count + 1 # nlinks returns a link for the 'id-' as well - }, ref($class) || $class; -} - -sub acquire { - my $self = shift; - my $sleep = 1; # 1 ms - my $start_time = time; - while(1) { - $self->atomic_link_if_count_less_than() and last; - ::debug("sem", "Remove dead locks"); - my $lockdir = $self->{'lockdir'}; - for my $d (glob "$lockdir/*") { - ::debug("sem", "Lock $d $lockdir\n"); - $d =~ m:$lockdir/([0-9]+)\@([-\._a-z0-9]+)$:o or next; - my ($pid, $host) = ($1, $2); - if($host eq ::hostname()) { - if(not kill 0, $1) { - ::debug("sem", "Dead: $d"); - unlink $d; - } else { - ::debug("sem", "Alive: $d"); - } - } - } - # try again - $self->atomic_link_if_count_less_than() and last; - # Retry slower and slower up to 1 second - $sleep = ($sleep < 1000) ? 
($sleep * 1.1) : ($sleep); - # Random to avoid every sleeping job waking up at the same time - ::usleep(rand()*$sleep); - if(defined($opt::timeout) and - $start_time + $opt::timeout > time) { - # Acquire the lock anyway - if(not -e $self->{'idfile'}) { - open (my $fh, ">", $self->{'idfile'}) or - ::die_bug("timeout_write_idfile: $self->{'idfile'}"); - close $fh; - } - link $self->{'idfile'}, $self->{'pidfile'}; - last; - } - } - ::debug("sem", "acquired $self->{'pid'}\n"); -} - -sub release { - my $self = shift; - unlink $self->{'pidfile'}; - if($self->nlinks() == 1) { - # This is the last link, so atomic cleanup - $self->lock(); - if($self->nlinks() == 1) { - unlink $self->{'idfile'}; - rmdir $self->{'lockdir'}; - } - $self->unlock(); - } - ::debug("run", "released $self->{'pid'}\n"); -} - -sub _release { - my $self = shift; - - unlink $self->{'pidfile'}; - $self->lock(); - my $nlinks = $self->nlinks(); - ::debug("sem", $nlinks, "<", $self->{'count'}); - if($nlinks-- > 1) { - unlink $self->{'idfile'}; - open (my $fh, ">", $self->{'idfile'}) or - ::die_bug("write_idfile: $self->{'idfile'}"); - print $fh "#"x$nlinks; - close $fh; - } else { - unlink $self->{'idfile'}; - rmdir $self->{'lockdir'}; - } - $self->unlock(); - ::debug("sem", "released $self->{'pid'}\n"); -} - -sub atomic_link_if_count_less_than { - # Link $file1 to $file2 if nlinks to $file1 < $count - my $self = shift; - my $retval = 0; - $self->lock(); - ::debug($self->nlinks(), "<", $self->{'count'}); - if($self->nlinks() < $self->{'count'}) { - -d $self->{'lockdir'} or mkdir_or_die($self->{'lockdir'}); - if(not -e $self->{'idfile'}) { - open (my $fh, ">", $self->{'idfile'}) or - ::die_bug("write_idfile: $self->{'idfile'}"); - close $fh; - } - $retval = link $self->{'idfile'}, $self->{'pidfile'}; - } - $self->unlock(); - ::debug("run", "atomic $retval"); - return $retval; -} - -sub _atomic_link_if_count_less_than { - # Link $file1 to $file2 if nlinks to $file1 < $count - my $self = shift; - my $retval = 0; - $self->lock(); - my $nlinks = $self->nlinks(); - ::debug("sem", $nlinks, "<", $self->{'count'}); - if($nlinks++ < $self->{'count'}) { - -d $self->{'lockdir'} or mkdir_or_die($self->{'lockdir'}); - if(not -e $self->{'idfile'}) { - open (my $fh, ">", $self->{'idfile'}) or - ::die_bug("write_idfile: $self->{'idfile'}"); - close $fh; - } - open (my $fh, ">", $self->{'idfile'}) or - ::die_bug("write_idfile: $self->{'idfile'}"); - print $fh "#"x$nlinks; - close $fh; - $retval = link $self->{'idfile'}, $self->{'pidfile'}; - } - $self->unlock(); - ::debug("sem", "atomic $retval"); - return $retval; -} - -sub nlinks { - my $self = shift; - if(-e $self->{'idfile'}) { - ::debug("sem", "nlinks", (stat(_))[3], "size", (stat(_))[7], "\n"); - return (stat(_))[3]; - } else { - return 0; - } -} - -sub lock { - my $self = shift; - my $sleep = 100; # 100 ms - my $total_sleep = 0; - $Global::use{"Fcntl"} ||= eval "use Fcntl qw(:DEFAULT :flock); 1;"; - my $locked = 0; - while(not $locked) { - if(tell($self->{'lockfh'}) == -1) { - # File not open - open($self->{'lockfh'}, ">", $self->{'lockfile'}) - or ::debug("run", "Cannot open $self->{'lockfile'}"); - } - if($self->{'lockfh'}) { - # File is open - chmod 0666, $self->{'lockfile'}; # assuming you want it a+rw - if(flock($self->{'lockfh'}, LOCK_EX()|LOCK_NB())) { - # The file is locked: No need to retry - $locked = 1; - last; - } else { - if ($! 
=~ m/Function not implemented/) { - ::warning("flock: $!"); - ::warning("Will wait for a random while\n"); - ::usleep(rand(5000)); - # File cannot be locked: No need to retry - $locked = 2; - last; - } - } - } - # Locking failed in first round - # Sleep and try again - $sleep = ($sleep < 1000) ? ($sleep * 1.1) : ($sleep); - # Random to avoid every sleeping job waking up at the same time - ::usleep(rand()*$sleep); - $total_sleep += $sleep; - if($opt::semaphoretimeout) { - if($total_sleep/1000 > $opt::semaphoretimeout) { - # Timeout: bail out - ::warning("Semaphore timed out. Ignoring timeout."); - $locked = 3; - last; - } - } else { - if($total_sleep/1000 > 30) { - ::warning("Semaphore stuck for 30 seconds. Consider using --semaphoretimeout."); - } - } - } - ::debug("run", "locked $self->{'lockfile'}"); -} - -sub unlock { - my $self = shift; - unlink $self->{'lockfile'}; - close $self->{'lockfh'}; - ::debug("run", "unlocked\n"); -} - -sub mkdir_or_die { - # If dir is not writable: die - my $dir = shift; - my @dir_parts = split(m:/:,$dir); - my ($ddir,$part); - while(defined ($part = shift @dir_parts)) { - $part eq "" and next; - $ddir .= "/".$part; - -d $ddir and next; - mkdir $ddir; - } - if(not -w $dir) { - ::error("Cannot write to $dir: $!\n"); - ::wait_and_exit(255); - } -} - -# Keep perl -w happy -$opt::x = $Semaphore::timeout = $Semaphore::wait = -$Job::file_descriptor_warning_printed = 0; diff --git a/build_tools/make_package.sh b/build_tools/make_package.sh index 68a5d8a722..ce27356253 100755 --- a/build_tools/make_package.sh +++ b/build_tools/make_package.sh @@ -63,9 +63,9 @@ function gem_install() { function main() { if [[ $# -ne 1 ]]; then - fatal "Usage: $0 " + fatal "Usage: $0 " else - log "using rocksdb version: $1" + log "using Speedb version: $1" fi if [[ -d /vagrant ]]; then @@ -115,13 +115,13 @@ function main() { -s dir \ -t $FPM_OUTPUT \ -C package \ - -n rocksdb \ + -n speedb \ -v $1 \ - --url http://rocksdb.org/ \ - -m rocksdb@fb.com \ - --license BSD \ - --vendor Facebook \ - --description "RocksDB is an embeddable persistent key-value store for fast storage." \ + --url http://speedb.io/ \ + -m hello@speedb.io \ + --license Apache \ + --vendor Speedb \ + --description "Speedb is an embeddable persistent key-value store for fast storage based on RocksDB." \ usr } diff --git a/build_tools/version.sh b/build_tools/version.sh index dbc1a92964..5e3632346c 100755 --- a/build_tools/version.sh +++ b/build_tools/version.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
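The packaging script above takes the release version as its single argument, and the build_tools/version.sh rewrite that continues below derives that version from speedb/version.h. A hedged usage sketch (the `full` mode and script paths are taken from these scripts; running them from the repository root is an assumption, not something the diff mandates):

# Illustrative only: derive the version with version.sh and feed it to the packager
SPEEDB_VERSION="$(build_tools/version.sh full)"
build_tools/make_package.sh "$SPEEDB_VERSION"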
if [ "$#" = "0" ]; then echo "Usage: $0 major|minor|patch|full" @@ -6,18 +6,18 @@ if [ "$#" = "0" ]; then fi if [ "$1" = "major" ]; then - cat include/rocksdb/version.h | grep MAJOR | head -n1 | awk '{print $3}' + grep MAJOR speedb/version.h | head -n1 | awk '{print $3}' fi if [ "$1" = "minor" ]; then - cat include/rocksdb/version.h | grep MINOR | head -n1 | awk '{print $3}' + grep MINOR speedb/version.h | head -n1 | awk '{print $3}' fi if [ "$1" = "patch" ]; then - cat include/rocksdb/version.h | grep PATCH | head -n1 | awk '{print $3}' + grep PATCH speedb/version.h | head -n1 | awk '{print $3}' fi if [ "$1" = "full" ]; then - awk '/#define ROCKSDB/ { env[$2] = $3 } - END { printf "%s.%s.%s\n", env["ROCKSDB_MAJOR"], - env["ROCKSDB_MINOR"], - env["ROCKSDB_PATCH"] }' \ - include/rocksdb/version.h + awk '/#define SPEEDB/ { env[$2] = $3 } + END { printf "%s.%s.%s\n", env["SPEEDB_MAJOR"], + env["SPEEDB_MINOR"], + env["SPEEDB_PATCH"] }' \ + speedb/version.h fi diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index cd9fac3d91..312ee1c95c 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -22,6 +22,7 @@ #include "rocksdb/secondary_cache.h" #include "rocksdb/system_clock.h" #include "rocksdb/table_properties.h" +#include "speedb/version.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/cachable_entry.h" #include "util/coding.h" @@ -579,7 +580,8 @@ class CacheBench { } void PrintEnv() const { - printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Speedb version : %s\n", + GetSpeedbVersionAsString(false).c_str()); printf("Number of threads : %u\n", FLAGS_threads); printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); printf("Cache size : %s\n", diff --git a/cache/cache_reservation_manager.cc b/cache/cache_reservation_manager.cc index 3cc149b432..29192d3872 100644 --- a/cache/cache_reservation_manager.cc +++ b/cache/cache_reservation_manager.cc @@ -118,7 +118,7 @@ Status CacheReservationManagerImpl::IncreaseCacheReservation( return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry, GetNoopDeleterForRole(), &handle); - if (return_status != Status::OK()) { + if (!return_status.ok()) { return return_status; } diff --git a/cache/cache_reservation_manager_test.cc b/cache/cache_reservation_manager_test.cc index 87af653bc2..e56cc1a556 100644 --- a/cache/cache_reservation_manager_test.cc +++ b/cache/cache_reservation_manager_test.cc @@ -40,7 +40,7 @@ class CacheReservationManagerTest : public ::testing::Test { TEST_F(CacheReservationManagerTest, GenerateCacheKey) { std::size_t new_mem_used = 1 * kSizeDummyEntry; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); ASSERT_LT(cache->GetPinnedUsage(), 1 * kSizeDummyEntry + kMetaDataChargeOverhead); @@ -66,7 +66,7 @@ TEST_F(CacheReservationManagerTest, GenerateCacheKey) { TEST_F(CacheReservationManagerTest, KeepCacheReservationTheSame) { std::size_t new_mem_used = 1 * kSizeDummyEntry; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 1 * kSizeDummyEntry); ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); @@ -76,7 +76,7 @@ TEST_F(CacheReservationManagerTest, KeepCacheReservationTheSame) { 1 * kSizeDummyEntry + kMetaDataChargeOverhead); s = 
test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) + EXPECT_OK(s) << "Failed to keep cache reservation the same when new_mem_used equals " "to current cache reservation"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), @@ -95,8 +95,7 @@ TEST_F(CacheReservationManagerTest, IncreaseCacheReservationByMultiplesOfDummyEntrySize) { std::size_t new_mem_used = 2 * kSizeDummyEntry; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) - << "Failed to increase cache reservation correctly"; + EXPECT_OK(s) << "Failed to increase cache reservation correctly"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 2 * kSizeDummyEntry) << "Failed to bookkeep cache reservation increase correctly"; @@ -113,8 +112,7 @@ TEST_F(CacheReservationManagerTest, IncreaseCacheReservationNotByMultiplesOfDummyEntrySize) { std::size_t new_mem_used = 2 * kSizeDummyEntry + kSizeDummyEntry / 2; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) - << "Failed to increase cache reservation correctly"; + EXPECT_OK(s) << "Failed to increase cache reservation correctly"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 3 * kSizeDummyEntry) << "Failed to bookkeep cache reservation increase correctly"; @@ -147,7 +145,7 @@ TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest, std::size_t new_mem_used = kSmallCacheCapacity + 1; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::Incomplete()) + EXPECT_TRUE(s.IsIncomplete()) << "Failed to return status to indicate failure of dummy entry insertion " "during cache reservation on full cache"; EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(), @@ -170,7 +168,7 @@ TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest, new_mem_used = kSmallCacheCapacity / 2; // 2 dummy entries s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) + EXPECT_OK(s) << "Failed to decrease cache reservation after encountering cache " "reservation failure due to full cache"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), @@ -192,7 +190,7 @@ TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest, // Create cache full again for subsequent tests new_mem_used = kSmallCacheCapacity + 1; s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::Incomplete()) + EXPECT_TRUE(s.IsIncomplete()) << "Failed to return status to indicate failure of dummy entry insertion " "during cache reservation on full cache"; EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(), @@ -218,7 +216,7 @@ TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest, cache->SetCapacity(kBigCacheCapacity); new_mem_used = kSmallCacheCapacity + 1; s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) + EXPECT_OK(s) << "Failed to increase cache reservation after increasing cache capacity " "and mitigating cache full error"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), @@ -240,7 +238,7 @@ TEST_F(CacheReservationManagerTest, DecreaseCacheReservationByMultiplesOfDummyEntrySize) { std::size_t new_mem_used = 2 * kSizeDummyEntry; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 2 * kSizeDummyEntry); ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); @@ -250,8 
+248,7 @@ TEST_F(CacheReservationManagerTest, new_mem_used = 1 * kSizeDummyEntry; s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) - << "Failed to decrease cache reservation correctly"; + EXPECT_OK(s) << "Failed to decrease cache reservation correctly"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 1 * kSizeDummyEntry) << "Failed to bookkeep cache reservation decrease correctly"; @@ -268,7 +265,7 @@ TEST_F(CacheReservationManagerTest, DecreaseCacheReservationNotByMultiplesOfDummyEntrySize) { std::size_t new_mem_used = 2 * kSizeDummyEntry; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 2 * kSizeDummyEntry); ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); @@ -278,8 +275,7 @@ TEST_F(CacheReservationManagerTest, new_mem_used = kSizeDummyEntry / 2; s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) - << "Failed to decrease cache reservation correctly"; + EXPECT_OK(s) << "Failed to decrease cache reservation correctly"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 1 * kSizeDummyEntry) << "Failed to bookkeep cache reservation decrease correctly"; @@ -309,7 +305,7 @@ TEST(CacheReservationManagerWithDelayedDecreaseTest, std::size_t new_mem_used = 8 * kSizeDummyEntry; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 8 * kSizeDummyEntry); ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); @@ -320,7 +316,7 @@ TEST(CacheReservationManagerWithDelayedDecreaseTest, new_mem_used = 6 * kSizeDummyEntry; s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation"; + EXPECT_OK(s) << "Failed to delay decreasing cache reservation"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 8 * kSizeDummyEntry) << "Failed to bookkeep correctly when delaying cache reservation " @@ -332,7 +328,7 @@ TEST(CacheReservationManagerWithDelayedDecreaseTest, new_mem_used = 7 * kSizeDummyEntry; s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation"; + EXPECT_OK(s) << "Failed to delay decreasing cache reservation"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), 8 * kSizeDummyEntry) << "Failed to bookkeep correctly when delaying cache reservation " @@ -344,7 +340,7 @@ TEST(CacheReservationManagerWithDelayedDecreaseTest, new_mem_used = 6 * kSizeDummyEntry - 1; s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - EXPECT_EQ(s, Status::OK()) + EXPECT_OK(s) << "Failed to decrease cache reservation correctly when new_mem_used < " "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode"; EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), @@ -381,7 +377,7 @@ TEST(CacheReservationManagerDestructorTest, cache); std::size_t new_mem_used = 1 * kSizeDummyEntry; Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); ASSERT_LT(cache->GetPinnedUsage(), 1 * kSizeDummyEntry + kMetaDataChargeOverhead); @@ -417,7 +413,7 @@ TEST(CacheReservationHandleTest, HandleTest) { Status s = test_cache_rev_mng->MakeCacheReservation( 
incremental_mem_used_handle_1, &handle_1); mem_used = mem_used + incremental_mem_used_handle_1; - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); EXPECT_TRUE(handle_1 != nullptr); EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); @@ -427,7 +423,7 @@ TEST(CacheReservationHandleTest, HandleTest) { s = test_cache_rev_mng->MakeCacheReservation(incremental_mem_used_handle_2, &handle_2); mem_used = mem_used + incremental_mem_used_handle_2; - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); EXPECT_TRUE(handle_2 != nullptr); EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); diff --git a/cmake/CTestRunner.cmake b/cmake/CTestRunner.cmake new file mode 100644 index 0000000000..258da5db15 --- /dev/null +++ b/cmake/CTestRunner.cmake @@ -0,0 +1,118 @@ +# Copyright (C) 2022 Speedb Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# 3.12 is needed for FindPython +cmake_minimum_required(VERSION 3.12) + +# Choose the amount of tests to run in parallel if CTEST_PARALLEL_LEVEL wasn't set +if(NOT DEFINED ENV{CTEST_PARALLEL_LEVEL}) + # Compatibility with the Makefile: support the `J` environment variable + if(DEFINED ENV{J} AND "$ENV{J}" GREATER 0) + set(ENV{CTEST_PARALLEL_LEVEL} "$ENV{J}") + else() + include(ProcessorCount) + ProcessorCount(NCPU) + if(NOT NCPU EQUAL 0) + set(ENV{CTEST_PARALLEL_LEVEL} ${NCPU}) + endif() + endif() +endif() + +# For Makefile compatibility try the following sequence if TEST_TMPDIR isn't set: +# * Use TMPD if set +# * Find a suitable base directory and create a temporary directory under it: +# * /dev/shm on Linux if exists and has the sticky bit set +# * TMPDIR if set and exists +# * On Windows use TMP is set and exists +# * On Windows use TEMP is set and exists +# * /tmp if exists +if(NOT DEFINED ENV{TEST_TMPDIR}) + # Use TMPD if set + if(DEFINED ENV{TMPD}) + set(test_dir "$ENV{TMPD}") + else() + # On Linux, use /dev/shm if the sticky bit is set + if("${CMAKE_HOST_SYSTEM_NAME}" STREQUAL "Linux" AND IS_DIRECTORY "/dev/shm") + execute_process(COMMAND test -k /dev/shm RESULT_VARIABLE status OUTPUT_QUIET ERROR_QUIET) + if(status EQUAL 0) + set(test_dir "/dev/shm") + endif() + endif() + # Use TMPDIR as base if set + if(NOT DEFINED test_dir AND IS_DIRECTORY "$ENV{TMPDIR}") + set(test_dir "$ENV{TMPDIR}") + elseif("${CMAKE_HOST_SYSTEM_NAME}" STREQUAL "Windows") + # Use TMP or TEMP as base if set + # See https://devblogs.microsoft.com/oldnewthing/20150417-00/?p=44213 + if(IS_DIRECTORY "$ENV{TMP}") + set(test_dir "$ENV{TMP}") + elseif(IS_DIRECTORY "$ENV{TEMP}") + set(test_dir "$ENV{TEMP}") + endif() + endif() + # Fall back to /tmp if exists + if(NOT DEFINED test_dir AND IS_DIRECTORY "/tmp") + set(test_dir "/tmp") + endif() + # Create a temporary directory under the base path that we determined + if(DEFINED test_dir) + include(FindPython) + find_package(Python COMPONENTS Interpreter) + # Try using Python for more 
portability when creating the temporary + # sub-directory, but don't depend on it + if(Python_Interpreter_FOUND) + execute_process( + COMMAND "${CMAKE_COMMAND}" -E env "test_dir=${test_dir}" + "${Python_EXECUTABLE}" -c "import os, tempfile; print(tempfile.mkdtemp(prefix='rocksdb.', dir=os.environ['test_dir']))" + RESULT_VARIABLE status OUTPUT_VARIABLE tmpdir + OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT status EQUAL 0) + message(FATAL_ERROR "Python mkdtemp failed") + endif() + set(test_dir "${tmpdir}") + elseif(NOT "${CMAKE_HOST_SYSTEM_NAME}" STREQUAL "Windows") + execute_process( + COMMAND mktemp -d "${test_dir}/rocksdb.XXXXXX" + RESULT_VARIABLE status OUTPUT_VARIABLE tmpdir + OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT status EQUAL 0) + message(FATAL_ERROR "mkdtemp failed") + endif() + set(test_dir "${tmpdir}") + endif() + endif() + endif() + if(DEFINED test_dir) + set(ENV{TEST_TMPDIR} "${test_dir}") + endif() +endif() + +if(DEFINED ENV{TEST_TMPDIR}) + message(STATUS "Running $ENV{CTEST_PARALLEL_LEVEL} tests in parallel in $ENV{TEST_TMPDIR}") +endif() + +# Use a timeout of 10 minutes per test by default +if(DEFINED ENV{TEST_TIMEOUT}) + set(test_timeout "$ENV{TEST_TIMEOUT}") +else() + set(test_timeout 600) +endif() + +# Run all tests, and show test output on failure +execute_process(COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure --schedule-random --timeout ${test_timeout} RESULT_VARIABLE rv) + +# Clean up after ourselves if the run was successful +if(DEFINED tmpdir AND DEFINED rv AND ${rv} EQUAL 0) + file(REMOVE_RECURSE ${tmpdir}) +endif() diff --git a/cmake/RocksDBConfig.cmake.in b/cmake/SpeedbConfig.cmake.in similarity index 89% rename from cmake/RocksDBConfig.cmake.in rename to cmake/SpeedbConfig.cmake.in index 0bd14be11e..3309b45bba 100644 --- a/cmake/RocksDBConfig.cmake.in +++ b/cmake/SpeedbConfig.cmake.in @@ -50,5 +50,5 @@ endif() find_dependency(Threads) -include("${CMAKE_CURRENT_LIST_DIR}/RocksDBTargets.cmake") -check_required_components(RocksDB) +include("${CMAKE_CURRENT_LIST_DIR}/SpeedbTargets.cmake") +check_required_components(Speedb) diff --git a/cmake/modules/ReadSpeedbVersion.cmake b/cmake/modules/ReadSpeedbVersion.cmake new file mode 100644 index 0000000000..061d7cff49 --- /dev/null +++ b/cmake/modules/ReadSpeedbVersion.cmake @@ -0,0 +1,10 @@ +# Read Speedb version from version.h header file. + +function(get_speedb_version version_var) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/speedb/version.h" version_header_file) + foreach(component MAJOR MINOR PATCH) + string(REGEX MATCH "#define SPEEDB_${component} ([0-9]+)" _ ${version_header_file}) + set(SPEEDB_VERSION_${component} ${CMAKE_MATCH_1}) + endforeach() + set(${version_var} "${SPEEDB_VERSION_MAJOR}.${SPEEDB_VERSION_MINOR}.${SPEEDB_VERSION_PATCH}" PARENT_SCOPE) +endfunction() diff --git a/common.mk b/common.mk new file mode 100644 index 0000000000..eee494dc5a --- /dev/null +++ b/common.mk @@ -0,0 +1,58 @@ +ifndef PYTHON + +# Default to python3. Some distros like CentOS 8 do not have `python`. +ifeq ($(origin PYTHON), undefined) + PYTHON := $(shell which python3 || which python || echo python3) +endif +export PYTHON + +endif + +# To setup tmp directory, first recognize some old variables for setting +# test tmp directory or base tmp directory. TEST_TMPDIR is usually read +# by RocksDB tools though Env/FileSystem::GetTestDirectory. 
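The CTestRunner.cmake script above and the common.mk logic that continues below resolve the same knobs: J (plus the standard CTEST_PARALLEL_LEVEL on the CTest side) for how many tests run at once, and TEST_TMPDIR / TMPD for where scratch files go, preferring /dev/shm on Linux when it has the sticky bit set. A hedged sketch of driving a test run with these variables (the exact targets and paths are illustrative, not mandated by the build files):

# Makefile flow: put test scratch files in /dev/shm and run 8 tests at a time
TEST_TMPDIR=/dev/shm J=8 make check

# Driving CTest directly: parallelism comes from the standard CTEST_PARALLEL_LEVEL
# variable, and the test binaries themselves honor TEST_TMPDIR
TEST_TMPDIR=/dev/shm CTEST_PARALLEL_LEVEL=8 ctest --output-on-failure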
+ifeq ($(TEST_TMPDIR),) +TEST_TMPDIR := $(TMPD) +endif + +# Avoid setting up the tmp directory when the target isn't a check target or +# on Makefile restarts +ifneq ($(filter %check,$(MAKECMDGOALS)),) +ifeq ($(MAKE_RESTARTS),) + +ifeq ($(TEST_TMPDIR),) +ifeq ($(BASE_TMPDIR),) +BASE_TMPDIR :=$(TMPDIR) +endif +ifeq ($(BASE_TMPDIR),) +BASE_TMPDIR :=/tmp +endif +# Use /dev/shm on Linux if it has the sticky bit set (otherwise, /tmp or other +# base dir), and create a randomly-named rocksdb.XXXXXX directory therein. +ifneq ($(shell [ "$$(uname -s)" = "Linux" ] && [ -k /dev/shm ] && echo 1),) +BASE_TMPDIR :=/dev/shm +endif +# Use 6 Xs in the template in order to appease the BusyBox mktemp command, +# which requires the template to end with exactly 6 Xs. +TEST_TMPDIR := $(shell mktemp -d "$(BASE_TMPDIR)/rocksdb.XXXXXX") +endif + +# The `export` line below doesn't work in case Make restarts (due to included +# makefiles getting remade), so we need to output the directory we created into +# a temporary config file that will be included by the `include` directive below +# in case of a restart (we don't want to output it into make_config.mk in order +# to avoid having the TEST_TMPDIR implicitly set for test that are run through +# makefiles that include make_config.mk, and because we don't want to change +# make_config.mk on every run) +$(shell printf 'ifeq ($$(TEST_TMPDIR),)\nTEST_TMPDIR:=$(TEST_TMPDIR)\nendif\n' > test_config.mk) + +else + +# If neither TEST_TMPDIR nor TMPD were specified, try to load TEST_TMPDIR from +# a previous run as saved in test_config.mk (generated by the shell call above) +include test_config.mk + +endif +endif + +export TEST_TMPDIR diff --git a/crash_test.mk b/crash_test.mk index 65ac083f14..23c4278ce5 100644 --- a/crash_test.mk +++ b/crash_test.mk @@ -5,7 +5,7 @@ # build DB_STRESS_CMD so it must exist prior. DB_STRESS_CMD?=./db_stress -include python.mk +include common.mk CRASHTEST_MAKE=$(MAKE) -f crash_test.mk CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) @@ -20,6 +20,12 @@ CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) blackbox_crash_test_with_multiops_wc_txn \ blackbox_crash_test_with_multiops_wp_txn +narrow_crash_test: $(DB_STRESS_CMD) + $(CRASHTEST_PY) narrow $(CRASH_TEST_EXT_ARGS) + +no_kill_crash_test: db_stress + $(CRASHTEST_PY) whitebox --disable_kill_points=1 --duration=4000 $(CRASH_TEST_EXT_ARGS) + crash_test: $(DB_STRESS_CMD) # Do not parallelize $(CRASHTEST_MAKE) whitebox_crash_test diff --git a/db/blob/db_blob_index_test.cc b/db/blob/db_blob_index_test.cc index d93aa6bbcc..6bf5e899b1 100644 --- a/db/blob/db_blob_index_test.cc +++ b/db/blob/db_blob_index_test.cc @@ -506,7 +506,8 @@ TEST_F(DBBlobIndexTest, IntegratedBlobIterate) { auto check_iterator = [&](Iterator* iterator, Status expected_status, const Slice& expected_value) { - ASSERT_EQ(expected_status, iterator->status()); + ASSERT_EQ(expected_status.code(), iterator->status().code()); + ASSERT_EQ(expected_status.subcode(), iterator->status().subcode()); if (expected_status.ok()) { ASSERT_TRUE(iterator->Valid()); ASSERT_EQ(expected_value, iterator->value()); diff --git a/db/column_family.cc b/db/column_family.cc index 4c38546eb7..8fcbcd5f62 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -739,6 +739,12 @@ const double kDelayRecoverSlowdownRatio = 1.4; namespace { // If penalize_stop is true, we further reduce slowdown rate. +// being one of them. 
The columen families have their own delay / stop logic +// that should rely solely on their considerations. For that reason, the values +// specific to the cf-s source are used in the code below, rather than the +// global values (see write_controller->delayed_write_rate() and +// The write controller supports multiple sources of delay, the column families +// write_controller->NeedsDelay()) below. std::unique_ptr SetupDelay( WriteController* write_controller, uint64_t compaction_needed_bytes, uint64_t prev_compaction_need_bytes, bool penalize_stop, @@ -746,12 +752,13 @@ std::unique_ptr SetupDelay( const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s. uint64_t max_write_rate = write_controller->max_delayed_write_rate(); - uint64_t write_rate = write_controller->delayed_write_rate(); - + uint64_t write_rate = + write_controller->delayed_write_rate(WriteController::DelaySource::kCF); if (auto_comapctions_disabled) { // When auto compaction is disabled, always use the value user gave. write_rate = max_write_rate; - } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) { + } else if (write_controller->NeedsDelay(WriteController::DelaySource::kCF) && + max_write_rate > kMinWriteRate) { // If user gives rate less than kMinWriteRate, don't adjust it. // // If already delayed, need to adjust based on previous compaction debt. @@ -799,7 +806,8 @@ std::unique_ptr SetupDelay( } } } - return write_controller->GetDelayToken(write_rate); + return write_controller->GetDelayToken(WriteController::DelaySource::kCF, + write_rate); } int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, @@ -890,7 +898,8 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( auto write_stall_cause = write_stall_condition_and_cause.second; bool was_stopped = write_controller->IsStopped(); - bool needed_delay = write_controller->NeedsDelay(); + bool needed_delay = + write_controller->NeedsDelay(WriteController::DelaySource::kCF); if (write_stall_condition == WriteStallCondition::kStopped && write_stall_cause == WriteStallCause::kMemtableLimit) { @@ -934,9 +943,11 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( ioptions_.logger, "[%s] Stalling writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d " - "rate %" PRIu64, + "rate %" PRIu64 " global rate %" PRIu64, name_.c_str(), imm()->NumNotFlushed(), mutable_cf_options.max_write_buffer_number, + write_controller->delayed_write_rate( + WriteController::DelaySource::kCF), write_controller->delayed_write_rate()); } else if (write_stall_condition == WriteStallCondition::kDelayed && write_stall_cause == WriteStallCause::kL0FileCountLimit) { @@ -955,8 +966,10 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( } ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stalling writes because we have %d level-0 files " - "rate %" PRIu64, + "rate %" PRIu64 " global rate %" PRIu64, name_.c_str(), vstorage->l0_delay_trigger_count(), + write_controller->delayed_write_rate( + WriteController::DelaySource::kCF), write_controller->delayed_write_rate()); } else if (write_stall_condition == WriteStallCondition::kDelayed && write_stall_cause == WriteStallCause::kPendingCompactionBytes) { @@ -977,11 +990,14 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); + ROCKS_LOG_WARN( 
ioptions_.logger, "[%s] Stalling writes because of estimated pending compaction " - "bytes %" PRIu64 " rate %" PRIu64, + "bytes %" PRIu64 " rate %" PRIu64 " global rate %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes(), + write_controller->delayed_write_rate( + WriteController::DelaySource::kCF), write_controller->delayed_write_rate()); } else { assert(write_stall_condition == WriteStallCondition::kNormal); @@ -1019,9 +1035,12 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( // double the slowdown ratio. This is to balance the long term slowdown // increase signal. if (needed_delay) { - uint64_t write_rate = write_controller->delayed_write_rate(); - write_controller->set_delayed_write_rate(static_cast( - static_cast(write_rate) * kDelayRecoverSlowdownRatio)); + uint64_t write_rate = write_controller->delayed_write_rate( + WriteController::DelaySource::kCF); + write_controller->set_delayed_write_rate( + WriteController::DelaySource::kCF, + static_cast(static_cast(write_rate) * + kDelayRecoverSlowdownRatio)); // Set the low pri limit to be 1/4 the delayed write rate. // Note we don't reset this value even after delay condition is relased. // Low-pri rate will continue to apply if there is a compaction @@ -1214,7 +1233,7 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) { RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; - if (sv && sv->Unref()) { + if (sv != SuperVersion::kSVObsolete && sv->Unref()) { RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS); db->mutex()->Lock(); // NOTE: underlying resources held by superversion (sst files) might diff --git a/db/column_family_test.cc b/db/column_family_test.cc index c55eb12905..24cc32b675 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -2697,7 +2697,8 @@ TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); mutable_cf_options.disable_auto_compactions = true; - dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate); + dbfull()->TEST_write_controler().set_delayed_write_rate( + WriteController::DelaySource::kCF, kBaseRate); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); @@ -3128,8 +3129,6 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) { #ifndef ROCKSDB_LITE // TEST functions are not supported in lite TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { SpecialEnv env(Env::Default()); - // Allow both of flush and purge job to schedule. 
- env.SetBackgroundThreads(2, Env::HIGH); db_options_.env = &env; db_options_.max_background_flushes = 1; column_family_options_.memtable_factory.reset( @@ -3163,9 +3162,8 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"ColumnFamilyTest::IteratorCloseWALFile2:0", "DBImpl::BGWorkPurge:start"}, - {"ColumnFamilyTest::IteratorCloseWALFile2:2", + {"ColumnFamilyTest::IteratorCloseWALFile2:1", "DBImpl::BackgroundCallFlush:start"}, - {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"}, }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -3177,22 +3175,37 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { ASSERT_EQ(2, env.num_open_wal_file_.load()); // Deleting the iterator will clear its super version, triggering // closing all files - it->Seek(""); + it->Seek(""); // purge (x2) ASSERT_OK(it->status()); ASSERT_EQ(2, env.num_open_wal_file_.load()); ASSERT_EQ(0, env.delete_count_.load()); TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0"); - TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1"); + + // Fill the low priority pool in order to ensure that all background purges + // finished before we continue + std::vector sleeping_tasks( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& task : sleeping_tasks) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &task, + Env::Priority::LOW); + task.WaitUntilSleeping(); + } + // Release and wait for all of the tasks to finish + for (auto& task : sleeping_tasks) { + task.WakeUp(); + task.WaitUntilDone(); + } + ASSERT_EQ(1, env.num_open_wal_file_.load()); ASSERT_EQ(1, env.delete_count_.load()); - TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2"); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1"); WaitForFlush(1); ASSERT_EQ(1, env.num_open_wal_file_.load()); ASSERT_EQ(1, env.delete_count_.load()); - delete it; + delete it; // purge ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Reopen(); diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 29e3494eaa..a3d4a44b09 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -228,7 +228,7 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { // verify all compaction input files are deleted for (auto fname : l0_files) { - ASSERT_EQ(Status::NotFound(), env_->FileExists(fname)); + ASSERT_TRUE(env_->FileExists(fname).IsNotFound()); } delete db; } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 7cd5ad7e23..c480ce51a9 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -337,6 +337,68 @@ TEST_F(CorruptionTest, Recovery) { Check(36, 36); } +TEST_F(CorruptionTest, PostPITRCorruptionWALsRetained) { + // Repro for bug where WALs following the point-in-time recovery were not + // retained leading to the next recovery failing. 
+ CloseDb(); + + options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + + const std::string test_cf_name = "test_cf"; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions()); + + uint64_t log_num; + { + options_.create_missing_column_families = true; + std::vector cfhs; + ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_)); + + ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v")); + ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k", "v")); + ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2")); + std::vector file_nums; + GetSortedWalFiles(file_nums); + log_num = file_nums.back(); + for (auto* cfh : cfhs) { + delete cfh; + } + CloseDb(); + } + + CorruptFileWithTruncation(FileType::kWalFile, log_num, + /*bytes_to_truncate=*/1); + + { + // Recover "k" -> "v" for both CFs. "k2" -> "v2" is lost due to truncation. + options_.avoid_flush_during_recovery = true; + std::vector cfhs; + ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_)); + // Flush one but not both CFs and write some data so there's a seqno gap + // between the PITR corruption and the next DB session's first WAL. + ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k2", "v2")); + ASSERT_OK(db_->Flush(FlushOptions(), cfhs[1])); + + for (auto* cfh : cfhs) { + delete cfh; + } + CloseDb(); + } + + // With the bug, this DB open would remove the WALs following the PITR + // corruption. Then, the next recovery would fail. + for (int i = 0; i < 2; ++i) { + std::vector cfhs; + ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_)); + + for (auto* cfh : cfhs) { + delete cfh; + } + CloseDb(); + } +} + TEST_F(CorruptionTest, RecoverWriteError) { env_->writable_file_error_ = true; Status s = TryReopen(); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 3833066dd9..671127b248 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -1125,7 +1125,7 @@ TEST_F(DBBasicTest, DBClose) { s = db->Close(); ASSERT_EQ(env->GetCloseCount(), 1); - ASSERT_EQ(s, Status::IOError()); + ASSERT_TRUE(s.IsIOError()); delete db; ASSERT_EQ(env->GetCloseCount(), 1); @@ -1145,7 +1145,7 @@ TEST_F(DBBasicTest, DBClose) { ASSERT_TRUE(db != nullptr); s = db->Close(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); delete db; ASSERT_EQ(env->GetCloseCount(), 2); options.info_log.reset(); @@ -1168,15 +1168,15 @@ TEST_F(DBBasicTest, DBCloseFlushError) { ASSERT_OK(Put("key3", "value3")); fault_injection_env->SetFilesystemActive(false); Status s = dbfull()->Close(); - ASSERT_NE(s, Status::OK()); + ASSERT_NOK(s); // retry should return the same error s = dbfull()->Close(); - ASSERT_NE(s, Status::OK()); + ASSERT_NOK(s); fault_injection_env->SetFilesystemActive(true); // retry close() is no-op even the system is back. Could be improved if // Close() is retry-able: #9029 s = dbfull()->Close(); - ASSERT_NE(s, Status::OK()); + ASSERT_NOK(s); Destroy(options); } @@ -2247,7 +2247,7 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { // Make the value compressible. A purely random string doesn't compress // and the resultant data block will not be compressed std::string value(rnd.RandomString(128) + zero_str); - assert(Put(Key(i), value) == Status::OK()); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); @@ -2789,8 +2789,11 @@ class DBBasicTestMultiGet : public DBTestBase { // Make the value compressible. 
A purely random string doesn't compress // and the resultant data block will not be compressed values_.emplace_back(rnd.RandomString(128) + zero_str); - assert(((num_cfs == 1) ? Put(Key(i), values_[i]) - : Put(cf, Key(i), values_[i])) == Status::OK()); + if (num_cfs == 1) { + assert(Put(Key(i), values_[i]).ok()); + } else { + assert(Put(cf, Key(i), values_[i]).ok()); + } } if (num_cfs == 1) { EXPECT_OK(Flush()); @@ -2802,9 +2805,11 @@ class DBBasicTestMultiGet : public DBTestBase { // block cannot gain space by compression uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0'); std::string tmp_key = "a" + Key(i); - assert(((num_cfs == 1) ? Put(tmp_key, uncompressable_values_[i]) - : Put(cf, tmp_key, uncompressable_values_[i])) == - Status::OK()); + if (num_cfs == 1) { + assert(Put(tmp_key, uncompressable_values_[i]).ok()); + } else { + assert(Put(cf, tmp_key, uncompressable_values_[i]).ok()); + } } if (num_cfs == 1) { EXPECT_OK(Flush()); @@ -3229,8 +3234,8 @@ TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { keys.data(), values.data(), statuses.data(), true); ASSERT_TRUE(CheckValue(0, values[0].ToString())); // ASSERT_TRUE(CheckValue(50, values[1].ToString())); - ASSERT_EQ(statuses[0], Status::OK()); - ASSERT_EQ(statuses[1], Status::Corruption()); + ASSERT_OK(statuses[0]); + ASSERT_TRUE(statuses[1].IsCorruption()); SyncPoint::GetInstance()->DisableProcessing(); } @@ -3275,8 +3280,8 @@ TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) { dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); - ASSERT_EQ(statuses[0], Status::IOError()); - ASSERT_EQ(statuses[1], Status::IOError()); + ASSERT_TRUE(statuses[0].IsIOError()); + ASSERT_TRUE(statuses[1].IsIOError()); SyncPoint::GetInstance()->DisableProcessing(); } @@ -3484,9 +3489,7 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet { if (i < num_ok) { EXPECT_OK(statuses[i]); } else { - if (statuses[i] != Status::TimedOut()) { - EXPECT_EQ(statuses[i], Status::TimedOut()); - } + EXPECT_TRUE(statuses[i].IsTimedOut()); } } } @@ -3811,7 +3814,7 @@ TEST_P(DBBasicTestDeadline, PointLookupDeadline) { std::string value; Status s = dbfull()->Get(ro, "k50", &value); if (fs->TimedOut()) { - ASSERT_EQ(s, Status::TimedOut()); + ASSERT_TRUE(s.IsTimedOut()); } else { timedout = false; ASSERT_OK(s); @@ -3898,7 +3901,7 @@ TEST_P(DBBasicTestDeadline, IteratorDeadline) { } if (fs->TimedOut()) { ASSERT_FALSE(iter->Valid()); - ASSERT_EQ(iter->status(), Status::TimedOut()); + ASSERT_TRUE(iter->status().IsTimedOut()); } else { timedout = false; ASSERT_OK(iter->status()); diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 0de7e4f662..26708f9390 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -2850,11 +2850,14 @@ TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { options.max_subcompactions = max_subcompactions_; env_->SetBackgroundThreads(1, Env::HIGH); - env_->SetBackgroundThreads(1, Env::LOW); // stop the compaction thread until we simulate the file creation failure. 
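  // (Editorial sketch, not part of the patch; it restates the pattern applied
  // in the hunks below and in several other tests in this diff.) Since the
  // tests can no longer assume a single LOW-priority thread (the explicit
  // SetBackgroundThreads(1, Env::LOW) call is removed and background purges
  // now run in the LOW pool), one sleeping task is parked per LOW-pool thread
  // to keep compactions blocked:
  //
  //   std::vector<test::SleepingBackgroundTask> sleepers(
  //       std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW)));
  //   for (auto& t : sleepers) {
  //     env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &t,
  //                    Env::Priority::LOW);
  //     t.WaitUntilSleeping();
  //   }
  //   // ... run the steps that must not race with a background compaction ...
  //   for (auto& t : sleepers) {
  //     t.WakeUp();
  //     t.WaitUntilDone();
  //   }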
- test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + std::vector sleeping_task_low( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& sleeping_task : sleeping_task_low) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + sleeping_task.WaitUntilSleeping(); + } options.env = env_; @@ -2884,8 +2887,8 @@ TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { // Fail the first file creation. env_->non_writable_count_ = 1; - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + sleeping_task_low[0].WakeUp(); + sleeping_task_low[0].WaitUntilDone(); // Expect compaction to fail here as one file will fail its // creation. @@ -2903,6 +2906,10 @@ TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { } env_->non_writable_count_ = 0; + for (size_t i = 1; i < sleeping_task_low.size(); ++i) { + sleeping_task_low[i].WakeUp(); + sleeping_task_low[i].WaitUntilDone(); + } // Make sure RocksDB will not get into corrupted state. Reopen(options); @@ -2945,17 +2952,22 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { ASSERT_EQ("0,1", FilesPerLevel(0)); // block compactions - test::SleepingBackgroundTask sleeping_task; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, - Env::Priority::LOW); + std::vector sleeping_tasks( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& task : sleeping_tasks) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &task, + Env::Priority::LOW); + } options.max_bytes_for_level_base = 1024 * 1024; // 1 MB Reopen(options); std::unique_ptr iterator(db_->NewIterator(ReadOptions())); ASSERT_EQ("0,1", FilesPerLevel(0)); // let compactions go - sleeping_task.WakeUp(); - sleeping_task.WaitUntilDone(); + for (auto& task : sleeping_tasks) { + task.WakeUp(); + task.WaitUntilDone(); + } // this should execute L1->L2 (move) ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -6633,12 +6645,12 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Destroy(options); Reopen(options); @@ -6649,7 +6661,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::FlushMemTable:FlushMemTableFinished", "BackgroundCallCompaction:0"}}); @@ -6660,7 +6672,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); @@ -6674,12 +6686,12 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); 
- ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); // Each write will be similated as corrupted. // Since the file system returns IOStatus::Corruption, it is an @@ -6688,7 +6700,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::FlushMemTable:FlushMemTableFinished", "BackgroundCallCompaction:0"}}); @@ -6698,7 +6710,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); @@ -6727,12 +6739,12 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Destroy(options); Reopen(options); @@ -6740,7 +6752,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::FlushMemTable:FlushMemTableFinished", "BackgroundCallCompaction:0"}}); @@ -6751,9 +6763,9 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); SyncPoint::GetInstance()->DisableProcessing(); Destroy(options); Reopen(options); @@ -6764,19 +6776,19 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); // options is not set, the checksum handoff will not be triggered fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::FlushMemTable:FlushMemTableFinished", "BackgroundCallCompaction:0"}}); @@ -6786,9 +6798,9 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Destroy(options); } @@ -6814,12 +6826,12 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s, Status::OK()); + 
ASSERT_OK(s); Destroy(options); Reopen(options); @@ -6830,7 +6842,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::FlushMemTable:FlushMemTableFinished", "BackgroundCallCompaction:0"}}); @@ -6841,7 +6853,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); SyncPoint::GetInstance()->DisableProcessing(); @@ -6871,12 +6883,12 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); // Each write will be similated as corrupted. // Since the file system returns IOStatus::Corruption, it is mapped to @@ -6885,7 +6897,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(2), "value2")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::FlushMemTable:FlushMemTableFinished", "BackgroundCallCompaction:0"}}); @@ -6895,7 +6907,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "value3")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); SyncPoint::GetInstance()->DisableProcessing(); @@ -7119,9 +7131,12 @@ TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) { Reopen(options); // Block compaction queue - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + std::vector sleeping_task_low( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& sleeping_task : sleeping_task_low) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + } // generate files, but avoid trigger auto compaction for (int i = 0; i < kNumL0Files / 2; i++) { @@ -7156,8 +7171,10 @@ TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) { // CompactRange should return before the compaction has the chance to run compact_thread.join(); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + } ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ("0,1", FilesPerLevel(0)); } @@ -7176,9 +7193,12 @@ TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) { Reopen(options); // Block compaction queue - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + std::vector sleeping_task_low( + std::max(1, 
env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& sleeping_task : sleeping_task_low) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + } // generate files, but avoid trigger auto compaction for (int i = 0; i < kNumL0Files / 2; i++) { @@ -7218,8 +7238,10 @@ TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) { auto s = db_->Close(); ASSERT_OK(s); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + } } TEST_F(DBCompactionTest, DBCloseWithManualCompaction) { @@ -7236,9 +7258,12 @@ TEST_F(DBCompactionTest, DBCloseWithManualCompaction) { Reopen(options); // Block compaction queue - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + std::vector sleeping_task_low( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& sleeping_task : sleeping_task_low) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + } // generate files, but avoid trigger auto compaction for (int i = 0; i < kNumL0Files / 2; i++) { @@ -7275,8 +7300,10 @@ TEST_F(DBCompactionTest, DBCloseWithManualCompaction) { // manual compaction thread should return with Incomplete(). compact_thread.join(); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + } } TEST_F(DBCompactionTest, diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 76442086d6..19480d6c7e 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -143,6 +143,7 @@ TEST_F(DBFlushTest, FlushInLowPriThreadPool) { options.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); Reopen(options); env_->SetBackgroundThreads(0, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); std::thread::id tid; int num_flushes = 0, num_compactions = 0; @@ -1549,7 +1550,7 @@ TEST_F(DBFlushTest, FlushError) { Status s = dbfull()->TEST_SwitchMemtable(); fault_injection_env->SetFilesystemActive(true); Destroy(options); - ASSERT_NE(s, Status::OK()); + ASSERT_NOK(s); } TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { @@ -1686,6 +1687,7 @@ TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { options.create_if_missing = true; options.listeners.push_back(listener); // Setting max_flush_jobs = max_background_jobs / 4 = 2. + options.max_background_flushes = options.max_background_compactions = -1; options.max_background_jobs = 8; // Allow 2 immutable memtables. options.max_write_buffer_number = 3; @@ -2706,6 +2708,7 @@ TEST_P(DBAtomicFlushTest, BgThreadNoWaitAfterManifestError) { options.env = fault_injection_env.get(); // Set a larger value than default so that RocksDB can schedule concurrent // background flush threads. 
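  // (Editorial note, a guess at intent rather than part of the patch.)
  // Explicitly forcing both limits to -1 below keeps the revised
  // GetBGJobLimits() (see db_impl_compaction_flush.cc in this diff) on the
  // max_background_jobs-based split, so the test still gets several flush
  // threads out of its 8 background jobs regardless of the surrounding
  // defaults.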
+ options.max_background_flushes = options.max_background_compactions = -1; options.max_background_jobs = 8; options.max_write_buffer_number = 8; CreateAndReopenWithCF({"pikachu"}, options); diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index dff9fce50c..3bbce0b71e 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -84,6 +84,7 @@ #include "rocksdb/table.h" #include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" +#include "speedb/version.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" @@ -1539,9 +1540,9 @@ void DBImpl::SchedulePurge() { mutex_.AssertHeld(); assert(opened_successfully_); - // Purge operations are put into High priority queue + // Purge operations are put into the low priority queue bg_purge_scheduled_++; - env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr); + env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::LOW, nullptr); } void DBImpl::BackgroundCallPurge() { @@ -3114,7 +3115,7 @@ Status DBImpl::NewIterators( if (read_options.tailing) { #ifdef ROCKSDB_LITE return Status::InvalidArgument( - "Tailing iterator not supported in RocksDB lite"); + "Tailing iterator not supported in LITE mode"); #else for (auto cfh : column_families) { auto cfd = static_cast_with_check(cfh)->cfd(); @@ -4442,14 +4443,15 @@ void DBImpl::EraseThreadStatusDbInfo() const {} // // A global method that can dump out the build version void DumpRocksDBBuildVersion(Logger* log) { - ROCKS_LOG_HEADER(log, "RocksDB version: %s\n", + ROCKS_LOG_HEADER(log, "Speedb version: %s (%s)\n", + GetSpeedbVersionAsString().c_str(), GetRocksVersionAsString().c_str()); const auto& props = GetRocksBuildProperties(); - const auto& sha = props.find("rocksdb_build_git_sha"); + const auto& sha = props.find("speedb_build_git_sha"); if (sha != props.end()) { ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str()); } - const auto date = props.find("rocksdb_build_date"); + const auto date = props.find("speedb_build_date"); if (date != props.end()) { ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str()); } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 77987ab418..0d922f00b8 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1094,6 +1094,10 @@ class DBImpl : public DB { PeriodicWorkTestScheduler* TEST_GetPeriodicWorkScheduler() const; #endif // !ROCKSDB_LITE + bool TEST_has_write_controller_token() const { + return (write_controller_token_.get() != nullptr); + } + #endif // NDEBUG // persist stats to column family "_persistent_stats" @@ -1240,43 +1244,6 @@ class DBImpl : public DB { std::atomic shutting_down_; - // RecoveryContext struct stores the context about version edits along - // with corresponding column_family_data and column_family_options. 
- class RecoveryContext { - public: - ~RecoveryContext() { - for (auto& edit_list : edit_lists_) { - for (auto* edit : edit_list) { - delete edit; - } - edit_list.clear(); - } - cfds_.clear(); - mutable_cf_opts_.clear(); - edit_lists_.clear(); - files_to_delete_.clear(); - } - - void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) { - if (map_.find(cfd->GetID()) == map_.end()) { - uint32_t size = static_cast(map_.size()); - map_.emplace(cfd->GetID(), size); - cfds_.emplace_back(cfd); - mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions()); - edit_lists_.emplace_back(autovector()); - } - uint32_t i = map_[cfd->GetID()]; - edit_lists_[i].emplace_back(new VersionEdit(edit)); - } - - std::unordered_map map_; // cf_id to index; - autovector cfds_; - autovector mutable_cf_opts_; - autovector> edit_lists_; - // files_to_delete_ contains sst files - std::set files_to_delete_; - }; - // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. // If need_mutex_lock = false, the method will lock DB mutex. @@ -1393,19 +1360,16 @@ class DBImpl : public DB { // be made to the descriptor are added to *edit. // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is // skipped. - // recovery_ctx stores the context about version edits and all those - // edits are persisted to new Manifest after successfully syncing the new WAL. virtual Status Recover( const std::vector& column_families, bool read_only = false, bool error_if_wal_file_exists = false, bool error_if_data_exists_in_wals = false, - uint64_t* recovered_seq = nullptr, - RecoveryContext* recovery_ctx = nullptr); + uint64_t* recovered_seq = nullptr); virtual bool OwnTablesAndLogs() const { return true; } // Set DB identity file, and write DB ID to manifest if necessary. - Status SetDBId(bool read_only, RecoveryContext* recovery_ctx); + Status SetDBId(bool read_only); // REQUIRES: db mutex held when calling this function, but the db mutex can // be released and re-acquired. Db mutex will be held when the function @@ -1414,15 +1378,12 @@ class DBImpl : public DB { // not referenced in the MANIFEST (e.g. // 1. It's best effort recovery; // 2. The VersionEdits referencing the SST files are appended to - // RecoveryContext, DB crashes when syncing the MANIFEST, the VersionEdits are + // MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are // still not synced to MANIFEST during recovery.) - // It stores the SST files to be deleted in RecoveryContext. In the + // We delete these SST files. In the // meantime, we find out the largest file number present in the paths, and // bump up the version set's next_file_number_ to be 1 + largest_file_number. - // recovery_ctx stores the context about version edits and files to be - // deleted. All those edits are persisted to new Manifest after successfully - // syncing the new WAL. - Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx); + Status DeleteUnreferencedSstFiles(); // SetDbSessionId() should be called in the constuctor DBImpl() // to ensure that db_session_id_ gets updated every time the DB is opened @@ -1432,11 +1393,6 @@ class DBImpl : public DB { Status FailIfTsSizesMismatch(const ColumnFamilyHandle* column_family, const Slice& ts) const; - // recovery_ctx stores the context about version edits and - // LogAndApplyForRecovery persist all those edits to new Manifest after - // successfully syncing new WAL. 
- Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx); - private: friend class DB; friend class ErrorHandler; @@ -1691,10 +1647,9 @@ class DBImpl : public DB { // REQUIRES: log_numbers are sorted in ascending order // corrupted_log_found is set to true if we recover from a corrupted log file. - Status RecoverLogFiles(std::vector& log_numbers, + Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only, - bool* corrupted_log_found, - RecoveryContext* recovery_ctx); + bool* corrupted_log_found); // The following two methods are used to flush a memtable to // storage. The first one is used at database RecoveryTime (when the @@ -1704,12 +1659,6 @@ class DBImpl : public DB { Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); - // Move all the WAL files starting from corrupted WAL found to - // max_wal_number to avoid column family inconsistency error on recovery. It - // also removes the deleted file from the vector wal_numbers. - void MoveCorruptedWalFiles(std::vector& wal_numbers, - uint64_t corrupted_wal_number); - // Get the size of a log file and, if truncate is true, truncate the // log file to its actual size, thereby freeing preallocated space. // Return success even if truncate fails @@ -2115,6 +2064,8 @@ class DBImpl : public DB { Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, std::string ts_low); + void HandleWBMDelayWritesDuringPreprocessWrite(); + // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; @@ -2448,6 +2399,13 @@ class DBImpl : public DB { // Pointer to WriteBufferManager stalling interface. std::unique_ptr wbm_stall_; + + // Members used for WBM's required delay + std::unique_ptr write_controller_token_; + WriteBufferManager::UsageState wbm_spdb_usage_state_ = + WriteBufferManager::UsageState::kNone; + uint64_t wbm_spdb_delayed_write_factor_ = + WriteBufferManager::kNoneDelayedWriteFactor; }; extern Options SanitizeOptions(const std::string& db, const Options& src, diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 7808de8de8..02f7d7d951 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -318,7 +318,7 @@ Status DBImpl::FlushMemTableToOutputFile( error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL); } } else { - assert(s == log_io_s); + assert(s.code() == log_io_s.code() && s.subcode() == log_io_s.subcode()); Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } @@ -782,7 +782,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL); } } else { - assert(s == log_io_s); + assert(s.code() == log_io_s.code() && s.subcode() == log_io_s.subcode()); Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } @@ -2492,7 +2492,11 @@ DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes, int max_background_jobs, bool parallelize_compactions) { BGJobLimits res; - if (max_background_flushes == -1 && max_background_compactions == -1) { + const int flushes = std::max(1, max_background_flushes); + const int compactions = std::max(1, max_background_compactions); + + if ((max_background_flushes == -1 && max_background_compactions == -1) || + (max_background_jobs > flushes + compactions)) { // for our first stab implementing max_background_jobs, simply 
allocate a // quarter of the threads to flushes. res.max_flushes = std::max(1, max_background_jobs / 4); @@ -2661,7 +2665,7 @@ void DBImpl::BGWorkBottomCompaction(void* arg) { } void DBImpl::BGWorkPurge(void* db) { - IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); TEST_SYNC_POINT("DBImpl::BGWorkPurge:start"); reinterpret_cast(db)->BackgroundCallPurge(); TEST_SYNC_POINT("DBImpl::BGWorkPurge:end"); diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 0d3a3bea73..1790ed836f 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -863,7 +863,7 @@ uint64_t PrecomputeMinLogNumberToKeep2PC( return min_log_number_to_keep; } -Status DBImpl::SetDBId(bool read_only, RecoveryContext* recovery_ctx) { +Status DBImpl::SetDBId(bool read_only) { Status s; // Happens when immutable_db_options_.write_dbid_to_manifest is set to true // the very first time. @@ -890,14 +890,14 @@ Status DBImpl::SetDBId(bool read_only, RecoveryContext* recovery_ctx) { } s = GetDbIdentityFromIdentityFile(&db_id_); if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { - assert(!read_only); - assert(recovery_ctx != nullptr); - assert(versions_->GetColumnFamilySet() != nullptr); VersionEdit edit; edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); versions_->db_id_ = db_id_; - recovery_ctx->UpdateVersionEdits( - versions_->GetColumnFamilySet()->GetDefault(), edit); + s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + /* new_descriptor_log */ false); } } else if (!read_only) { s = SetIdentityFile(env_, dbname_, db_id_); @@ -905,7 +905,7 @@ Status DBImpl::SetDBId(bool read_only, RecoveryContext* recovery_ctx) { return s; } -Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) { +Status DBImpl::DeleteUnreferencedSstFiles() { mutex_.AssertHeld(); std::vector paths; paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator))); @@ -925,6 +925,7 @@ Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) { uint64_t next_file_number = versions_->current_next_file_number(); uint64_t largest_file_number = next_file_number; + std::set files_to_delete; Status s; for (const auto& path : paths) { std::vector files; @@ -942,9 +943,8 @@ Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) { const std::string normalized_fpath = path + fname; largest_file_number = std::max(largest_file_number, number); if (type == kTableFile && number >= next_file_number && - recovery_ctx->files_to_delete_.find(normalized_fpath) == - recovery_ctx->files_to_delete_.end()) { - recovery_ctx->files_to_delete_.insert(normalized_fpath); + files_to_delete.find(normalized_fpath) == files_to_delete.end()) { + files_to_delete.insert(normalized_fpath); } } } @@ -961,7 +961,21 @@ Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) { assert(versions_->GetColumnFamilySet()); ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault(); assert(default_cfd); - recovery_ctx->UpdateVersionEdits(default_cfd, edit); + s = versions_->LogAndApply( + default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_, + directories_.GetDbDir(), /*new_descriptor_log*/ false); + if (!s.ok()) { + return s; + } + + mutex_.Unlock(); + for (const auto& fname : files_to_delete) { + s = env_->DeleteFile(fname); + if (!s.ok()) { + break; + } + } + mutex_.Lock(); return s; } 
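(Editorial worked example for the GetBGJobLimits() change above in
db_impl_compaction_flush.cc; it assumes the usual split in which flushes get
max_background_jobs / 4 and compactions the remainder, as the quoted comment
describes.)

  max_background_jobs = 8, max_background_flushes = -1, max_background_compactions = -1
    -> max_flushes = max(1, 8 / 4) = 2, max_compactions = 8 - 2 = 6
  max_background_jobs = 8, max_background_flushes = 1, max_background_compactions = 4
    -> 8 > max(1, 1) + max(1, 4) = 5, so the new condition also applies the
       jobs-based split (2 flushes, 6 compactions) instead of honoring the
       explicit 1 / 4 limits as the pre-patch code did.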
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 37b567baf4..64896cba47 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -399,7 +399,7 @@ IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname, Status DBImpl::Recover( const std::vector& column_families, bool read_only, bool error_if_wal_file_exists, bool error_if_data_exists_in_wals, - uint64_t* recovered_seq, RecoveryContext* recovery_ctx) { + uint64_t* recovered_seq) { mutex_.AssertHeld(); bool is_new_db = false; @@ -518,10 +518,9 @@ Status DBImpl::Recover( if (!s.ok()) { return s; } - - s = SetDBId(read_only, recovery_ctx); + s = SetDBId(read_only); if (s.ok() && !read_only) { - s = DeleteUnreferencedSstFiles(recovery_ctx); + s = DeleteUnreferencedSstFiles(); } if (immutable_db_options_.paranoid_checks && s.ok()) { @@ -536,6 +535,10 @@ Status DBImpl::Recover( } } } + // DB mutex is already held + if (s.ok() && immutable_db_options_.persist_stats_to_disk) { + s = InitPersistStatsColumnFamily(); + } std::vector files_in_wal_dir; if (s.ok()) { @@ -605,10 +608,7 @@ Status DBImpl::Recover( WalNumber max_wal_number = versions_->GetWalSet().GetWals().rbegin()->first; edit.DeleteWalsBefore(max_wal_number + 1); - assert(recovery_ctx != nullptr); - assert(versions_->GetColumnFamilySet() != nullptr); - recovery_ctx->UpdateVersionEdits( - versions_->GetColumnFamilySet()->GetDefault(), edit); + s = versions_->LogAndApplyToDefaultColumnFamily(&edit, &mutex_); } if (!s.ok()) { return s; @@ -644,8 +644,8 @@ Status DBImpl::Recover( std::sort(wals.begin(), wals.end()); bool corrupted_wal_found = false; - s = RecoverLogFiles(wals, &next_sequence, read_only, &corrupted_wal_found, - recovery_ctx); + s = RecoverLogFiles(wals, &next_sequence, read_only, + &corrupted_wal_found); if (corrupted_wal_found && recovered_seq != nullptr) { *recovered_seq = next_sequence; } @@ -805,30 +805,10 @@ Status DBImpl::InitPersistStatsColumnFamily() { return s; } -Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { - mutex_.AssertHeld(); - assert(versions_->descriptor_log_ == nullptr); - Status s = versions_->LogAndApply( - recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, - recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir()); - if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) { - mutex_.Unlock(); - for (const auto& fname : recovery_ctx.files_to_delete_) { - s = env_->DeleteFile(fname); - if (!s.ok()) { - break; - } - } - mutex_.Lock(); - } - return s; -} - // REQUIRES: wal_numbers are sorted in ascending order -Status DBImpl::RecoverLogFiles(std::vector& wal_numbers, +Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, SequenceNumber* next_sequence, bool read_only, - bool* corrupted_wal_found, - RecoveryContext* recovery_ctx) { + bool* corrupted_wal_found) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; @@ -853,7 +833,6 @@ Status DBImpl::RecoverLogFiles(std::vector& wal_numbers, edit.SetColumnFamily(cfd->GetID()); version_edits.insert({cfd->GetID(), edit}); } - int job_id = next_job_id_.fetch_add(1); { auto stream = event_logger_.Log(); @@ -1277,7 +1256,6 @@ Status DBImpl::RecoverLogFiles(std::vector& wal_numbers, edit->SetLogNumber(max_wal_number + 1); } } - if (status.ok()) { // we must mark the next log number as used, even though it's // not actually used. 
that is because VersionSet assumes @@ -1285,40 +1263,42 @@ Status DBImpl::RecoverLogFiles(std::vector& wal_numbers, // log number versions_->MarkFileNumberUsed(max_wal_number + 1); - if (corrupted_wal_found != nullptr && *corrupted_wal_found == true && - immutable_db_options_.wal_recovery_mode == - WALRecoveryMode::kPointInTimeRecovery) { - MoveCorruptedWalFiles(wal_numbers, corrupted_wal_number); - } - - assert(recovery_ctx != nullptr); + autovector cfds; + autovector cf_opts; + autovector> edit_lists; for (auto* cfd : *versions_->GetColumnFamilySet()) { + cfds.push_back(cfd); + cf_opts.push_back(cfd->GetLatestMutableCFOptions()); auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); - recovery_ctx->UpdateVersionEdits(cfd, iter->second); + edit_lists.push_back({&iter->second}); } + std::unique_ptr wal_deletion; if (flushed) { - VersionEdit wal_deletion; + wal_deletion = std::make_unique(); if (immutable_db_options_.track_and_verify_wals_in_manifest) { - wal_deletion.DeleteWalsBefore(max_wal_number + 1); + wal_deletion->DeleteWalsBefore(max_wal_number + 1); } if (!allow_2pc()) { // In non-2pc mode, flushing the memtables of the column families // means we can advance min_log_number_to_keep. - wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1); + wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1); } - assert(versions_->GetColumnFamilySet() != nullptr); - recovery_ctx->UpdateVersionEdits( - versions_->GetColumnFamilySet()->GetDefault(), wal_deletion); + edit_lists.back().push_back(wal_deletion.get()); } + + // write MANIFEST with update + status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_, + directories_.GetDbDir(), + /*new_descriptor_log=*/true); } } if (status.ok()) { if (data_seen && !flushed) { status = RestoreAliveLogFiles(wal_numbers); - } else if (!wal_numbers.empty()) { + } else { // If there's no data in the WAL, or we flushed all the data, still // truncate the log file. If the process goes into a crash loop before // the file is deleted, the preallocated space will never get freed. @@ -1334,48 +1314,6 @@ Status DBImpl::RecoverLogFiles(std::vector& wal_numbers, return status; } -void DBImpl::MoveCorruptedWalFiles(std::vector& wal_numbers, - uint64_t corrupted_wal_number) { - size_t num_wals = wal_numbers.size(); - // Find the first corrupted wal. - auto iter = std::lower_bound(wal_numbers.begin(), wal_numbers.end(), - corrupted_wal_number); - auto corrupt_start_iter = iter; - - // Increment iter to move WAL files from first corrupted_wal_number + 1. - iter++; - - std::string archival_path = - ArchivalDirectory(immutable_db_options_.GetWalDir()); - Status create_status = env_->CreateDirIfMissing(archival_path); - - // create_status is only checked when it needs to move the corrupted WAL files - // to archive folder. - create_status.PermitUncheckedError(); - - // Truncate the last WAL to reclaim the pre allocated space before - // moving it. - GetLogSizeAndMaybeTruncate(wal_numbers.back(), /*truncate=*/true, nullptr) - .PermitUncheckedError(); - - // Move all the WAL files from corrupted_wal_number + 1 to last WAL - // (max_wal_number) to avoid column family inconsistency error to archival - // directory. If its unable to create archive dir, it will delete the - // corrupted WAL files. - // We are moving all but first corrupted WAL file to a different folder. 
- while (iter != wal_numbers.end()) { - LogFileNumberSize log(*iter); - std::string fname = LogFileName(immutable_db_options_.GetWalDir(), *iter); -#ifndef ROCKSDB_LITE - if (create_status.ok()) { - wal_manager_.ArchiveWALFile(fname, *iter); - } -#endif - iter++; - } - wal_numbers.erase(corrupt_start_iter + 1, wal_numbers.begin() + num_wals); -} - Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, LogFileNumberSize* log_ptr) { LogFileNumberSize log(wal_number); @@ -1438,8 +1376,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { // log has such preallocated space, so we only truncate for the last log. LogFileNumberSize log; s = GetLogSizeAndMaybeTruncate( - wal_number, - /*truncate=*/(wal_number == wal_numbers.back()), &log); + wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log); if (!s.ok()) { break; } @@ -1764,7 +1701,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, std::max(max_write_buffer_size, cf.options.write_buffer_size); } - DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); + std::unique_ptr impl{ + new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn)}; s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir()); if (s.ok()) { std::vector paths; @@ -1793,20 +1731,15 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->CreateArchivalDirectory(); } if (!s.ok()) { - delete impl; return s; } impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); impl->mutex_.Lock(); - - RecoveryContext recovery_ctx; - // Handles create_if_missing, error_if_exists uint64_t recovered_seq(kMaxSequenceNumber); - s = impl->Recover(column_families, false, false, false, &recovered_seq, - &recovery_ctx); + s = impl->Recover(column_families, false, false, false, &recovered_seq); if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); log::Writer* new_log = nullptr; @@ -1823,6 +1756,40 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd != nullptr) { + handles->push_back( + new ColumnFamilyHandleImpl(cfd, impl.get(), &impl->mutex_)); + impl->NewThreadStatusCfInfo(cfd); + } else { + if (db_options.create_missing_column_families) { + // missing column family, create it + ColumnFamilyHandle* handle; + impl->mutex_.Unlock(); + s = impl->CreateColumnFamily(cf.options, cf.name, &handle); + impl->mutex_.Lock(); + if (s.ok()) { + handles->push_back(handle); + } else { + break; + } + } else { + s = Status::InvalidArgument("Column family not found", cf.name); + break; + } + } + } + } + if (s.ok()) { + SuperVersionContext sv_context(/* create_superversion */ true); + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + impl->InstallSuperVersionAndScheduleWork( + cfd, &sv_context, *cfd->GetLatestMutableCFOptions()); + } + sv_context.Clean(); if (impl->two_write_queues_) { impl->log_write_mutex_.Lock(); } @@ -1835,15 +1802,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { // In WritePrepared there could be gap in sequence numbers. This breaks - // the trick we use in kPointInTimeRecovery which assumes the first seq - // in the log right after the corrupted log is one larger than the last - // seq we read from the wals. 
To let this trick keep working, we add a - // dummy entry with the expected sequence to the first log right after - // recovery. In non-WritePrepared case also the new log after recovery - // could be empty, and thus missing the consecutive seq hint to - // distinguish middle-log corruption to - // corrupted-log-remained-after-recovery. This case also will be - // addressed by a dummy write. + // the trick we use in kPointInTimeRecovery which assumes the first seq in + // the log right after the corrupted log is one larger than the last seq + // we read from the wals. To let this trick keep working, we add a dummy + // entry with the expected sequence to the first log right after recovery. + // In non-WritePrepared case also the new log after recovery could be + // empty, and thus missing the consecutive seq hint to distinguish + // middle-log corruption to corrupted-log-remained-after-recovery. This + // case also will be addressed by a dummy write. if (recovered_seq != kMaxSequenceNumber) { WriteBatch empty_batch; WriteBatchInternal::SetSequence(&empty_batch, recovered_seq); @@ -1862,52 +1828,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } } } - if (s.ok()) { - s = impl->LogAndApplyForRecovery(recovery_ctx); - } - - if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { - impl->mutex_.AssertHeld(); - s = impl->InitPersistStatsColumnFamily(); - } - - if (s.ok()) { - // set column family handles - for (auto cf : column_families) { - auto cfd = - impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); - if (cfd != nullptr) { - handles->push_back( - new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); - impl->NewThreadStatusCfInfo(cfd); - } else { - if (db_options.create_missing_column_families) { - // missing column family, create it - ColumnFamilyHandle* handle; - impl->mutex_.Unlock(); - s = impl->CreateColumnFamily(cf.options, cf.name, &handle); - impl->mutex_.Lock(); - if (s.ok()) { - handles->push_back(handle); - } else { - break; - } - } else { - s = Status::InvalidArgument("Column family not found", cf.name); - break; - } - } - } - } - - if (s.ok()) { - SuperVersionContext sv_context(/* create_superversion */ true); - for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - impl->InstallSuperVersionAndScheduleWork( - cfd, &sv_context, *cfd->GetLatestMutableCFOptions()); - } - sv_context.Clean(); - } if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { // try to read format version s = impl->PersistentStatsProcessFormatVersion(); @@ -1933,8 +1853,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (cfd->ioptions()->merge_operator != nullptr && !cfd->mem()->IsMergeOperatorSupported()) { s = Status::InvalidArgument( - "The memtable of column family %s does not support merge " - "operator " + "The memtable of column family %s does not support merge operator " "its options.merge_operator is non-null", cfd->GetName().c_str()); } @@ -1951,7 +1870,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, persist_options_status = impl->WriteOptionsFile( false /*need_mutex_lock*/, false /*need_enter_write_thread*/); - *dbptr = impl; impl->opened_successfully_ = true; impl->DeleteObsoleteFiles(); TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles"); @@ -2042,7 +1960,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p", - impl); + impl.get()); 
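      // (Editorial note, not part of the patch.) Holding `impl` in a
      // std::unique_ptr<DBImpl> means the early `return s;` error paths of
      // DB::Open() now free the half-constructed DBImpl automatically, which
      // is why the explicit `delete impl; *dbptr = nullptr;` cleanup is
      // dropped below; ownership reaches the caller only through
      // `*dbptr = impl.release()` near the end of a successful open.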
LogFlush(impl->immutable_db_options_.info_log); assert(impl->TEST_WALBufferIsEmpty()); // If the assert above fails then we need to FlushWAL before returning @@ -2059,14 +1977,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { s = impl->StartPeriodicWorkScheduler(); + *dbptr = impl.release(); } if (!s.ok()) { for (auto* h : *handles) { delete h; } handles->clear(); - delete impl; - *dbptr = nullptr; } return s; } diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index d1a8709dac..1e3c9f2ac5 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -33,8 +33,7 @@ DBImplSecondary::~DBImplSecondary() {} Status DBImplSecondary::Recover( const std::vector& column_families, bool /*readonly*/, bool /*error_if_wal_file_exists*/, - bool /*error_if_data_exists_in_wals*/, uint64_t*, - RecoveryContext* /*recovery_ctx*/) { + bool /*error_if_data_exists_in_wals*/, uint64_t*) { mutex_.AssertHeld(); JobContext job_context(0); diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index fcc86cc879..d3a7940b5b 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -81,8 +81,8 @@ class DBImplSecondary : public DBImpl { // and log_readers_ to facilitate future operations. Status Recover(const std::vector& column_families, bool read_only, bool error_if_wal_file_exists, - bool error_if_data_exists_in_wals, uint64_t* = nullptr, - RecoveryContext* recovery_ctx = nullptr) override; + bool error_if_data_exists_in_wals, + uint64_t* = nullptr) override; // Implementations of the DB interface using DB::Get; diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 39657d4623..9fc99b8932 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1028,6 +1028,68 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) { } } +namespace { + +std::unique_ptr SetupDelayFromFactor( + WriteController& write_controller, uint64_t delay_factor) { + assert(delay_factor > 0U); + constexpr uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s. + + auto max_write_rate = write_controller.max_delayed_write_rate(); + + auto wbm_write_rate = max_write_rate; + if (max_write_rate >= kMinWriteRate) { + // If user gives rate less than kMinWriteRate, don't adjust it. 
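    // (Editorial sketch, derived from the formula below with the
    // WriteBufferManager constants left symbolic; not part of the patch.)
    //   delay_factor == kMinDelayedWriteFactor
    //     -> write_rate_factor == kMaxDelayedWriteFactor / kMaxDelayedWriteFactor == 1.0
    //     -> wbm_write_rate == max_delayed_write_rate (token taken, but at full rate)
    //   delay_factor == kMaxDelayedWriteFactor
    //     -> write_rate_factor == kMinDelayedWriteFactor / kMaxDelayedWriteFactor
    //     -> wbm_write_rate shrinks to that fraction of max_delayed_write_rate
    // i.e. the WBM delay factor maps linearly onto [kMin/kMax, 1.0] of the
    // controller's max delayed write rate.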
+ assert(delay_factor <= WriteBufferManager::kMaxDelayedWriteFactor); + auto write_rate_factor = + static_cast(WriteBufferManager::kMaxDelayedWriteFactor + + WriteBufferManager::kMinDelayedWriteFactor - + delay_factor) / + WriteBufferManager::kMaxDelayedWriteFactor; + wbm_write_rate = max_write_rate * write_rate_factor; + } + + return write_controller.GetDelayToken(WriteController::DelaySource::kWBM, + wbm_write_rate); +} + +} // namespace + +void DBImpl::HandleWBMDelayWritesDuringPreprocessWrite() { + auto [new_usage_state, new_delayed_write_factor] = + write_buffer_manager_->GetUsageStateInfo(); + + if (UNLIKELY((wbm_spdb_usage_state_ != new_usage_state) || + (wbm_spdb_delayed_write_factor_ != new_delayed_write_factor))) { + if (new_usage_state != WriteBufferManager::UsageState::kDelay) { + write_controller_token_.reset(); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Reset WBM Delay Token needs-delay:%d, rate:%lu", + write_controller_.NeedsDelay(), + write_controller_.delayed_write_rate()); + } else if ((wbm_spdb_usage_state_ != + WriteBufferManager::UsageState::kDelay) || + (wbm_spdb_delayed_write_factor_ != new_delayed_write_factor)) { + write_controller_token_ = + SetupDelayFromFactor(write_controller_, new_delayed_write_factor); + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Delaying writes due to WBM's usage relative to quota " + "which is %" PRIu64 "%%(%" PRIu64 "/%" PRIu64 + "). " + "factor:%" PRIu64 ", wbm-rate:%" PRIu64 ", rate:%" PRIu64, + write_buffer_manager_->GetMemoryUsagePercentageOfBufferSize(), + write_buffer_manager_->memory_usage(), + write_buffer_manager_->buffer_size(), new_delayed_write_factor, + write_controller_.delayed_write_rate( + WriteController::DelaySource::kWBM), + write_controller_.delayed_write_rate()); + } + wbm_spdb_usage_state_ = new_usage_state; + wbm_spdb_delayed_write_factor_ = new_delayed_write_factor; + } +} + Status DBImpl::PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync, WriteContext* write_context) { @@ -1071,6 +1133,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, PERF_TIMER_STOP(write_scheduling_flushes_compactions_time); PERF_TIMER_GUARD(write_pre_and_post_process_time); + // Handle latest WBM calculated write delay, if applicable + if (status.ok() && write_buffer_manager_ && + write_buffer_manager_->IsDelayAllowed()) { + HandleWBMDelayWritesDuringPreprocessWrite(); + } + if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || write_controller_.NeedsDelay()))) { PERF_TIMER_STOP(write_pre_and_post_process_time); diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 360a3c561e..d25cffb184 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -615,6 +615,9 @@ TEST_F(DBOptionsTest, SetBackgroundJobs) { Options options; options.create_if_missing = true; options.max_background_jobs = 8; + options.max_background_compactions = options.max_background_flushes = -1; + env_->SetBackgroundThreads(1, Env::Priority::HIGH); + env_->SetBackgroundThreads(1, Env::Priority::LOW); options.env = env_; Reopen(options); diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 8451143392..cf13a0b2ae 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -638,6 +638,8 @@ TEST_F(DBRangeDelTest, TableEvictedDuringScan) { bbto.cache_index_and_filter_blocks = true; bbto.block_cache = NewLRUCache(8 << 20); opts.table_factory.reset(NewBlockBasedTableFactory(bbto)); + opts.max_background_compactions = 1; + env_->SetBackgroundThreads(1, Env::Priority::LOW); 
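The rate computation in SetupDelayFromFactor above maps the WriteBufferManager's delay factor linearly onto the controller's maximum delayed write rate. A standalone sketch of that formula follows; the factor bounds are assumptions for illustration (the real values are WriteBufferManager::kMinDelayedWriteFactor and kMaxDelayedWriteFactor, which are not shown in this hunk), while the 16KB/s floor comes from the hunk itself.

```cpp
#include <cstdint>
#include <iostream>

constexpr uint64_t kMinDelayedWriteFactor = 1;    // assumed for illustration
constexpr uint64_t kMaxDelayedWriteFactor = 200;  // assumed for illustration
constexpr uint64_t kMinWriteRate = 16 * 1024;     // 16KB/s floor, as in the hunk

uint64_t WbmDelayedWriteRate(uint64_t max_delayed_write_rate,
                             uint64_t delay_factor) {
  if (max_delayed_write_rate < kMinWriteRate) {
    // A user-configured rate already below the floor is left untouched.
    return max_delayed_write_rate;
  }
  // factor == kMin -> rate stays at ~max; factor == kMax -> rate shrinks to
  // max * kMin / kMax, i.e. the delay grows as memory pressure grows.
  double scale = static_cast<double>(kMaxDelayedWriteFactor +
                                     kMinDelayedWriteFactor - delay_factor) /
                 kMaxDelayedWriteFactor;
  return static_cast<uint64_t>(max_delayed_write_rate * scale);
}

int main() {
  std::cout << WbmDelayedWriteRate(64 << 20, kMinDelayedWriteFactor) << "\n";  // ~full rate
  std::cout << WbmDelayedWriteRate(64 << 20, kMaxDelayedWriteFactor) << "\n";  // heavily throttled
  return 0;
}
```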
DestroyAndReopen(opts); // Hold a snapshot so range deletions can't become obsolete during compaction diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index fac924d311..b57e46ac51 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -276,7 +276,7 @@ TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) { ASSERT_EQ(metadata.size(), 2U); // This file should have been deleted during last compaction - ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2)); + ASSERT_TRUE(env_->FileExists(dbname_ + file_on_L2).IsNotFound()); listener->VerifyMatchedCount(1); } diff --git a/db/db_test.cc b/db/db_test.cc index a8f74d8ca4..9f4e748145 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -235,7 +235,8 @@ TEST_F(DBTest, SkipDelay) { // when we do Put // TODO(myabandeh): this is time dependent and could potentially make // the test flaky - auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + auto token = dbfull()->TEST_write_controler().GetDelayToken( + WriteController::DelaySource::kCF, 1); std::atomic sleep_count(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::DelayWrite:Sleep", @@ -262,7 +263,8 @@ TEST_F(DBTest, SkipDelay) { ASSERT_GE(wait_count.load(), 0); token.reset(); - token = dbfull()->TEST_write_controler().GetDelayToken(1000000); + token = dbfull()->TEST_write_controler().GetDelayToken( + WriteController::DelaySource::kCF, 1000000); wo.no_slowdown = false; ASSERT_OK(dbfull()->Put(wo, "foo3", large_value)); ASSERT_GE(sleep_count.load(), 1); @@ -297,7 +299,8 @@ TEST_F(DBTest, MixedSlowdownOptions) { // when we do Put // TODO(myabandeh): this is time dependent and could potentially make // the test flaky - auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + auto token = dbfull()->TEST_write_controler().GetDelayToken( + WriteController::DelaySource::kCF, 1); std::atomic sleep_count(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) { @@ -351,7 +354,8 @@ TEST_F(DBTest, MixedSlowdownOptionsInQueue) { // when we do Put // TODO(myabandeh): this is time dependent and could potentially make // the test flaky - auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + auto token = dbfull()->TEST_write_controler().GetDelayToken( + WriteController::DelaySource::kCF, 1); std::atomic sleep_count(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) { @@ -706,6 +710,34 @@ TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) { ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument()); } +TEST_F(DBTest, GetFromBlockCacheWithDisabledCache) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + + const std::string key = "key"; + const std::string value = "value"; + + ASSERT_OK(Put(key, value)); + ASSERT_OK(Flush()); + + std::string result; + ASSERT_OK(db_->Get(ReadOptions(), key, &result)); + ASSERT_EQ(result, value); + result.clear(); + + // Disallow I/O + ReadOptions read_options; + read_options.read_tier = kBlockCacheTier; + + Status s = db_->Get(read_options, key, &result); + ASSERT_TRUE(result.empty()); + ASSERT_TRUE(s.IsIncomplete()); +} + // Disable because not all platform can run it. // It requires more than 9GB memory to run it, With single allocation // of more than 3GB. 
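The new GetFromBlockCacheWithDisabledCache test above relies on the public read-tier behaviour: with ReadOptions::read_tier set to kBlockCacheTier, a read that would have to touch an SST file returns Status::Incomplete() instead of doing I/O. A small sketch using the public RocksDB/Speedb API (path and assertions are illustrative):

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  assert(rocksdb::DB::Open(options, "/tmp/block_cache_tier_demo", &db).ok());

  assert(db->Put(rocksdb::WriteOptions(), "key", "value").ok());
  assert(db->Flush(rocksdb::FlushOptions()).ok());  // push the key into an SST

  rocksdb::ReadOptions cache_only;
  cache_only.read_tier = rocksdb::kBlockCacheTier;  // disallow I/O

  std::string result;
  rocksdb::Status s = db->Get(cache_only, "key", &result);
  // If the data block is not resident in the block cache, the read reports
  // Incomplete rather than fetching from disk.
  assert(s.IsIncomplete() || s.ok());

  delete db;
  return 0;
}
```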
@@ -4130,9 +4162,6 @@ TEST_F(DBTest, ConcurrentMemtableNotSupported) { TEST_F(DBTest, SanitizeNumThreads) { for (int attempt = 0; attempt < 2; attempt++) { - const size_t kTotalTasks = 8; - test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; - Options options = CurrentOptions(); if (attempt == 0) { options.max_background_compactions = 3; @@ -4141,11 +4170,17 @@ TEST_F(DBTest, SanitizeNumThreads) { options.create_if_missing = true; DestroyAndReopen(options); - for (size_t i = 0; i < kTotalTasks; i++) { + const size_t low_task_count = + options.env->GetBackgroundThreads(Env::Priority::LOW) + 1; + const size_t high_task_count = + options.env->GetBackgroundThreads(Env::Priority::HIGH) + 2; + std::vector sleeping_tasks(low_task_count + + high_task_count); + for (size_t i = 0; i < sleeping_tasks.size(); ++i) { // Insert 5 tasks to low priority queue and 5 tasks to high priority queue - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_tasks[i], - (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH); + env_->Schedule( + &test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks[i], + (i < low_task_count) ? Env::Priority::LOW : Env::Priority::HIGH); } // Wait until 10s for they are scheduled. @@ -4162,9 +4197,9 @@ TEST_F(DBTest, SanitizeNumThreads) { // pool size 2, total task 4. Queue size should be 2. ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH)); - for (size_t i = 0; i < kTotalTasks; i++) { - sleeping_tasks[i].WakeUp(); - sleeping_tasks[i].WaitUntilDone(); + for (auto& task : sleeping_tasks) { + task.WakeUp(); + task.WaitUntilDone(); } ASSERT_OK(Put("abc", "def")); @@ -5216,10 +5251,13 @@ TEST_F(DBTest, DynamicCompactionOptions) { ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Block compaction - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - sleeping_task_low.WaitUntilSleeping(); + std::vector sleeping_task_low( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& sleeping_task : sleeping_task_low) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + sleeping_task.WaitUntilSleeping(); + } ASSERT_EQ(NumTableFilesAtLevel(0), 0); int count = 0; Random rnd(301); @@ -5229,14 +5267,18 @@ TEST_F(DBTest, DynamicCompactionOptions) { ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); count++; if (dbfull()->TEST_write_controler().IsStopped()) { - sleeping_task_low.WakeUp(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WakeUp(); + } break; } } // Stop trigger = 8 ASSERT_EQ(count, 8); // Unblock - sleeping_task_low.WaitUntilDone(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WaitUntilDone(); + } // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. // Block compaction thread again. 
Perform the put and memtable flushes @@ -5247,23 +5289,29 @@ TEST_F(DBTest, DynamicCompactionOptions) { ASSERT_EQ(NumTableFilesAtLevel(0), 0); // Block compaction again - sleeping_task_low.Reset(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - sleeping_task_low.WaitUntilSleeping(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + sleeping_task.WaitUntilSleeping(); + } count = 0; while (count < 64) { ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo)); ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); count++; if (dbfull()->TEST_write_controler().IsStopped()) { - sleeping_task_low.WakeUp(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WakeUp(); + } break; } } ASSERT_EQ(count, 6); // Unblock - sleeping_task_low.WaitUntilDone(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WaitUntilDone(); + } // Test disable_auto_compactions // Compaction thread is unblocked but auto compaction is disabled. Write @@ -6525,11 +6573,14 @@ TEST_F(DBTest, SoftLimit) { ASSERT_OK(Put(Key(0), "")); - test::SleepingBackgroundTask sleeping_task_low; + std::vector sleeping_task_low( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); // Block compactions - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - sleeping_task_low.WaitUntilSleeping(); + for (auto& sleeping_task : sleeping_task_low) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + sleeping_task.WaitUntilSleeping(); + } // Create 3 L0 files, making score of L0 to be 3. for (int i = 0; i < 3; i++) { @@ -6543,9 +6594,11 @@ TEST_F(DBTest, SoftLimit) { ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); - sleeping_task_low.Reset(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + sleeping_task.Reset(); + } ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Now there is one L1 file but doesn't trigger soft_rate_limit @@ -6562,14 +6615,16 @@ TEST_F(DBTest, SoftLimit) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void* /*arg*/) { // Schedule a sleeping task. - sleeping_task_low.Reset(); + sleeping_task_low[0].Reset(); env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_low, Env::Priority::LOW); + &sleeping_task_low[0], Env::Priority::LOW); }); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - sleeping_task_low.WaitUntilSleeping(); + for (auto& sleeping_task : sleeping_task_low) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + sleeping_task.WaitUntilSleeping(); + } // Create 3 L0 files, making score of L0 to be 3 for (int i = 0; i < 3; i++) { ASSERT_OK(Put(Key(10 + i), std::string(5000, 'x'))); @@ -6583,8 +6638,8 @@ TEST_F(DBTest, SoftLimit) { // Wake up sleep task to enable compaction to run and waits // for it to go to sleep state again to make sure one compaction // goes through. 
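The recurring change in DynamicCompactionOptions and SoftLimit above is to block background work by scheduling one sleeping task per thread in the LOW-priority pool (sized from env_->GetBackgroundThreads) instead of assuming a single-threaded pool. A standalone illustration of that pattern using plain std::thread, not the RocksDB test::SleepingBackgroundTask helper:

```cpp
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

class SleepingTask {
 public:
  void Sleep() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return woken_; });
  }
  void WakeUp() {
    { std::lock_guard<std::mutex> lk(mu_); woken_ = true; }
    cv_.notify_all();
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  bool woken_ = false;
};

int main() {
  const int num_low_pri_threads = 4;  // stand-in for env->GetBackgroundThreads(LOW)
  std::vector<SleepingTask> tasks(num_low_pri_threads);
  std::vector<std::thread> pool;

  // One sleeping task per worker: every thread in the "pool" is now occupied,
  // so no background compaction or purge could run until the tasks are woken.
  for (auto& task : tasks) {
    pool.emplace_back([&task] { task.Sleep(); });
  }

  // ... test body would run here with background work guaranteed blocked ...

  for (auto& task : tasks) task.WakeUp();
  for (auto& t : pool) t.join();
  return 0;
}
```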
- sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilSleeping(); + sleeping_task_low[0].WakeUp(); + sleeping_task_low[0].WaitUntilSleeping(); // Now there is one L1 file (around 60KB) which exceeds 50KB base by 10KB // Given level multiplier 10, estimated pending compaction is around 100KB @@ -6605,8 +6660,8 @@ TEST_F(DBTest, SoftLimit) { // Wake up sleep task to enable compaction to run and waits // for it to go to sleep state again to make sure one compaction // goes through. - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilSleeping(); + sleeping_task_low[0].WakeUp(); + sleeping_task_low[0].WaitUntilSleeping(); // Now there is one L1 file (around 90KB) which exceeds 50KB base by 40KB // L2 size is 360KB, so the estimated level fanout 4, estimated pending @@ -6616,8 +6671,8 @@ TEST_F(DBTest, SoftLimit) { ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilSleeping(); + sleeping_task_low[0].WakeUp(); + sleeping_task_low[0].WaitUntilSleeping(); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal)); @@ -6632,10 +6687,12 @@ TEST_F(DBTest, SoftLimit) { ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); - sleeping_task_low.WaitUntilSleeping(); + sleeping_task_low[0].WaitUntilSleeping(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + for (auto& sleeping_task : sleeping_task_low) { + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + } } TEST_F(DBTest, LastWriteBufferDelay) { @@ -6744,7 +6801,7 @@ TEST_F(DBTest, PinnableSliceAndRowCache) { { PinnableSlice pin_slice; - ASSERT_EQ(Get("foo", &pin_slice), Status::OK()); + ASSERT_OK(Get("foo", &pin_slice)); ASSERT_EQ(pin_slice.ToString(), "bar"); // Entry is already in cache, lookup will remove the element from lru ASSERT_EQ( @@ -6957,7 +7014,7 @@ TEST_F(DBTest, CreationTimeOfOldestFile) { uint64_t creation_time; Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time); ASSERT_EQ(0, creation_time); - ASSERT_EQ(s1, Status::OK()); + ASSERT_OK(s1); // Testing with non-zero file creation time. 
set_file_creation_time_to_zero = false; @@ -6982,14 +7039,14 @@ TEST_F(DBTest, CreationTimeOfOldestFile) { uint64_t ctime; Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime); ASSERT_EQ(uint_time_1, ctime); - ASSERT_EQ(s2, Status::OK()); + ASSERT_OK(s2); // Testing with max_open_files != -1 options = CurrentOptions(); options.max_open_files = 10; DestroyAndReopen(options); Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime); - ASSERT_EQ(s3, Status::NotSupported()); + ASSERT_TRUE(s3.IsNotSupported()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } diff --git a/db/db_test2.cc b/db/db_test2.cc index 4ff979962f..11239115fb 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5218,7 +5218,7 @@ TEST_F(DBTest2, PinnableSliceAndMmapReads) { ASSERT_OK(Flush()); PinnableSlice pinned_value; - ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); + ASSERT_OK(Get("foo", &pinned_value)); // It is not safe to pin mmap files as they might disappear by compaction ASSERT_FALSE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); @@ -5236,7 +5236,7 @@ TEST_F(DBTest2, PinnableSliceAndMmapReads) { // Unsafe to pin mmap files when they could be kicked out of table cache Close(); ASSERT_OK(ReadOnlyReopen(options)); - ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); + ASSERT_OK(Get("foo", &pinned_value)); ASSERT_FALSE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); @@ -5246,7 +5246,7 @@ TEST_F(DBTest2, PinnableSliceAndMmapReads) { Close(); options.max_open_files = -1; ASSERT_OK(ReadOnlyReopen(options)); - ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); + ASSERT_OK(Get("foo", &pinned_value)); ASSERT_TRUE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); #endif @@ -5896,10 +5896,12 @@ TEST_F(DBTest2, BackgroundPurgeTest) { size_t value = options.write_buffer_manager->memory_usage(); ASSERT_GT(value, base_value); - db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH); + // Take up a slot in the low priority pool + // in order to prevent a purge from running when the iterator is deleted. 
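The assertion changes above (ASSERT_OK, IsNotSupported, IsNotFound, and similar) all converge on the same idiom: check the property of a Status rather than comparing it for equality against a freshly constructed one, which couples the test to whatever subcode or message the implementation happens to attach. A small sketch of the preferred form:

```cpp
#include <cassert>
#include "rocksdb/status.h"

int main() {
  rocksdb::Status s = rocksdb::Status::NotFound("key missing in /some/path");

  // Brittle: equality ties the assertion to the exact code/subcode/message.
  //   assert(s == rocksdb::Status::NotFound());
  // Robust: check only the property the caller actually cares about.
  assert(s.IsNotFound());
  assert(!s.ok());
  return 0;
}
```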
+ db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::LOW); test::SleepingBackgroundTask sleeping_task_after; db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_after, Env::Priority::HIGH); + &sleeping_task_after, Env::Priority::LOW); delete iter; Env::Default()->SleepForMicroseconds(100000); @@ -5911,7 +5913,7 @@ TEST_F(DBTest2, BackgroundPurgeTest) { test::SleepingBackgroundTask sleeping_task_after2; db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_after2, Env::Priority::HIGH); + &sleeping_task_after2, Env::Priority::LOW); sleeping_task_after2.WakeUp(); sleeping_task_after2.WaitUntilDone(); diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 2cae1b6eff..af62858533 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -965,8 +965,7 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { bool first = true; while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) != - Status::OK()) { + if (!ParseInternalKey(iter->key(), &ikey, true /* log_err_key */).ok()) { result += "CORRUPTED"; } else { if (!last_options_.comparator->Equal(ikey.user_key, user_key)) { @@ -1515,7 +1514,9 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, ASSERT_EQ(Get(kv.first), kv.second); } else { std::string value; - ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value)); + Status ref_s = db_->Get(ReadOptions(), kv.first, &value); + ASSERT_EQ(s.code(), ref_s.code()); + ASSERT_EQ(s.subcode(), ref_s.subcode()); } total_reads++; } @@ -1536,7 +1537,8 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, if (!current_status.ok()) { s = current_status; } - ASSERT_EQ(iter->status(), s); + ASSERT_EQ(iter->status().code(), s.code()); + ASSERT_EQ(iter->status().subcode(), s.subcode()); if (current_status.ok()) { ASSERT_EQ(iter->value().ToString(), data_iter->second); } @@ -1559,7 +1561,8 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, if (!current_status.ok()) { s = current_status; } - ASSERT_EQ(iter->status(), s); + ASSERT_EQ(iter->status().code(), s.code()); + ASSERT_EQ(iter->status().subcode(), s.subcode()); if (current_status.ok()) { ASSERT_EQ(iter->value().ToString(), data_rev->second); } diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 9a953a178a..cd7ce9e7d6 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -37,7 +37,7 @@ class DBWALTestBase : public DBTestBase { int alloc_status = fallocate(fd, 0, 0, 1); int err_number = errno; close(fd); - assert(env_->DeleteFile(fname_test_fallocate) == Status::OK()); + assert(env_->DeleteFile(fname_test_fallocate).ok()); if (err_number == ENOSYS || err_number == EOPNOTSUPP) { fprintf(stderr, "Skipped preallocated space check: %s\n", errnoStr(err_number).c_str()); @@ -1496,6 +1496,9 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { options.track_and_verify_wals_in_manifest = true; // The following make sure there are two bg flush threads. 
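The VerifyDBFromMap change above covers the case where the expected status is itself a variable, so an IsXxx() check is not available; there the test now compares code and subcode explicitly, letting differing messages or state strings pass. A short sketch of that comparison using the public Status accessors:

```cpp
#include <cassert>
#include "rocksdb/status.h"

bool SameOutcome(const rocksdb::Status& a, const rocksdb::Status& b) {
  return a.code() == b.code() && a.subcode() == b.subcode();
}

int main() {
  rocksdb::Status expected = rocksdb::Status::NotFound();
  rocksdb::Status actual = rocksdb::Status::NotFound("missing key: foo");
  assert(SameOutcome(expected, actual));  // messages differ, outcome matches
  return 0;
}
```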
options.max_background_jobs = 8; + options.max_background_compactions = options.max_background_flushes = -1; + env_->SetBackgroundThreads(1, Env::Priority::HIGH); + env_->SetBackgroundThreads(1, Env::Priority::LOW); const std::string cf1_name("cf1"); CreateAndReopenWithCF({cf1_name}, options); diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index 0c14e43338..93d954a98c 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -520,30 +520,30 @@ TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) { std::string ts_low_str_back = Timestamp(8, 0); auto s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str_back); - ASSERT_EQ(s, Status::InvalidArgument()); + ASSERT_TRUE(s.IsInvalidArgument()); // test IncreaseFullHistoryTsLow with a timestamp whose length is longger // than the cf's timestamp size std::string ts_low_str_long(Timestamp(0, 0).size() + 1, 'a'); s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str_long); - ASSERT_EQ(s, Status::InvalidArgument()); + ASSERT_TRUE(s.IsInvalidArgument()); // test IncreaseFullHistoryTsLow with a timestamp which is null std::string ts_low_str_null = ""; s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str_null); - ASSERT_EQ(s, Status::InvalidArgument()); + ASSERT_TRUE(s.IsInvalidArgument()); // test IncreaseFullHistoryTsLow for a column family that does not enable // timestamp options.comparator = BytewiseComparator(); DestroyAndReopen(options); ts_low_str = Timestamp(10, 0); s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str); - ASSERT_EQ(s, Status::InvalidArgument()); + ASSERT_TRUE(s.IsInvalidArgument()); // test GetFullHistoryTsLow for a column family that does not enable // timestamp std::string current_ts_low; s = db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), ¤t_ts_low); - ASSERT_EQ(s, Status::InvalidArgument()); + ASSERT_TRUE(s.IsInvalidArgument()); Close(); } @@ -719,7 +719,8 @@ TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) { ropts.timestamp = &ts; std::string value; Status s = db->Get(ropts, key, &value); - ASSERT_TRUE(s == status); + ASSERT_EQ(s.code(), status.code()); + ASSERT_EQ(s.subcode(), status.subcode()); if (s.ok()) { ASSERT_EQ(checkValue, value); } diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc index c1e8f7100e..af723d193b 100644 --- a/db/db_write_buffer_manager_test.cc +++ b/db/db_write_buffer_manager_test.cc @@ -14,10 +14,12 @@ namespace ROCKSDB_NAMESPACE { class DBWriteBufferManagerTest : public DBTestBase, - public testing::WithParamInterface { + public ::testing::WithParamInterface { public: DBWriteBufferManagerTest() : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {} + + void SetUp() override { cost_cache_ = GetParam(); } bool cost_cache_; }; @@ -27,7 +29,6 @@ TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) { options.write_buffer_size = 500000; // this is never hit std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); ASSERT_LT(cache->GetUsage(), 256 * 1024); - cost_cache_ = GetParam(); if (cost_cache_) { options.write_buffer_manager.reset( @@ -70,7 +71,6 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { options.write_buffer_size = 500000; // this is never hit std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); ASSERT_LT(cache->GetUsage(), 256 * 1024); - cost_cache_ = GetParam(); if (cost_cache_) { options.write_buffer_manager.reset( @@ -197,7 +197,6 @@ 
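The DBWriteBufferManagerTest fixture change above moves the repeated `cost_cache_ = GetParam();` line from every test body into a SetUp() override. A minimal gtest sketch of that pattern; the real fixture also derives from DBTestBase, which is omitted here:

```cpp
#include <gtest/gtest.h>

class WbmParamTest : public ::testing::TestWithParam<bool> {
 protected:
  void SetUp() override { cost_cache_ = GetParam(); }
  bool cost_cache_ = false;
};

TEST_P(WbmParamTest, UsesParam) {
  // cost_cache_ is already initialized for every parameterized instance.
  SUCCEED() << "cost_cache_ = " << cost_cache_;
}

INSTANTIATE_TEST_CASE_P(WbmParamTest, WbmParamTest, ::testing::Bool());
```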
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { options.write_buffer_size = 500000; // this is never hit std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); ASSERT_LT(cache->GetUsage(), 256 * 1024); - cost_cache_ = GetParam(); if (cost_cache_) { options.write_buffer_manager.reset( @@ -314,7 +313,6 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { options.write_buffer_size = 500000; // this is never hit std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); ASSERT_LT(cache->GetUsage(), 256 * 1024); - cost_cache_ = GetParam(); if (cost_cache_) { options.write_buffer_manager.reset( @@ -456,7 +454,6 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { options.write_buffer_size = 500000; // this is never hit std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); ASSERT_LT(cache->GetUsage(), 256 * 1024); - cost_cache_ = GetParam(); if (cost_cache_) { options.write_buffer_manager.reset( @@ -618,7 +615,6 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { options.write_buffer_size = 500000; // this is never hit std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); ASSERT_LT(cache->GetUsage(), 256 * 1024); - cost_cache_ = GetParam(); if (cost_cache_) { options.write_buffer_manager.reset( @@ -780,9 +776,133 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +class DBWriteBufferManagerTest1 : public DBTestBase, + public ::testing::WithParamInterface { + public: + DBWriteBufferManagerTest1() + : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {} + + void SetUp() override { cost_cache_ = GetParam(); } + bool cost_cache_; +}; + +TEST_P(DBWriteBufferManagerTest1, WbmDelaySharedWriteBufferAcrossCFs) { + constexpr size_t kQuota = 100 * 1000; + constexpr size_t kDelayThreshold = + WriteBufferManager::kStartDelayPercentThreshold * kQuota / 100; + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = kQuota; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(kQuota, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(kQuota, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + + // Reach the delay threshold by writing to two cf-s, no flush + ASSERT_OK(Put(0, Key(1), DummyString(kDelayThreshold / 2), wo)); + ASSERT_OK(Put(1, Key(1), DummyString(kDelayThreshold / 2), wo)); + + // Write another byte to trigger writing and a delay token in the write + // controller + auto& write_controller = dbfull()->TEST_write_controler(); + ASSERT_FALSE(dbfull()->TEST_has_write_controller_token()); + ASSERT_FALSE(write_controller.NeedsDelay()); + ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); + ASSERT_TRUE(dbfull()->TEST_has_write_controller_token()); + ASSERT_TRUE(write_controller.NeedsDelay()); + + Flush(1); + + // Delay token should be released when the next write arrives + ASSERT_TRUE(dbfull()->TEST_has_write_controller_token()); + ASSERT_TRUE(write_controller.NeedsDelay()); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + ASSERT_FALSE(write_controller.NeedsDelay()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + 
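The WbmDelaySharedWriteBufferAcrossCFs test above exercises the WriteBufferManager usage-state logic: once memtable usage crosses a percentage of the quota, the next write installs a delay token in the write controller. Below is a standalone model of that classification, not the real WriteBufferManager; the threshold constant and the kNone/kStop state names are assumptions, only kDelay appears in the hunk.

```cpp
#include <cstdint>
#include <iostream>

enum class UsageState { kNone, kDelay, kStop };

constexpr uint64_t kStartDelayPercentThreshold = 80;  // assumed for illustration

UsageState ClassifyUsage(uint64_t memory_usage, uint64_t quota) {
  if (memory_usage >= quota) return UsageState::kStop;
  if (memory_usage * 100 >= quota * kStartDelayPercentThreshold) {
    return UsageState::kDelay;
  }
  return UsageState::kNone;
}

int main() {
  constexpr uint64_t kQuota = 100 * 1000;
  constexpr uint64_t kDelayThreshold = kStartDelayPercentThreshold * kQuota / 100;

  // Mirrors the test flow: writes up to the threshold do not delay, and the
  // next write should find the DB holding a WBM delay token.
  std::cout << (ClassifyUsage(kDelayThreshold - 1, kQuota) == UsageState::kNone) << "\n";
  std::cout << (ClassifyUsage(kDelayThreshold + 1, kQuota) == UsageState::kDelay) << "\n";
  return 0;
}
```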
+TEST_P(DBWriteBufferManagerTest1, WbmDelaySharedWriteBufferAcrossDBs) { + constexpr size_t kQuota = 500 * 1000; + constexpr size_t kDelayThreshold = + WriteBufferManager::kStartDelayPercentThreshold * kQuota / 100; + + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(kQuota, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(kQuota, nullptr, true)); + } + + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(kDelayThreshold / num_dbs))); + } + + // Write another byte to trigger writing and a delay token in the write + // controller + auto& write_controller = dbfull()->TEST_write_controler(); + ASSERT_FALSE(dbfull()->TEST_has_write_controller_token()); + ASSERT_FALSE(write_controller.NeedsDelay()); + + ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); + ASSERT_TRUE(dbfull()->TEST_has_write_controller_token()); + ASSERT_TRUE(write_controller.NeedsDelay()); + + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest, testing::Bool()); +INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest1, DBWriteBufferManagerTest1, + ::testing::Bool()); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index b4cf0cbb7a..f90075aaca 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -250,24 +250,32 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); - test::SleepingBackgroundTask sleeping_task_before; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_before, Env::Priority::HIGH); + std::vector sleeping_task_before( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& sleeping_task : sleeping_task_before) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + } delete itr; test::SleepingBackgroundTask sleeping_task_after; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_after, Env::Priority::HIGH); + &sleeping_task_after, Env::Priority::LOW); // Make sure no purges are executed foreground CheckFileTypeCounts(dbname_, 0, 3, 1); - sleeping_task_before.WakeUp(); - sleeping_task_before.WaitUntilDone(); + sleeping_task_before[0].WakeUp(); + sleeping_task_before[0].WaitUntilDone(); // Make sure all background purges are executed 
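The WbmDelaySharedWriteBufferAcrossDBs test above depends on one WriteBufferManager being shared by several DB instances through Options::write_buffer_manager, so all of them charge their memtables against a single quota. A sketch of that setup with the public API; the extra boolean constructor argument used in the test (enabling WBM-initiated delays) is a Speedb extension and is omitted here.

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"

int main() {
  auto shared_wbm = std::make_shared<rocksdb::WriteBufferManager>(
      500 * 1000 /* shared quota in bytes */);

  rocksdb::Options options;
  options.create_if_missing = true;
  options.write_buffer_manager = shared_wbm;  // every DB below shares this quota

  std::vector<rocksdb::DB*> dbs;
  for (int i = 0; i < 3; i++) {
    rocksdb::DB* db = nullptr;
    std::string name = "/tmp/shared_wbm_db" + std::to_string(i);
    assert(rocksdb::DB::Open(options, name, &db).ok());
    dbs.push_back(db);
  }

  // Writes to any of the DBs now count towards shared_wbm->memory_usage(),
  // so pressure in one instance can delay or force flushes in the others.

  for (auto* db : dbs) {
    assert(db->Close().ok());
    delete db;
  }
  return 0;
}
```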
sleeping_task_after.WakeUp(); sleeping_task_after.WaitUntilDone(); // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); + + for (size_t i = 1; i < sleeping_task_before.size(); ++i) { + sleeping_task_before[i].WakeUp(); + sleeping_task_before[i].WaitUntilDone(); + } } TEST_F(DeleteFileTest, PurgeDuringOpen) { @@ -332,16 +340,21 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { CheckFileTypeCounts(dbname_, 0, 1, 1); delete cfh; - test::SleepingBackgroundTask sleeping_task_after; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_after, Env::Priority::HIGH); + std::vector sleeping_task_after( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& sleeping_task : sleeping_task_after) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + } // If background purge is enabled, the file should still be there. CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1); TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1"); // Execute background purges. - sleeping_task_after.WakeUp(); - sleeping_task_after.WaitUntilDone(); + for (auto& sleeping_task : sleeping_task_after) { + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + } // The file should have been deleted. CheckFileTypeCounts(dbname_, 0, 0, 1); }; @@ -401,13 +414,19 @@ TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; - test::SleepingBackgroundTask sleeping_task_after; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_after, Env::Priority::HIGH); + std::vector sleeping_task_after( + std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW))); + for (auto& sleeping_task : sleeping_task_after) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + sleeping_task.WaitUntilSleeping(); + } // Make sure all background purges are executed - sleeping_task_after.WakeUp(); - sleeping_task_after.WaitUntilDone(); + for (auto& sleeping_task : sleeping_task_after) { + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + } // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); } @@ -447,9 +466,14 @@ TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); delete itr1; + for (int i = 0; + i < std::max(1, env_->GetBackgroundThreads(Env::Priority::LOW)); ++i) { + env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::LOW); + } env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH); delete itr2; env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH); + env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::LOW); Close(); TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"); diff --git a/db/error_handler.cc b/db/error_handler.cc index b00611ac92..4ba8c131e9 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -518,8 +518,8 @@ Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, { uint64_t free_space; - if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path, - &free_space) == Status::NotSupported()) { + if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path, &free_space) + .IsNotSupported()) { *auto_recovery = false; } } diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index 547d087322..040744e48a 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -1305,7 
+1305,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { WriteOptions wopts; wopts.sync = true; s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(s, s.NoSpace()); + ASSERT_TRUE(s.IsNoSpace()); } SyncPoint::GetInstance()->DisableProcessing(); fault_fs_->SetFilesystemActive(true); @@ -2461,7 +2461,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) { s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); ASSERT_EQ(listener->WaitForRecovery(5000000), true); - ASSERT_EQ(listener->new_bg_error(), Status::Aborted()); + ASSERT_TRUE(listener->new_bg_error().IsAborted()); SyncPoint::GetInstance()->DisableProcessing(); fault_fs_->SetFilesystemActive(true); diff --git a/db/experimental.cc b/db/experimental.cc index e2917a443c..6488d1fd67 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -34,12 +34,12 @@ Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) { Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/, const Slice* /*begin*/, const Slice* /*end*/) { - return Status::NotSupported("Not supported in RocksDB LITE"); + return Status::NotSupported("Not supported in LITE mode"); } Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/, int /*target_level*/) { - return Status::NotSupported("Not supported in RocksDB LITE"); + return Status::NotSupported("Not supported in LITE mode"); } #endif // ROCKSDB_LITE diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index b6f05a846e..d66d85f5b1 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -683,7 +683,7 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) { s = DeprecatedAddFile({file1}, true /* move file */); ASSERT_OK(s) << s.ToString(); - ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); + ASSERT_TRUE(env_->FileExists(file1).IsNotFound()); s = DeprecatedAddFile({file2}, false /* copy file */); ASSERT_OK(s) << s.ToString(); diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index de675e6912..917839afa1 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -543,7 +543,7 @@ TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) { std::string val; ASSERT_OK(db_->Get(ro, "cats", &val)); ASSERT_EQ("dogs", val); - ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound()); + ASSERT_TRUE(db_->Get(ro, "boys", &val).IsNotFound()); } TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) { diff --git a/db/flush_job.cc b/db/flush_job.cc index 3ccce2af18..96d62d7891 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -276,16 +276,12 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, s = WriteLevel0Table(); } - if (s.ok() && cfd_->IsDropped()) { - s = Status::ColumnFamilyDropped("Column family dropped during compaction"); - } - if ((s.ok() || s.IsColumnFamilyDropped()) && - shutting_down_->load(std::memory_order_acquire)) { - s = Status::ShutdownInProgress("Database shutdown"); - } - if (!s.ok()) { cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber()); + } else if (shutting_down_->load(std::memory_order_acquire)) { + s = Status::ShutdownInProgress("Database shutdown"); + } else if (cfd_->IsDropped()) { + s = Status::ColumnFamilyDropped("Column family dropped during flush"); } else if (write_manifest_) { TEST_SYNC_POINT("FlushJob::InstallResults"); // Replace immutable memtable with the generated Table diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index 
e459e935ed..f3d8a7e46b 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -502,22 +502,30 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { { // Create column family with existing cf name. ExportImportFilesMetaData metadata; + metadata.db_comparator_name = options.comparator->Name(); - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::InvalidArgument("Column family already exists")); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", + ImportColumnFamilyOptions(), + metadata, &import_cfh_); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_NE(s.getState(), nullptr); + EXPECT_NE(strstr(s.getState(), "Column family already exists"), nullptr) + << s.getState(); ASSERT_EQ(import_cfh_, nullptr); } { // Import with no files specified. ExportImportFilesMetaData metadata; + metadata.db_comparator_name = options.comparator->Name(); - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::InvalidArgument("The list of files is empty")); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_NE(s.getState(), nullptr); + EXPECT_NE(strstr(s.getState(), "The list of files is empty"), nullptr) + << s.getState(); ASSERT_EQ(import_cfh_, nullptr); } @@ -544,10 +552,13 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19)); metadata.db_comparator_name = options.comparator->Name(); - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::InvalidArgument("Files have overlapping ranges")); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_NE(s.getState(), nullptr); + EXPECT_NE(strstr(s.getState(), "Files have overlapping ranges"), nullptr) + << s.getState(); ASSERT_EQ(import_cfh_, nullptr); } @@ -568,10 +579,13 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); metadata.db_comparator_name = mismatch_options.comparator->Name(); - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::InvalidArgument("Comparator name mismatch")); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", + ImportColumnFamilyOptions(), + metadata, &import_cfh_); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_NE(s.getState(), nullptr); + EXPECT_NE(strstr(s.getState(), "Comparator name mismatch"), nullptr) + << s.getState(); ASSERT_EQ(import_cfh_, nullptr); } @@ -593,10 +607,13 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19)); metadata.db_comparator_name = options.comparator->Name(); - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::IOError("No such file or directory")); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, 
&import_cfh_); + ASSERT_TRUE(s.IsIOError()); + ASSERT_NE(s.getState(), nullptr); + EXPECT_NE(strstr(s.getState(), "No such file or directory"), nullptr) + << s.getState(); ASSERT_EQ(import_cfh_, nullptr); // Test successful import after a failure with the same CF name. Ensures diff --git a/db/memtable.h b/db/memtable.h index 965404d25f..684a6b5a98 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -479,13 +479,31 @@ class MemTable { uint64_t GetID() const { return id_; } - void SetFlushCompleted(bool completed) { flush_completed_ = completed; } + void SetFlushCompleted(bool completed) { + // Flush Can't complete twice + if (completed) { + assert(!flush_completed_); + } + // In case flush is aborted, notify the memory tracker + if (flush_completed_ && (completed == false)) { + mem_tracker_.FreeMemAborted(); + } + flush_completed_ = completed; + } uint64_t GetFileNumber() const { return file_number_; } void SetFileNumber(uint64_t file_num) { file_number_ = file_num; } void SetFlushInProgress(bool in_progress) { + if (in_progress && (flush_in_progress_ == false)) { + assert(!flush_completed_); + mem_tracker_.FreeMemStarted(); + } else if ((in_progress == false) && flush_in_progress_) { + // In case flush is aborted, notify the memory tracker + mem_tracker_.FreeMemAborted(); + } + flush_in_progress_ = in_progress; } @@ -528,8 +546,8 @@ class MemTable { std::atomic write_buffer_size_; // These are used to manage memtable flushes to storage - bool flush_in_progress_; // started the flush - bool flush_completed_; // finished the flush + bool flush_in_progress_; // started the flush + bool flush_completed_; // finished the flush uint64_t file_number_; // filled up after flush is complete // The updates to be applied to the transaction log when this diff --git a/db/memtable_list.cc b/db/memtable_list.cc index f447ee7353..a3a171c4bc 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -365,7 +365,7 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, if (num_flush_not_started_ == 0) { imm_flush_needed.store(false, std::memory_order_release); } - m->flush_in_progress_ = true; // flushing will start very soon + m->SetFlushInProgress(true); // flushing will start very soon if (max_next_log_number) { *max_next_log_number = std::max(m->GetNextLogNumber(), *max_next_log_number); @@ -390,8 +390,8 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, assert(m->flush_in_progress_); assert(m->file_number_ == 0); - m->flush_in_progress_ = false; - m->flush_completed_ = false; + m->SetFlushInProgress(false); + m->SetFlushCompleted(false); m->edit_.Clear(); num_flush_not_started_++; } @@ -419,7 +419,7 @@ Status MemTableList::TryInstallMemtableFlushResults( // All the edits are associated with the first memtable of this batch. 
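The memtable.h hunk above centralizes memory-tracker bookkeeping in SetFlushInProgress()/SetFlushCompleted(): the tracker is told when freeing of a memtable's memory starts, and is told to roll that back if the flush is aborted. A standalone model of those transitions (not the real MemTable or its tracker type):

```cpp
#include <cassert>
#include <cstdint>

class MemFreeTracker {
 public:
  explicit MemFreeTracker(uint64_t size) : size_(size) {}
  void FreeMemStarted() { being_freed_ += size_; }
  void FreeMemAborted() { being_freed_ -= size_; }
  uint64_t being_freed() const { return being_freed_; }
 private:
  uint64_t size_;
  uint64_t being_freed_ = 0;
};

class MemTableModel {
 public:
  explicit MemTableModel(uint64_t size) : tracker_(size) {}

  void SetFlushInProgress(bool in_progress) {
    if (in_progress && !flush_in_progress_) {
      assert(!flush_completed_);
      tracker_.FreeMemStarted();   // memory will be released by the flush
    } else if (!in_progress && flush_in_progress_) {
      tracker_.FreeMemAborted();   // flush rolled back, memory stays live
    }
    flush_in_progress_ = in_progress;
  }

  void SetFlushCompleted(bool completed) {
    if (completed) assert(!flush_completed_);            // can't complete twice
    if (flush_completed_ && !completed) tracker_.FreeMemAborted();
    flush_completed_ = completed;
  }

  const MemFreeTracker& tracker() const { return tracker_; }

 private:
  MemFreeTracker tracker_;
  bool flush_in_progress_ = false;
  bool flush_completed_ = false;
};

int main() {
  MemTableModel m(4096);
  m.SetFlushInProgress(true);   // picked for flush: 4096 bytes "being freed"
  assert(m.tracker().being_freed() == 4096);
  m.SetFlushInProgress(false);  // flush rolled back: accounting reverts
  assert(m.tracker().being_freed() == 0);
  return 0;
}
```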
assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0); - mems[i]->flush_completed_ = true; + mems[i]->SetFlushCompleted(true); mems[i]->file_number_ = file_number; } @@ -701,8 +701,9 @@ void MemTableList::RemoveMemTablesOrRestoreFlags( m->edit_.GetBlobFileAdditions().size(), mem_id); } - m->flush_completed_ = false; - m->flush_in_progress_ = false; + m->SetFlushCompleted(false); + m->SetFlushInProgress(false); + m->edit_.Clear(); num_flush_not_started_++; m->file_number_ = 0; diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index df1694c212..04ced505e8 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -541,313 +541,65 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { } } -TEST_F(MemTableListTest, FlushPendingTest) { - const int num_tables = 6; - SequenceNumber seq = 1; - Status s; - - auto factory = std::make_shared(); - options.memtable_factory = factory; - ImmutableOptions ioptions(options); - InternalKeyComparator cmp(BytewiseComparator()); - WriteBufferManager wb(options.db_write_buffer_size); - autovector to_delete; - - // Create MemTableList - int min_write_buffer_number_to_merge = 3; - int max_write_buffer_number_to_maintain = 7; - int64_t max_write_buffer_size_to_maintain = - 7 * static_cast(options.write_buffer_size); - MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain, - max_write_buffer_size_to_maintain); - - // Create some MemTables - uint64_t memtable_id = 0; - std::vector tables; - MutableCFOptions mutable_cf_options(options); - for (int i = 0; i < num_tables; i++) { - MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb, - kMaxSequenceNumber, 0 /* column_family_id */); - mem->SetID(memtable_id++); - mem->Ref(); - - std::string value; - MergeContext merge_context; - - ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), - nullptr /* kv_prot_info */)); - ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", - nullptr /* kv_prot_info */)); - ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", - nullptr /* kv_prot_info */)); - ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", - nullptr /* kv_prot_info */)); - ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", - nullptr /* kv_prot_info */)); - - tables.push_back(mem); - } - - // Nothing to flush - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - autovector to_flush; - list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); - ASSERT_EQ(0, to_flush.size()); - - // Request a flush even though there is nothing to flush - list.FlushRequested(); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Attempt to 'flush' to clear request for flush - list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); - ASSERT_EQ(0, to_flush.size()); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Request a flush again - list.FlushRequested(); - // No flush pending since the list is empty. 
- ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Add 2 tables - list.Add(tables[0], &to_delete); - list.Add(tables[1], &to_delete); - ASSERT_EQ(2, list.NumNotFlushed()); - ASSERT_EQ(0, to_delete.size()); - - // Even though we have less than the minimum to flush, a flush is - // pending since we had previously requested a flush and never called - // PickMemtablesToFlush() to clear the flush. - ASSERT_TRUE(list.IsFlushPending()); - ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Pick tables to flush - list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); - ASSERT_EQ(2, to_flush.size()); - ASSERT_EQ(2, list.NumNotFlushed()); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Revert flush - list.RollbackMemtableFlush(to_flush, 0); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); - to_flush.clear(); - - // Add another table - list.Add(tables[2], &to_delete); - // We now have the minimum to flush regardles of whether FlushRequested() - // was called. - ASSERT_TRUE(list.IsFlushPending()); - ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); - ASSERT_EQ(0, to_delete.size()); - - // Pick tables to flush - list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); - ASSERT_EQ(3, to_flush.size()); - ASSERT_EQ(3, list.NumNotFlushed()); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Pick tables to flush again - autovector to_flush2; - list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); - ASSERT_EQ(0, to_flush2.size()); - ASSERT_EQ(3, list.NumNotFlushed()); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Add another table - list.Add(tables[3], &to_delete); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); - ASSERT_EQ(0, to_delete.size()); - - // Request a flush again - list.FlushRequested(); - ASSERT_TRUE(list.IsFlushPending()); - ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Pick tables to flush again - list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); - ASSERT_EQ(1, to_flush2.size()); - ASSERT_EQ(4, list.NumNotFlushed()); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Rollback first pick of tables - list.RollbackMemtableFlush(to_flush, 0); - ASSERT_TRUE(list.IsFlushPending()); - ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); - to_flush.clear(); - - // Add another tables - list.Add(tables[4], &to_delete); - ASSERT_EQ(5, list.NumNotFlushed()); - // We now have the minimum to flush regardles of whether FlushRequested() - ASSERT_TRUE(list.IsFlushPending()); - ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); - ASSERT_EQ(0, to_delete.size()); - - // Pick tables to flush - list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); - // Should pick 4 of 5 since 1 table has been picked in to_flush2 - ASSERT_EQ(4, to_flush.size()); - ASSERT_EQ(5, list.NumNotFlushed()); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Pick tables to flush again - autovector to_flush3; - 
list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush3); - ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed - ASSERT_EQ(5, list.NumNotFlushed()); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Flush the 4 memtables that were picked in to_flush - s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, - &to_delete); - ASSERT_OK(s); - - // Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains - // tables[3]. - // Current implementation will only commit memtables in the order they were - // created. So TryInstallMemtableFlushResults will install the first 3 tables - // in to_flush and stop when it encounters a table not yet flushed. - ASSERT_EQ(2, list.NumNotFlushed()); - int num_in_history = - std::min(3, static_cast(max_write_buffer_size_to_maintain) / - static_cast(options.write_buffer_size)); - ASSERT_EQ(num_in_history, list.NumFlushed()); - ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); - - // Request a flush again. Should be nothing to flush - list.FlushRequested(); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - - // Flush the 1 memtable that was picked in to_flush2 - s = MemTableListTest::Mock_InstallMemtableFlushResults( - &list, mutable_cf_options, to_flush2, &to_delete); - ASSERT_OK(s); - - // This will actually install 2 tables. The 1 we told it to flush, and also - // tables[4] which has been waiting for tables[3] to commit. - ASSERT_EQ(0, list.NumNotFlushed()); - num_in_history = - std::min(5, static_cast(max_write_buffer_size_to_maintain) / - static_cast(options.write_buffer_size)); - ASSERT_EQ(num_in_history, list.NumFlushed()); - ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); - - for (const auto& m : to_delete) { - // Refcount should be 0 after calling TryInstallMemtableFlushResults. - // Verify this, by Ref'ing then UnRef'ing: - m->Ref(); - ASSERT_EQ(m, m->Unref()); - delete m; - } - to_delete.clear(); - - // Add another table - list.Add(tables[5], &to_delete); - ASSERT_EQ(1, list.NumNotFlushed()); - ASSERT_EQ(5, list.GetLatestMemTableID()); - memtable_id = 4; - // Pick tables to flush. The tables to pick must have ID smaller than or - // equal to 4. Therefore, no table will be selected in this case. - autovector to_flush4; - list.FlushRequested(); - ASSERT_TRUE(list.HasFlushRequested()); - list.PickMemtablesToFlush(memtable_id, &to_flush4); - ASSERT_TRUE(to_flush4.empty()); - ASSERT_EQ(1, list.NumNotFlushed()); - ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); - ASSERT_FALSE(list.IsFlushPending()); - ASSERT_FALSE(list.HasFlushRequested()); - - // Pick tables to flush. The tables to pick must have ID smaller than or - // equal to 5. Therefore, only tables[5] will be selected. 
- memtable_id = 5; - list.FlushRequested(); - list.PickMemtablesToFlush(memtable_id, &to_flush4); - ASSERT_EQ(1, static_cast(to_flush4.size())); - ASSERT_EQ(1, list.NumNotFlushed()); - ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); - ASSERT_FALSE(list.IsFlushPending()); - to_delete.clear(); - - list.current()->Unref(&to_delete); - int to_delete_size = - std::min(num_tables, static_cast(max_write_buffer_size_to_maintain) / - static_cast(options.write_buffer_size)); - ASSERT_EQ(to_delete_size, to_delete.size()); - - for (const auto& m : to_delete) { - // Refcount should be 0 after calling TryInstallMemtableFlushResults. - // Verify this, by Ref'ing then UnRef'ing: - m->Ref(); - ASSERT_EQ(m, m->Unref()); - delete m; - } - to_delete.clear(); -} - -TEST_F(MemTableListTest, EmptyAtomicFlusTest) { - autovector lists; - autovector cf_ids; - autovector options_list; - autovector*> to_flush; - autovector to_delete; - Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list, - to_flush, &to_delete); - ASSERT_OK(s); - ASSERT_TRUE(to_delete.empty()); +namespace { + +void ValidateWbmUsedCounters(const WriteBufferManager& wb, + size_t expected_mutable, size_t expected_immutable, + size_t expected_freed) { + ASSERT_EQ(wb.mutable_memtable_memory_usage(), expected_mutable); + ASSERT_EQ(wb.immmutable_memtable_memory_usage(), expected_immutable); + ASSERT_EQ(wb.memtable_memory_being_freed_usage(), expected_freed); } -TEST_F(MemTableListTest, AtomicFlusTest) { - const int num_cfs = 3; - const int num_tables_per_cf = 2; - SequenceNumber seq = 1; - - auto factory = std::make_shared(); - options.memtable_factory = factory; - ImmutableOptions ioptions(options); - InternalKeyComparator cmp(BytewiseComparator()); - WriteBufferManager wb(options.db_write_buffer_size); +} // namespace - // Create MemTableLists - int min_write_buffer_number_to_merge = 3; - int max_write_buffer_number_to_maintain = 7; - int64_t max_write_buffer_size_to_maintain = - 7 * static_cast(options.write_buffer_size); - autovector lists; - for (int i = 0; i != num_cfs; ++i) { - lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain, - max_write_buffer_size_to_maintain)); - } - - autovector cf_ids; - std::vector> tables(num_cfs); - autovector mutable_cf_options_list; - uint32_t cf_id = 0; - for (auto& elem : tables) { - mutable_cf_options_list.emplace_back(new MutableCFOptions(options)); +TEST_F(MemTableListTest, FlushPendingTest) { + for (auto wbm_enabled : {false, true}) { + const int num_tables = 6; + SequenceNumber seq = 1; + Status s; + + auto factory = std::make_shared(); + options.memtable_factory = factory; + options.db_write_buffer_size = wbm_enabled ? 
(1024 * 1024 * 1024) : 0U; + ImmutableOptions ioptions(options); + InternalKeyComparator cmp(BytewiseComparator()); + WriteBufferManager wb(options.db_write_buffer_size); + ASSERT_EQ(wb.enabled(), wbm_enabled); + autovector to_delete; + + // Create MemTableList + int min_write_buffer_number_to_merge = 3; + int max_write_buffer_number_to_maintain = 7; + int64_t max_write_buffer_size_to_maintain = + 7 * static_cast(options.write_buffer_size); + MemTableList list(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); + + // Create some MemTables uint64_t memtable_id = 0; - for (int i = 0; i != num_tables_per_cf; ++i) { + std::vector tables; + MutableCFOptions mutable_cf_options(options); + std::vector tables_reserved_mem; + size_t total_reserved_mem = 0U; + for (int i = 0; i < num_tables; i++) { MemTable* mem = - new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb, - kMaxSequenceNumber, cf_id); + new MemTable(cmp, ioptions, mutable_cf_options, &wb, + kMaxSequenceNumber, 0 /* column_family_id */); mem->SetID(memtable_id++); mem->Ref(); + auto new_total_reserved_mem = wb.mutable_memtable_memory_usage(); + if (wbm_enabled) { + ASSERT_GT(new_total_reserved_mem, total_reserved_mem); + } + tables_reserved_mem.push_back(new_total_reserved_mem - + total_reserved_mem); + total_reserved_mem = new_total_reserved_mem; + std::string value; + MergeContext merge_context; ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), nullptr /* kv_prot_info */)); @@ -860,100 +612,490 @@ TEST_F(MemTableListTest, AtomicFlusTest) { ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", nullptr /* kv_prot_info */)); - elem.push_back(mem); + tables.push_back(mem); } - cf_ids.push_back(cf_id++); - } - std::vector> flush_candidates(num_cfs); + // Nothing to flush + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + autovector to_flush; + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); + ASSERT_EQ(0, to_flush.size()); + + // Request a flush even though there is nothing to flush + list.FlushRequested(); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Attempt to 'flush' to clear request for flush + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); + ASSERT_EQ(0, to_flush.size()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Request a flush again + list.FlushRequested(); + // No flush pending since the list is empty. + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Add 2 tables + list.Add(tables[0], &to_delete); + list.Add(tables[1], &to_delete); + ASSERT_EQ(2, list.NumNotFlushed()); + ASSERT_EQ(0, to_delete.size()); + auto expected_mutable_memory_usage = + tables_reserved_mem[0] + tables_reserved_mem[1]; + ValidateWbmUsedCounters(wb, + total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, 0U); + + // Even though we have less than the minimum to flush, a flush is + // pending since we had previously requested a flush and never called + // PickMemtablesToFlush() to clear the flush. 
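The ValidateWbmUsedCounters() checks woven through this test trace how a memtable's reservation is expected to move between the three WriteBufferManager counters as the test drives it through the flush states. A minimal sketch of that lifecycle, assuming it sits inside db/memtable_list_test.cc where MemTableList, MemTable, autovector and port::kMaxUint64 are already in scope and the WriteBufferManager was created with a non-zero db_write_buffer_size; SketchWbmCounterLifecycle is a hypothetical helper, not something this patch adds:

void SketchWbmCounterLifecycle(MemTableList& list, MemTable* mem) {
  autovector<MemTable*> to_delete;
  autovector<MemTable*> picked;

  // While the memtable is still writable, its reservation is reported by
  // WriteBufferManager::mutable_memtable_memory_usage().
  list.Add(mem, &to_delete);
  // Add() hands it to the immutable list: the same bytes now show up in
  // WriteBufferManager::immmutable_memtable_memory_usage() (spelling as in
  // the patch).
  list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &picked);
  // Once picked for flush, those bytes are additionally reported by
  // WriteBufferManager::memtable_memory_being_freed_usage().
  list.RollbackMemtableFlush(picked, 0);
  // A rollback drops the "being freed" portion back to zero; the memory
  // stays accounted as immutable until the memtable is actually deleted.
}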
+ ASSERT_TRUE(list.IsFlushPending()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Pick tables to flush + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); + ASSERT_EQ(2, to_flush.size()); + ASSERT_EQ(2, list.NumNotFlushed()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + + // Revert flush + list.RollbackMemtableFlush(to_flush, 0); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + ValidateWbmUsedCounters(wb, + total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, 0U); + to_flush.clear(); + + // Add another table + list.Add(tables[2], &to_delete); + // We now have the minimum to flush regardles of whether FlushRequested() + // was called. + ASSERT_TRUE(list.IsFlushPending()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_EQ(0, to_delete.size()); + expected_mutable_memory_usage += tables_reserved_mem[2]; + ValidateWbmUsedCounters(wb, + total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, 0U); + + // Pick tables to flush + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); + ASSERT_EQ(3, to_flush.size()); + ASSERT_EQ(3, list.NumNotFlushed()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + + // Pick tables to flush again + autovector to_flush2; + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); + ASSERT_EQ(0, to_flush2.size()); + ASSERT_EQ(3, list.NumNotFlushed()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + + // Add another table + list.Add(tables[3], &to_delete); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_EQ(0, to_delete.size()); + expected_mutable_memory_usage += tables_reserved_mem[3]; + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, + expected_mutable_memory_usage - tables_reserved_mem[3]); + + // Request a flush again + list.FlushRequested(); + ASSERT_TRUE(list.IsFlushPending()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Pick tables to flush again + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); + ASSERT_EQ(1, to_flush2.size()); + ASSERT_EQ(4, list.NumNotFlushed()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + + // Rollback first pick of tables + list.RollbackMemtableFlush(to_flush, 0); + ASSERT_TRUE(list.IsFlushPending()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + // table3 was NOT rolled back (to_flush (tables 0, 1, 2) was rolled back, + // 
to_flush2 contains table 3) + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, tables_reserved_mem[3]); + to_flush.clear(); + + // Add another tables + list.Add(tables[4], &to_delete); + ASSERT_EQ(5, list.NumNotFlushed()); + // We now have the minimum to flush regardles of whether FlushRequested() + ASSERT_TRUE(list.IsFlushPending()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_EQ(0, to_delete.size()); + expected_mutable_memory_usage += tables_reserved_mem[4]; + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, tables_reserved_mem[3]); + + // Pick tables to flush + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); + // Should pick 4 of 5 since 1 table has been picked in to_flush2 + ASSERT_EQ(4, to_flush.size()); + ASSERT_EQ(5, list.NumNotFlushed()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + // Now all of the immutables tables are being freed (undergoing flush) + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + + // Pick tables to flush again + autovector to_flush3; + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush3); + ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed + ASSERT_EQ(5, list.NumNotFlushed()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + + // Flush the 4 memtables that were picked in to_flush + s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, + &to_delete); + ASSERT_OK(s); + + // Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains + // tables[3]. + // Current implementation will only commit memtables in the order they were + // created. So TryInstallMemtableFlushResults will install the first 3 + // tables in to_flush and stop when it encounters a table not yet flushed. + ASSERT_EQ(2, list.NumNotFlushed()); + int num_in_history = + std::min(3, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); + ASSERT_EQ(num_in_history, list.NumFlushed()); + ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); + // None of the 5 tables has been freed => no change in the counters + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + + // Request a flush again. Should be nothing to flush + list.FlushRequested(); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Flush the 1 memtable that was picked in to_flush2 + s = MemTableListTest::Mock_InstallMemtableFlushResults( + &list, mutable_cf_options, to_flush2, &to_delete); + ASSERT_OK(s); + + // This will actually install 2 tables. The 1 we told it to flush, and also + // tables[4] which has been waiting for tables[3] to commit. 
+ ASSERT_EQ(0, list.NumNotFlushed()); + num_in_history = + std::min(5, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); + ASSERT_EQ(num_in_history, list.NumFlushed()); + ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); + // None of the 5 tables has been freed => no change in the counters + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + + // This loop will actually do nothing since to_delete is empty + ASSERT_TRUE(to_delete.empty()); + for (const auto& m : to_delete) { + // Refcount should be 0 after calling TryInstallMemtableFlushResults. + // Verify this, by Ref'ing then UnRef'ing: + m->Ref(); + ASSERT_EQ(m, m->Unref()); + delete m; + } + to_delete.clear(); + + // Add another table + list.Add(tables[5], &to_delete); + expected_mutable_memory_usage += tables_reserved_mem[5]; + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_EQ(5, list.GetLatestMemTableID()); + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, + expected_mutable_memory_usage - tables_reserved_mem[5]); + + memtable_id = 4; + // Pick tables to flush. The tables to pick must have ID smaller than or + // equal to 4. Therefore, no table will be selected in this case. + autovector to_flush4; + list.FlushRequested(); + ASSERT_TRUE(list.HasFlushRequested()); + list.PickMemtablesToFlush(memtable_id, &to_flush4); + ASSERT_TRUE(to_flush4.empty()); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.HasFlushRequested()); + // No change + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, + expected_mutable_memory_usage - tables_reserved_mem[5]); + + // Pick tables to flush. The tables to pick must have ID smaller than or + // equal to 5. Therefore, only tables[5] will be selected. + memtable_id = 5; + list.FlushRequested(); + list.PickMemtablesToFlush(memtable_id, &to_flush4); + ASSERT_EQ(1, static_cast(to_flush4.size())); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_FALSE(list.IsFlushPending()); + // All tables are now flushed or being flushed, but none was deleted + ValidateWbmUsedCounters( + wb, total_reserved_mem - expected_mutable_memory_usage, + expected_mutable_memory_usage, expected_mutable_memory_usage); + to_delete.clear(); + + list.current()->Unref(&to_delete); + int to_delete_size = std::min( + num_tables, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); + ASSERT_EQ(to_delete_size, to_delete.size()); + + for (const auto& m : to_delete) { + // Refcount should be 0 after calling TryInstallMemtableFlushResults. 
+ // Verify this, by Ref'ing then UnRef'ing: + m->Ref(); + ASSERT_EQ(m, m->Unref()); + delete m; + } + to_delete.clear(); - // Nothing to flush - for (auto i = 0; i != num_cfs; ++i) { - auto* list = lists[i]; - ASSERT_FALSE(list->IsFlushPending()); - ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); - list->PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, - &flush_candidates[i]); - ASSERT_EQ(0, flush_candidates[i].size()); - } - // Request flush even though there is nothing to flush - for (auto i = 0; i != num_cfs; ++i) { - auto* list = lists[i]; - list->FlushRequested(); - ASSERT_FALSE(list->IsFlushPending()); - ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); + // All memtables have been deleted / freed + ValidateWbmUsedCounters(wb, 0U, 0U, 0U); } +} + +TEST_F(MemTableListTest, EmptyAtomicFlushTest) { + autovector lists; + autovector cf_ids; + autovector options_list; + autovector*> to_flush; autovector to_delete; - // Add tables to the immutable memtalbe lists associated with column families - for (auto i = 0; i != num_cfs; ++i) { - for (auto j = 0; j != num_tables_per_cf; ++j) { - lists[i]->Add(tables[i][j], &to_delete); + Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list, + to_flush, &to_delete); + ASSERT_OK(s); + ASSERT_TRUE(to_delete.empty()); +} + +TEST_F(MemTableListTest, AtomicFlushTest) { + for (auto wbm_enabled : {false, true}) { + const int num_cfs = 3; + const int num_tables_per_cf = 2; + SequenceNumber seq = 1; + + auto factory = std::make_shared(); + options.memtable_factory = factory; + options.db_write_buffer_size = wbm_enabled ? (1024 * 1024 * 1024) : 0U; + ImmutableOptions ioptions(options); + InternalKeyComparator cmp(BytewiseComparator()); + WriteBufferManager wb(options.db_write_buffer_size); + + // Create MemTableLists + int min_write_buffer_number_to_merge = 3; + int max_write_buffer_number_to_maintain = 7; + int64_t max_write_buffer_size_to_maintain = + 7 * static_cast(options.write_buffer_size); + autovector lists; + for (int i = 0; i != num_cfs; ++i) { + lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain)); } - ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed()); - ASSERT_TRUE(lists[i]->IsFlushPending()); - ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire)); - } - std::vector flush_memtable_ids = {1, 1, 0}; - // +----+ - // list[0]: |0 1| - // list[1]: |0 1| - // | +--+ - // list[2]: |0| 1 - // +-+ - // Pick memtables to flush - for (auto i = 0; i != num_cfs; ++i) { - flush_candidates[i].clear(); - lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]); - ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, - static_cast(flush_candidates[i].size())); - } - autovector tmp_lists; - autovector tmp_cf_ids; - autovector tmp_options_list; - autovector*> to_flush; - for (auto i = 0; i != num_cfs; ++i) { - if (!flush_candidates[i].empty()) { - to_flush.push_back(&flush_candidates[i]); - tmp_lists.push_back(lists[i]); - tmp_cf_ids.push_back(i); - tmp_options_list.push_back(mutable_cf_options_list[i]); + + autovector cf_ids; + std::vector> tables(num_cfs); + std::vector tables_cf_reserved_mem(num_cfs, {0U}); + std::vector> tables_reserved_mem(num_cfs, {0U}); + size_t total_reserved_mem = 0U; + autovector mutable_cf_options_list; + uint32_t cf_id = 0; + for (auto& elem : tables) { + mutable_cf_options_list.emplace_back(new MutableCFOptions(options)); + uint64_t 
memtable_id = 0; + tables_reserved_mem[cf_id].resize(num_tables_per_cf); + for (int i = 0; i != num_tables_per_cf; ++i) { + MemTable* mem = + new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb, + kMaxSequenceNumber, cf_id); + mem->SetID(memtable_id++); + mem->Ref(); + + auto new_total_reserved_mem = wb.mutable_memtable_memory_usage(); + if (wbm_enabled) { + ASSERT_GT(new_total_reserved_mem, total_reserved_mem); + } + + tables_cf_reserved_mem[cf_id] += + new_total_reserved_mem - total_reserved_mem; + tables_reserved_mem[cf_id][i] = + new_total_reserved_mem - total_reserved_mem; + total_reserved_mem = new_total_reserved_mem; + + std::string value; + + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr /* kv_prot_info */)); + + elem.push_back(mem); + } + cf_ids.push_back(cf_id++); } - } - Status s = Mock_InstallMemtableAtomicFlushResults( - tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete); - ASSERT_OK(s); - for (auto i = 0; i != num_cfs; ++i) { - for (auto j = 0; j != num_tables_per_cf; ++j) { - if (static_cast(j) <= flush_memtable_ids[i]) { - ASSERT_LT(0, tables[i][j]->GetFileNumber()); + std::vector> flush_candidates(num_cfs); + + // Nothing to flush + for (auto i = 0; i != num_cfs; ++i) { + auto* list = lists[i]; + ASSERT_FALSE(list->IsFlushPending()); + ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); + list->PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, + &flush_candidates[i]); + ASSERT_EQ(0, flush_candidates[i].size()); + } + // Request flush even though there is nothing to flush + for (auto i = 0; i != num_cfs; ++i) { + auto* list = lists[i]; + list->FlushRequested(); + ASSERT_FALSE(list->IsFlushPending()); + ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); + } + // ALL memtables are currently MUTABLE + ValidateWbmUsedCounters(wb, total_reserved_mem, 0U, 0U); + + autovector to_delete; + // Add tables to the immutable memtalbe lists associated with column + // families + for (auto i = 0; i != num_cfs; ++i) { + for (auto j = 0; j != num_tables_per_cf; ++j) { + lists[i]->Add(tables[i][j], &to_delete); } + ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed()); + ASSERT_TRUE(lists[i]->IsFlushPending()); + ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire)); + } + // ALL memtables are currently IMMUTABLE + ValidateWbmUsedCounters(wb, 0U, total_reserved_mem, 0U); + + std::vector flush_memtable_ids = {1, 1, 0}; + // +----+ + // list[0]: |0 1| + // list[1]: |0 1| + // | +--+ + // list[2]: |0| 1 + // +-+ + // Pick memtables to flush + auto expected_total_size_being_freed = 0U; + for (auto i = 0; i != num_cfs; ++i) { + flush_candidates[i].clear(); + lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], + &flush_candidates[i]); + ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, + static_cast(flush_candidates[i].size())); + + for (auto cf_table_idx = 0U; cf_table_idx < flush_candidates[i].size(); + ++cf_table_idx) { + expected_total_size_being_freed += tables_reserved_mem[i][cf_table_idx]; + } + } + ValidateWbmUsedCounters(wb, 0U, total_reserved_mem, + 
expected_total_size_being_freed); + + autovector tmp_lists; + autovector tmp_cf_ids; + autovector tmp_options_list; + autovector*> to_flush; + for (auto i = 0; i != num_cfs; ++i) { + if (!flush_candidates[i].empty()) { + to_flush.push_back(&flush_candidates[i]); + tmp_lists.push_back(lists[i]); + tmp_cf_ids.push_back(i); + tmp_options_list.push_back(mutable_cf_options_list[i]); + } + } + Status s = Mock_InstallMemtableAtomicFlushResults( + tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete); + ASSERT_OK(s); + + for (auto i = 0; i != num_cfs; ++i) { + for (auto j = 0; j != num_tables_per_cf; ++j) { + if (static_cast(j) <= flush_memtable_ids[i]) { + ASSERT_LT(0, tables[i][j]->GetFileNumber()); + } + } + ASSERT_EQ( + static_cast(num_tables_per_cf) - flush_candidates[i].size(), + lists[i]->NumNotFlushed()); } - ASSERT_EQ( - static_cast(num_tables_per_cf) - flush_candidates[i].size(), - lists[i]->NumNotFlushed()); - } - to_delete.clear(); - for (auto list : lists) { - list->current()->Unref(&to_delete); - delete list; - } - for (auto& mutable_cf_options : mutable_cf_options_list) { - if (mutable_cf_options != nullptr) { - delete mutable_cf_options; - mutable_cf_options = nullptr; + // No memtable was freed => No Change + ValidateWbmUsedCounters(wb, 0U, total_reserved_mem, + expected_total_size_being_freed); + + to_delete.clear(); + for (auto list : lists) { + list->current()->Unref(&to_delete); + delete list; } - } - // All memtables in tables array must have been flushed, thus ready to be - // deleted. - ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size()); - for (const auto& m : to_delete) { - // Refcount should be 0 after calling InstallMemtableFlushResults. - // Verify this by Ref'ing and then Unref'ing. - m->Ref(); - ASSERT_EQ(m, m->Unref()); - delete m; + for (auto& mutable_cf_options : mutable_cf_options_list) { + if (mutable_cf_options != nullptr) { + delete mutable_cf_options; + mutable_cf_options = nullptr; + } + } + // All memtables in tables array must have been flushed, thus ready to be + // deleted. + ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size()); + for (const auto& m : to_delete) { + // Refcount should be 0 after calling InstallMemtableFlushResults. + // Verify this by Ref'ing and then Unref'ing. + m->Ref(); + ASSERT_EQ(m, m->Unref()); + delete m; + } + + // All memtables have been deleted / freed + ValidateWbmUsedCounters(wb, 0U, 0U, 0U); } } diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 3ff7e0952b..206ebe1563 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -113,7 +113,7 @@ int main(int argc, char** argv) { #include int main(int /*argc*/, char** /*argv*/) { - printf("Skipped as Options file is not supported in RocksDBLite.\n"); + printf("Skipped as Options file is not supported in LITE mode.\n"); return 0; } #endif // !ROCKSDB_LITE diff --git a/db/table_cache.cc b/db/table_cache.cc index e548fc6c3f..2528b59e4b 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -202,7 +202,7 @@ Status TableCache::FindTable( handle); if (s.ok()) { // Release ownership of table reader. 
- table_reader.release(); + (void)table_reader.release(); } } return s; diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 6e6ad5578a..fb9e6b09c1 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -422,7 +422,7 @@ TEST_F(WriteBatchTest, PrepareCommit) { batch.SetSavePoint(); ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"))); Status s = batch.RollbackToSavePoint(); - ASSERT_EQ(s, Status::NotFound()); + ASSERT_TRUE(s.IsNotFound()); ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1"))); ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1"))); ASSERT_EQ(2u, batch.Count()); diff --git a/db/write_controller.cc b/db/write_controller.cc index c5f7443752..2ba1938ab1 100644 --- a/db/write_controller.cc +++ b/db/write_controller.cc @@ -20,17 +20,21 @@ std::unique_ptr WriteController::GetStopToken() { } std::unique_ptr WriteController::GetDelayToken( - uint64_t write_rate) { - if (0 == total_delayed_++) { + DelaySource source, uint64_t write_rate) { + if (TotalDelayed() == 0) { // Starting delay, so reset counters. next_refill_time_ = 0; credit_in_bytes_ = 0; } + + ++total_delayed_[DelaySourceValue(source)]; + // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in // next_refill_time_ will be based on an old rate. This rate will apply // for subsequent additional debts and for the next refill. - set_delayed_write_rate(write_rate); - return std::unique_ptr(new DelayWriteToken(this)); + set_delayed_write_rate(source, write_rate); + return std::unique_ptr( + new DelayWriteToken(this, source)); } std::unique_ptr @@ -52,7 +56,7 @@ uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) { if (total_stopped_.load(std::memory_order_relaxed) > 0) { return 0; } - if (total_delayed_.load(std::memory_order_relaxed) == 0) { + if (TotalDelayed() == 0) { return 0; } @@ -109,8 +113,11 @@ StopWriteToken::~StopWriteToken() { } DelayWriteToken::~DelayWriteToken() { - controller_->total_delayed_--; - assert(controller_->total_delayed_.load() >= 0); + auto total_delayed_before = + controller_->total_delayed_[WriteController::DelaySourceValue(source_)]--; + assert(total_delayed_before > 0); + + controller_->UpdateDelayedWriteRate(); } CompactionPressureToken::~CompactionPressureToken() { diff --git a/db/write_controller.h b/db/write_controller.h index 88bd1417f1..c0a98115cf 100644 --- a/db/write_controller.h +++ b/db/write_controller.h @@ -7,8 +7,12 @@ #include +#include +#include #include #include +#include + #include "rocksdb/rate_limiter.h" namespace ROCKSDB_NAMESPACE { @@ -21,11 +25,18 @@ class WriteControllerToken; // All of the methods here (including WriteControllerToken's destructors) need // to be called while holding DB mutex class WriteController { + public: + enum class DelaySource { kCF = 0, kWBM = 1, kNumSources }; + static constexpr unsigned int DelaySourceValue(DelaySource source) { + return static_cast(source); + } + static constexpr auto kNumDelaySources{ + static_cast(DelaySource::kNumSources)}; + public: explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u, int64_t low_pri_rate_bytes_per_sec = 1024 * 1024) : total_stopped_(0), - total_delayed_(0), total_compaction_pressure_(0), credit_in_bytes_(0), next_refill_time_(0), @@ -38,72 +49,131 @@ class WriteController { // When an actor (column family) requests a stop token, all writes will be // stopped until the stop token is released (deleted) std::unique_ptr GetStopToken(); - // When an actor (column family) requests a delay 
token, total delay for all - // writes to the DB will be controlled under the delayed write rate. Every - // write needs to call GetDelay() with number of bytes writing to the DB, - // which returns number of microseconds to sleep. + // Delay tokens are managed per delay source. Every delay source controls its + // own delay independently of the other sources. Every call to get a new delay + // token for a source sets the delay for that source (and overwrites the + // previous delay values of that source). The total delay for all writes is + // the smallest delayed write rate of all the individual sources (that have + // delay tokens). Writes to the DB will be controlled under that delayed write + // rate. Every write needs to call GetDelay() with number of bytes writing to + // the DB, which returns number of microseconds to sleep. + // + // NOTE: Not Thread-Safe std::unique_ptr GetDelayToken( - uint64_t delayed_write_rate); + DelaySource source, uint64_t delayed_write_rate); // When an actor (column family) requests a moderate token, compaction // threads will be increased std::unique_ptr GetCompactionPressureToken(); // these three metods are querying the state of the WriteController bool IsStopped() const; - bool NeedsDelay() const { return total_delayed_.load() > 0; } + bool NeedsDelay() const { return (TotalDelayed() > 0); } + bool NeedsDelay(DelaySource source) const { + assert(DelaySourceValue(source) < delayed_write_rates_.size()); + return (total_delayed_[DelaySourceValue(source)] > 0); + } bool NeedSpeedupCompaction() const { - return IsStopped() || NeedsDelay() || total_compaction_pressure_ > 0; + // Compaction depends only on the Column-Families delay source requirements + return IsStopped() || NeedsDelay(WriteController::DelaySource::kCF) || + total_compaction_pressure_ > 0; } // return how many microseconds the caller needs to sleep after the call // num_bytes: how many number of bytes to put into the DB. // Prerequisite: DB mutex held. uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes); - void set_delayed_write_rate(uint64_t write_rate) { + + // Set the delayed write rate for a specific source. + // The rate for a source is always the last set value. However, it + // may affect the global rate only when there are delay tokens for that + // source.
NOTE: Not Thread-Safe + void set_delayed_write_rate(DelaySource source, uint64_t write_rate) { + assert(DelaySourceValue(source) < delayed_write_rates_.size()); + // avoid divide 0 if (write_rate == 0) { write_rate = 1u; } else if (write_rate > max_delayed_write_rate()) { write_rate = max_delayed_write_rate(); } - delayed_write_rate_ = write_rate; + + delayed_write_rates_[DelaySourceValue(source)] = write_rate; + UpdateDelayedWriteRate(); } + // NOTE: Not Thread-Safe void set_max_delayed_write_rate(uint64_t write_rate) { // avoid divide 0 if (write_rate == 0) { write_rate = 1u; } + max_delayed_write_rate_ = write_rate; + for (auto source_value = 0U; source_value < total_delayed_.size(); + ++source_value) { + if (total_delayed_[source_value] == 0U) { + delayed_write_rates_[source_value] = max_delayed_write_rate_; + } + } // update delayed_write_rate_ as well - delayed_write_rate_ = write_rate; + UpdateDelayedWriteRate(); } + // Get the global delay write rate (not specific to any source) + // NOTE: Only sources with delay tokens affect the global rate uint64_t delayed_write_rate() const { return delayed_write_rate_; } + // Returns the last set value for source + uint64_t delayed_write_rate(DelaySource source) const { + assert(DelaySourceValue(source) < delayed_write_rates_.size()); + return delayed_write_rates_[DelaySourceValue(source)]; + } + uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; } RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); } + uint64_t TEST_delayed_write_rate(DelaySource source) const { + return delayed_write_rates_[DelaySourceValue(source)]; + } + private: uint64_t NowMicrosMonotonic(SystemClock* clock); + int TotalDelayed() const { + return std::accumulate(total_delayed_.begin(), total_delayed_.end(), 0); + } + void UpdateDelayedWriteRate() { + // The effective global delayed write rate is the lowest (minimal) + // write rate of all delays set by the sources (with delay tokens) + delayed_write_rate_ = max_delayed_write_rate_; + for (auto delay_source_value = 0; delay_source_value < kNumDelaySources; + ++delay_source_value) { + if ((delayed_write_rates_[delay_source_value] < delayed_write_rate_) && + NeedsDelay(DelaySource(delay_source_value))) { + delayed_write_rate_ = delayed_write_rates_[delay_source_value]; + } + } + } + + private: friend class WriteControllerToken; friend class StopWriteToken; friend class DelayWriteToken; friend class CompactionPressureToken; std::atomic total_stopped_; - std::atomic total_delayed_; + std::array, kNumDelaySources> total_delayed_{}; std::atomic total_compaction_pressure_; // Number of bytes allowed to write without delay - uint64_t credit_in_bytes_; + uint64_t credit_in_bytes_ = 0U; // Next time that we can add more credit of bytes - uint64_t next_refill_time_; + uint64_t next_refill_time_ = 0U; // Write rate set when initialization or by `DBImpl::SetDBOptions` - uint64_t max_delayed_write_rate_; + uint64_t max_delayed_write_rate_ = 0U; // Current write rate (bytes / second) - uint64_t delayed_write_rate_; + std::array delayed_write_rates_{}; + uint64_t delayed_write_rate_ = 0U; std::unique_ptr low_pri_rate_limiter_; }; @@ -132,9 +202,13 @@ class StopWriteToken : public WriteControllerToken { class DelayWriteToken : public WriteControllerToken { public: - explicit DelayWriteToken(WriteController* controller) - : WriteControllerToken(controller) {} + explicit DelayWriteToken(WriteController* controller, + WriteController::DelaySource source) + : WriteControllerToken(controller), 
source_(source) {} virtual ~DelayWriteToken(); + + private: + WriteController::DelaySource source_; }; class CompactionPressureToken : public WriteControllerToken { diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index 1f7cf999aa..afba9a81f2 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -34,45 +34,143 @@ class WriteControllerTest : public testing::Test { #define SECS MILLION // in microseconds TEST_F(WriteControllerTest, BasicAPI) { - WriteController controller(40 MBPS); // also set max delayed rate - EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + constexpr uint64_t kMaxDelayedWriteRate = 40 MBPS; + WriteController controller( + kMaxDelayedWriteRate); // also set max delayed rate + + EXPECT_EQ(controller.delayed_write_rate(), kMaxDelayedWriteRate); + EXPECT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + kMaxDelayedWriteRate); + EXPECT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + kMaxDelayedWriteRate); + EXPECT_FALSE(controller.IsStopped()); EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_FALSE(controller.NeedsDelay(WriteController::DelaySource::kCF)); + EXPECT_FALSE(controller.NeedsDelay(WriteController::DelaySource::kWBM)); EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); // set, get - controller.set_delayed_write_rate(20 MBPS); - EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + controller.set_delayed_write_rate(WriteController::DelaySource::kCF, 20 MBPS); + ASSERT_EQ(controller.delayed_write_rate(), kMaxDelayedWriteRate); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + 20 MBPS); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + kMaxDelayedWriteRate); + EXPECT_FALSE(controller.IsStopped()); EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_FALSE(controller.NeedsDelay(WriteController::DelaySource::kCF)); + EXPECT_FALSE(controller.NeedsDelay(WriteController::DelaySource::kWBM)); EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + controller.set_delayed_write_rate(WriteController::DelaySource::kWBM, + 30 MBPS); + ASSERT_EQ(controller.delayed_write_rate(), kMaxDelayedWriteRate); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + 20 MBPS); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + 30 MBPS); + + controller.set_delayed_write_rate(WriteController::DelaySource::kWBM, + 10 MBPS); + ASSERT_EQ(controller.delayed_write_rate(), kMaxDelayedWriteRate); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + 20 MBPS); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + 10 MBPS); + + controller.set_delayed_write_rate(WriteController::DelaySource::kWBM, + 35 MBPS); + ASSERT_EQ(controller.delayed_write_rate(), kMaxDelayedWriteRate); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + 20 MBPS); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + 35 MBPS); + + controller.set_delayed_write_rate(WriteController::DelaySource::kWBM, + controller.max_delayed_write_rate()); + ASSERT_EQ(controller.delayed_write_rate(), kMaxDelayedWriteRate); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + 20 MBPS); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + kMaxDelayedWriteRate); + + // Over the max should set to max + controller.set_delayed_write_rate(WriteController::DelaySource::kCF, + 2 * 
kMaxDelayedWriteRate); + ASSERT_EQ(controller.delayed_write_rate(), kMaxDelayedWriteRate); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + kMaxDelayedWriteRate); + ASSERT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + kMaxDelayedWriteRate); + { // set with token, get - auto delay_token_0 = controller.GetDelayToken(10 MBPS); + auto delay_token_0 = + controller.GetDelayToken(WriteController::DelaySource::kCF, 10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + 10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + kMaxDelayedWriteRate); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + + delay_token_0.reset(); + ASSERT_EQ(controller.delayed_write_rate(), kMaxDelayedWriteRate); + EXPECT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kCF), + 10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(WriteController::DelaySource::kWBM), + kMaxDelayedWriteRate); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 20 MB)); + + delay_token_0 = + controller.GetDelayToken(WriteController::DelaySource::kCF, 10 MBPS); EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS); EXPECT_FALSE(controller.IsStopped()); EXPECT_TRUE(controller.NeedsDelay()); + // test with delay EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB)); clock_->now_micros_ += 2 SECS; // pay the "debt" - auto delay_token_1 = controller.GetDelayToken(2 MBPS); + auto delay_token_1 = + controller.GetDelayToken(WriteController::DelaySource::kCF, 2 MBPS); EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB)); clock_->now_micros_ += 10 SECS; // pay the "debt" - auto delay_token_2 = controller.GetDelayToken(1 MBPS); + auto delay_token_2 = + controller.GetDelayToken(WriteController::DelaySource::kWBM, 1 MBPS); EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB)); clock_->now_micros_ += 20 SECS; // pay the "debt" - auto delay_token_3 = controller.GetDelayToken(20 MBPS); + auto delay_token_3 = + controller.GetDelayToken(WriteController::DelaySource::kCF, 20 MBPS); + EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 20 SECS; // pay the "debt" + + auto delay_token_4 = + controller.GetDelayToken(WriteController::DelaySource::kWBM, 20 MBPS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 1 SECS; // pay the "debt" + + auto delay_token_5 = + controller.GetDelayToken(WriteController::DelaySource::kWBM, 30 MBPS); EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB)); clock_->now_micros_ += 1 SECS; // pay the "debt" - // 60M is more than the max rate of 40M. Max rate will be used. EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); - auto delay_token_4 = - controller.GetDelayToken(controller.delayed_write_rate() * 3); + + auto delay_token_6 = controller.GetDelayToken( + WriteController::DelaySource::kCF, controller.delayed_write_rate() * 3); + auto delay_token_7 = + controller.GetDelayToken(WriteController::DelaySource::kWBM, + controller.delayed_write_rate() * 3); + + // 60M is more than the max rate of 40M. Max rate will be used. 
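The expectations in this test follow one rule from the new write_controller.h above: the effective delayed write rate is the smallest rate among the sources that currently hold delay tokens, clamped to max_delayed_write_rate(). A minimal sketch of that rule, assuming a locally constructed controller (the DB mutex discipline noted in the header is omitted here); SketchPerSourceDelays is a hypothetical helper and the 40/20/5 MB figures are arbitrary:

void SketchPerSourceDelays() {
  constexpr uint64_t kMB = 1024 * 1024;
  WriteController controller(40 * kMB /* max delayed write rate, bytes/s */);
  auto cf_token =
      controller.GetDelayToken(WriteController::DelaySource::kCF, 20 * kMB);
  auto wbm_token =
      controller.GetDelayToken(WriteController::DelaySource::kWBM, 5 * kMB);
  // Both sources hold tokens, so the slower one (kWBM at 5 MB/s) wins.
  assert(controller.delayed_write_rate() == 5 * kMB);
  wbm_token.reset();
  // Only kCF still holds a token; its 20 MB/s setting takes effect again.
  assert(controller.delayed_write_rate() == 20 * kMB);
  cf_token.reset();
  // No tokens left: no delay is applied at all.
  assert(!controller.NeedsDelay());
}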
EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); EXPECT_EQ(static_cast(0.5 SECS), controller.GetDelay(clock_.get(), 20 MB)); @@ -111,8 +209,8 @@ TEST_F(WriteControllerTest, StartFilled) { // Attempt to write two things that combined would be allowed within // a single refill interval - auto delay_token_0 = - controller.GetDelayToken(controller.delayed_write_rate()); + auto delay_token_0 = controller.GetDelayToken( + WriteController::DelaySource::kCF, controller.delayed_write_rate()); // Verify no delay because write rate has not been exceeded within // refill interval. @@ -142,7 +240,8 @@ TEST_F(WriteControllerTest, DebtAccumulation) { // would reset the debt on every GetDelayToken.) uint64_t debt = 0; for (unsigned i = 0; i < tokens.size(); ++i) { - tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + tokens[i] = controller.GetDelayToken(WriteController::DelaySource::kCF, + (i + 1u) MBPS); uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); ASSERT_GT(delay, debt); uint64_t incremental = delay - debt; @@ -159,7 +258,8 @@ TEST_F(WriteControllerTest, DebtAccumulation) { // Debt is accumulated in time, not in bytes, so this new write // limit is not applied to prior requested delays, even it they are // in progress. - tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + tokens[i] = controller.GetDelayToken(WriteController::DelaySource::kCF, + (i + 1u) MBPS); uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); ASSERT_GT(delay, debt); uint64_t incremental = delay - debt; @@ -187,7 +287,8 @@ TEST_F(WriteControllerTest, DebtAccumulation) { } // All tokens released. // Verify that releasing all tokens pays down debt, even with no time passage. - tokens[0] = controller.GetDelayToken(1 MBPS); + tokens[0] = + controller.GetDelayToken(WriteController::DelaySource::kCF, 1 MBPS); ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); } @@ -198,7 +299,8 @@ TEST_F(WriteControllerTest, CreditAccumulation) { std::array, 10> tokens; // Ensure started - tokens[0] = controller.GetDelayToken(1 MBPS); + tokens[0] = + controller.GetDelayToken(WriteController::DelaySource::kCF, 1 MBPS); ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); clock_->now_micros_ += 10 SECS; @@ -208,7 +310,8 @@ TEST_F(WriteControllerTest, CreditAccumulation) { // Spend some credit (burst of I/O) for (unsigned i = 0; i < tokens.size(); ++i) { - tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + tokens[i] = controller.GetDelayToken(WriteController::DelaySource::kCF, + (i + 1u) MBPS); ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB)); // In WriteController, credit is accumulated in bytes, not in time. // After an "unnecessary" delay, all of our time credit will be @@ -218,7 +321,8 @@ TEST_F(WriteControllerTest, CreditAccumulation) { credit -= 63 MB; } // Spend remaining credit - tokens[0] = controller.GetDelayToken(1 MBPS); + tokens[0] = + controller.GetDelayToken(WriteController::DelaySource::kCF, 1 MBPS); ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit)); // Verify ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); @@ -235,7 +339,8 @@ TEST_F(WriteControllerTest, CreditAccumulation) { // All tokens released. // Verify credit is wiped away on new delay. 
- tokens[0] = controller.GetDelayToken(1 MBPS); + tokens[0] = + controller.GetDelayToken(WriteController::DelaySource::kCF, 1 MBPS); ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); } diff --git a/db_stress_tool/CMakeLists.txt b/db_stress_tool/CMakeLists.txt index 96d70dd0e1..604bca596d 100644 --- a/db_stress_tool/CMakeLists.txt +++ b/db_stress_tool/CMakeLists.txt @@ -13,5 +13,5 @@ add_executable(db_stress${ARTIFACT_SUFFIX} expected_state.cc multi_ops_txns_stress.cc no_batched_ops_stress.cc) -target_link_libraries(db_stress${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) +target_link_libraries(db_stress${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${TESTUTILLIB} ${THIRDPARTY_LIBS}) list(APPEND tool_deps db_stress) diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index 0d5e07632a..8fb5d354f7 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -29,7 +29,6 @@ enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression; enum ROCKSDB_NAMESPACE::ChecksumType checksum_type_e = ROCKSDB_NAMESPACE::kCRC32c; -enum RepFactory FLAGS_rep_factory = kSkipList; std::vector sum_probs(100001); constexpr int64_t zipf_sum_size = 100000; diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index d2b1ae4fc8..51e5e6357f 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -146,6 +146,7 @@ DECLARE_bool(allow_concurrent_memtable_write); DECLARE_double(experimental_mempurge_threshold); DECLARE_bool(enable_write_thread_adaptive_yield); DECLARE_int32(reopen); +DECLARE_string(filter_uri); DECLARE_double(bloom_bits); DECLARE_bool(use_block_based_filter); DECLARE_int32(ribbon_starting_level); @@ -238,6 +239,8 @@ DECLARE_int32(verify_db_one_in); DECLARE_int32(continuous_verification_interval); DECLARE_int32(get_property_one_in); DECLARE_string(file_checksum_impl); +DECLARE_int32(data_block_index_type); +DECLARE_double(data_block_hash_table_util_ratio); #ifndef ROCKSDB_LITE // Options for StackableDB-based BlobDB @@ -305,24 +308,6 @@ extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e; extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e; extern enum ROCKSDB_NAMESPACE::ChecksumType checksum_type_e; -enum RepFactory { kSkipList, kHashSkipList, kVectorRep }; - -inline enum RepFactory StringToRepFactory(const char* ctype) { - assert(ctype); - - if (!strcasecmp(ctype, "skip_list")) - return kSkipList; - else if (!strcasecmp(ctype, "prefix_hash")) - return kHashSkipList; - else if (!strcasecmp(ctype, "vector")) - return kVectorRep; - - fprintf(stdout, "Cannot parse memreptable %s\n", ctype); - return kSkipList; -} - -extern enum RepFactory FLAGS_rep_factory; - namespace ROCKSDB_NAMESPACE { inline enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType( const char* ctype) { diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 8b92f93321..948789d144 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -56,11 +56,10 @@ DEFINE_double( DEFINE_string( options_file, "", - "The path to a RocksDB options file. If specified, then db_stress will " - "run with the RocksDB options in the default column family of the " - "specified options file. Note that, when an options file is provided, " - "db_stress will ignore the flag values for all options that may be passed " - "via options file."); + "The path to an options file. 
If specified, then db_stress will run with " + "the options in the default column family of the specified options file. " + "Note that, when an options file is provided, db_stress will ignore the " + "flag values for all options that may be passed via options file."); DEFINE_int64( active_width, 0, @@ -88,7 +87,7 @@ DEFINE_bool(atomic_flush, false, DEFINE_bool(test_cf_consistency, false, "If set, runs the stress test dedicated to verifying writes to " "multiple column families are consistent. Setting this implies " - "`atomic_flush=true` is set true if `disable_wal=false`.\n"); + "`atomic_flush=true` is set true if `disable_wal=true`.\n"); DEFINE_bool(test_multi_ops_txns, false, "If set, runs stress test dedicated to verifying multi-ops " @@ -227,7 +226,7 @@ DEFINE_int32( DEFINE_bool(disable_auto_compactions, ROCKSDB_NAMESPACE::Options().disable_auto_compactions, - "If true, RocksDB internally will not trigger compactions."); + "If true, compactions will not be triggered internally."); DEFINE_int32(max_background_compactions, ROCKSDB_NAMESPACE::Options().max_background_compactions, @@ -439,6 +438,7 @@ DEFINE_int32(reopen, 10, "Number of times database reopens"); static const bool FLAGS_reopen_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive); +DEFINE_string(filter_uri, "", "Filter Policy URI"); DEFINE_double(bloom_bits, 10, "Bloom filter bits per key. " "Negative means use default settings."); @@ -927,4 +927,12 @@ DEFINE_bool( DEFINE_string(wal_compression, "none", "Algorithm to use for WAL compression. none to disable."); +DEFINE_int32(data_block_index_type, 1, + "The index type that will be used for the data block " + "0 for kDataBlockBinarySearch and 1 for kDataBlockBinaryAndHash."); + +DEFINE_double(data_block_hash_table_util_ratio, 0.75, + "util ratio for data block hash index table. 
" + "This is only valid if use_data_block_hash_index is " + "set to true"); #endif // GFLAGS diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h index e7eb4aade2..8f58f30f29 100644 --- a/db_stress_tool/db_stress_shared_state.h +++ b/db_stress_tool/db_stress_shared_state.h @@ -383,7 +383,7 @@ struct ThreadState { // The value of the Get std::string value; // optional state of all keys in the db - std::vector* key_vec; + std::unique_ptr> key_vec; std::string timestamp; }; diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 9fde2ecd71..5de5016b7b 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -19,6 +19,7 @@ #include "rocksdb/sst_file_manager.h" #include "rocksdb/types.h" #include "rocksdb/utilities/object_registry.h" +#include "speedb/version.h" #include "test_util/testutil.h" #include "util/cast_util.h" #include "utilities/backup/backup_engine_impl.h" @@ -30,27 +31,47 @@ namespace ROCKSDB_NAMESPACE { namespace { std::shared_ptr CreateFilterPolicy() { - if (FLAGS_bloom_bits < 0) { - return BlockBasedTableOptions().filter_policy; - } - const FilterPolicy* new_policy; - if (FLAGS_use_block_based_filter) { - if (FLAGS_ribbon_starting_level < 999) { - fprintf( - stderr, - "Cannot combine use_block_based_filter and ribbon_starting_level\n"); + if (!FLAGS_filter_uri.empty()) { + ConfigOptions config_options; + std::shared_ptr policy; + config_options.ignore_unsupported_options = false; + std::string bits_str; + if (FLAGS_bloom_bits > 0) { + bits_str = ":" + FormatDoubleParam(FLAGS_bloom_bits); + fprintf(stderr, "note: appending --bloom-bits (%f) to --filter-uri\n", + FLAGS_bloom_bits); + } + Status s = FilterPolicy::CreateFromString( + config_options, FLAGS_filter_uri + bits_str, &policy); + if (!s.ok() || !policy) { + fprintf(stderr, "Cannot create filter policy(%s%s): %s\n", + FLAGS_filter_uri.c_str(), bits_str.c_str(), s.ToString().c_str()); exit(1); - } else { - new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, true); } - } else if (FLAGS_ribbon_starting_level >= 999) { - // Use Bloom API - new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, false); + return policy; + } else if (FLAGS_bloom_bits < 0) { + return BlockBasedTableOptions().filter_policy; } else { - new_policy = NewRibbonFilterPolicy( - FLAGS_bloom_bits, /* bloom_before_level */ FLAGS_ribbon_starting_level); + const FilterPolicy* new_policy; + if (FLAGS_use_block_based_filter) { + if (FLAGS_ribbon_starting_level < 999) { + fprintf(stderr, + "Cannot combine use_block_based_filter and " + "ribbon_starting_level\n"); + exit(1); + } else { + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, true); + } + } else if (FLAGS_ribbon_starting_level >= 999) { + // Use Bloom API + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, false); + } else { + new_policy = NewRibbonFilterPolicy( + FLAGS_bloom_bits, + /* bloom_before_level */ FLAGS_ribbon_starting_level); + } + return std::shared_ptr(new_policy); } - return std::shared_ptr(new_policy); } } // namespace @@ -393,6 +414,44 @@ bool StressTest::VerifySecondaries() { return true; } +static std::vector GetKeyBitVec(DB* db, const ReadOptions& ropt_base) { + ReadOptions ropt = ropt_base; + // When `prefix_extractor` is set, seeking to beginning and scanning + // across prefixes are only supported with `total_order_seek` set. 
+ ropt.total_order_seek = true; + std::unique_ptr iterator(db->NewIterator(ropt)); + + std::vector key_bitvec; + if (FLAGS_test_batches_snapshots) { + // In batched snapshot mode each key/value is inserted 10 times, where + // the key and the values are prefixed with a single ASCII digit in the + // range 0-9. + key_bitvec.resize(FLAGS_max_key * 10); + } else { + key_bitvec.resize(FLAGS_max_key); + } + + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + uint64_t key_offset = 0; + Slice key_str = iterator->key(); + // In batched snapshot mode each key operation is actually 10 operations in + // a single batch, as each operation creates 10 keys from each key by + // prefixing it with an ASCII digit in the range 0-9. + if (FLAGS_test_batches_snapshots) { + const char batch_id = key_str[0]; + assert(batch_id >= '0' && batch_id <= '9'); + key_offset = (batch_id - '0') * FLAGS_max_key; + key_str.remove_prefix(1); + } + + uint64_t key_val; + if (GetIntVal(key_str.ToString(), &key_val)) { + key_bitvec.at(key_offset + key_val) = true; + } + } + return key_bitvec; +} + Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf, ThreadState::SnapshotState& snap_state) { Status s; @@ -415,7 +474,8 @@ Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf, if (!s.ok() && !s.IsNotFound()) { return s; } - if (snap_state.status != s) { + if (snap_state.status.code() != s.code() || + snap_state.status.subcode() != s.subcode()) { return Status::Corruption( "The snapshot gave inconsistent results for key " + ToString(Hash(snap_state.key.c_str(), snap_state.key.size(), 0)) + @@ -430,20 +490,9 @@ Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf, } } if (snap_state.key_vec != nullptr) { - // When `prefix_extractor` is set, seeking to beginning and scanning - // across prefixes are only supported with `total_order_seek` set. 
- ropt.total_order_seek = true; - std::unique_ptr iterator(db->NewIterator(ropt)); - std::unique_ptr> tmp_bitvec( - new std::vector(FLAGS_max_key)); - for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { - uint64_t key_val; - if (GetIntVal(iterator->key().ToString(), &key_val)) { - (*tmp_bitvec.get())[key_val] = true; - } - } + std::vector tmp_bitvec = GetKeyBitVec(db, ropt); if (!std::equal(snap_state.key_vec->begin(), snap_state.key_vec->end(), - tmp_bitvec.get()->begin())) { + tmp_bitvec.begin())) { return Status::Corruption("Found inconsistent keys at this snapshot"); } } @@ -703,7 +752,6 @@ void StressTest::OperateDb(ThreadState* thread) { MutexLock l(thread->shared->GetMutex()); while (!thread->snapshot_queue.empty()) { db_->ReleaseSnapshot(thread->snapshot_queue.front().second.snapshot); - delete thread->snapshot_queue.front().second.key_vec; thread->snapshot_queue.pop(); } thread->shared->IncVotedReopen(); @@ -973,7 +1021,6 @@ void StressTest::OperateDb(ThreadState* thread) { } while (!thread->snapshot_queue.empty()) { db_->ReleaseSnapshot(thread->snapshot_queue.front().second.snapshot); - delete thread->snapshot_queue.front().second.key_vec; thread->snapshot_queue.pop(); } @@ -1395,9 +1442,7 @@ Status StressTest::TestBackupRestore( const std::vector& /* rand_column_families */, const std::vector& /* rand_keys */) { assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "TestBackupRestore\n"); + fprintf(stderr, "TestBackupRestore is not supported in LITE mode\n"); std::terminate(); } @@ -1406,18 +1451,14 @@ Status StressTest::TestCheckpoint( const std::vector& /* rand_column_families */, const std::vector& /* rand_keys */) { assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "TestCheckpoint\n"); + fprintf(stderr, "TestCheckpoint is not supported in LITE mode\n"); std::terminate(); } void StressTest::TestCompactFiles(ThreadState* /* thread */, ColumnFamilyHandle* /* column_family */) { assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "CompactFiles\n"); + fprintf(stderr, "CompactFiles is not supported in LITE mode\n"); std::terminate(); } #else // ROCKSDB_LITE @@ -2004,27 +2045,18 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, std::vector* key_vec = nullptr; if (FLAGS_compare_full_db_state_snapshot && (thread->tid == 0)) { - key_vec = new std::vector(FLAGS_max_key); - // When `prefix_extractor` is set, seeking to beginning and scanning - // across prefixes are only supported with `total_order_seek` set. 
- ropt.total_order_seek = true; - std::unique_ptr iterator(db_->NewIterator(ropt)); - for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { - uint64_t key_val; - if (GetIntVal(iterator->key().ToString(), &key_val)) { - (*key_vec)[key_val] = true; - } - } - } - - ThreadState::SnapshotState snap_state = {snapshot, - rand_column_family, - column_family->GetName(), - keystr, - status_at, - value_at, - key_vec, - ts_str}; + key_vec = new std::vector(GetKeyBitVec(db_, ropt)); + } + + ThreadState::SnapshotState snap_state = { + snapshot, + rand_column_family, + column_family->GetName(), + keystr, + status_at, + value_at, + std::unique_ptr>(key_vec), + ts_str}; uint64_t hold_for = FLAGS_snapshot_hold_ops; if (FLAGS_long_running_snapshots) { // Hold 10% of snapshots for 10x more @@ -2039,20 +2071,19 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, } } uint64_t release_at = std::min(FLAGS_ops_per_thread - 1, i + hold_for); - thread->snapshot_queue.emplace(release_at, snap_state); + thread->snapshot_queue.emplace(release_at, std::move(snap_state)); } Status StressTest::MaybeReleaseSnapshots(ThreadState* thread, uint64_t i) { while (!thread->snapshot_queue.empty() && i >= thread->snapshot_queue.front().first) { - auto snap_state = thread->snapshot_queue.front().second; + auto& snap_state = thread->snapshot_queue.front().second; assert(snap_state.snapshot); // Note: this is unsafe as the cf might be dropped concurrently. But // it is ok since unclean cf drop is cunnrently not supported by write // prepared transactions. Status s = AssertSame(db_, column_families_[snap_state.cf_at], snap_state); db_->ReleaseSnapshot(snap_state.snapshot); - delete snap_state.key_vec; thread->snapshot_queue.pop(); if (!s.ok()) { return s; @@ -2158,8 +2189,8 @@ uint32_t StressTest::GetRangeHash(ThreadState* thread, const Snapshot* snapshot, } void StressTest::PrintEnv() const { - fprintf(stdout, "RocksDB version : %d.%d\n", kMajorVersion, - kMinorVersion); + fprintf(stdout, "Speedb version : %s\n", + GetSpeedbVersionAsString(false).c_str()); fprintf(stdout, "Format version : %d\n", FLAGS_format_version); fprintf(stdout, "TransactionDB : %s\n", FLAGS_use_txn ? "true" : "false"); @@ -2242,25 +2273,17 @@ void StressTest::PrintEnv() const { FLAGS_file_checksum_impl.c_str()); fprintf(stdout, "Bloom bits / key : %s\n", FormatDoubleParam(FLAGS_bloom_bits).c_str()); + if (!FLAGS_filter_uri.empty()) { + fprintf(stdout, "Filter Policy : %s\n", + FLAGS_filter_uri.c_str()); + } fprintf(stdout, "Max subcompactions : %" PRIu64 "\n", FLAGS_subcompactions); fprintf(stdout, "Use MultiGet : %s\n", FLAGS_use_multiget ? 
"true" : "false"); - const char* memtablerep = ""; - switch (FLAGS_rep_factory) { - case kSkipList: - memtablerep = "skip_list"; - break; - case kHashSkipList: - memtablerep = "prefix_hash"; - break; - case kVectorRep: - memtablerep = "vector"; - break; - } - - fprintf(stdout, "Memtablerep : %s\n", memtablerep); + fprintf(stdout, "Memtablerep : %s\n", + FLAGS_memtablerep.c_str()); #ifndef NDEBUG KillPoint* kp = KillPoint::GetInstance(); @@ -2298,6 +2321,10 @@ void StressTest::PrintEnv() const { static_cast(FLAGS_user_timestamp_size)); fprintf(stdout, "WAL compression : %s\n", FLAGS_wal_compression.c_str()); + fprintf(stdout, "data_block_index_type : %d\n", + static_cast(FLAGS_data_block_index_type)); + fprintf(stdout, "data_block_hash_table_util_ratio : %f\n", + static_cast(FLAGS_data_block_hash_table_util_ratio)); fprintf(stdout, "------------------------------------------------\n"); } @@ -2338,6 +2365,11 @@ void StressTest::Open() { block_based_options.prepopulate_block_cache = static_cast( FLAGS_prepopulate_block_cache); + block_based_options.data_block_index_type = + static_cast( + FLAGS_data_block_index_type); + block_based_options.data_block_hash_table_util_ratio = + static_cast(FLAGS_data_block_hash_table_util_ratio); options_.table_factory.reset( NewBlockBasedTableFactory(block_based_options)); options_.db_write_buffer_size = FLAGS_db_write_buffer_size; @@ -2484,35 +2516,34 @@ void StressTest::Open() { exit(1); } } - - if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) { + if (strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash") == 0) { + // Needed to use a different default (10K vs 1M) + FLAGS_memtablerep = "prefix_hash:10000"; + } + std::unique_ptr factory; + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.ignore_unsupported_options = false; + Status status = MemTableRepFactory::CreateFromString( + config_options, FLAGS_memtablerep, &factory); + if (!status.ok() || !factory) { + fprintf(stderr, "MemTableFactory creation failed: %s\n", + status.ToString().c_str()); + exit(1); + } + options_.memtable_factory = std::move(factory); + if (FLAGS_prefix_size == 0 && + options_.memtable_factory->IsInstanceOf("prefix_hash")) { fprintf(stderr, "prefeix_size cannot be zero if memtablerep == prefix_hash\n"); exit(1); } - if (FLAGS_prefix_size != 0 && FLAGS_rep_factory != kHashSkipList) { + if (FLAGS_prefix_size != 0 && + !options_.memtable_factory->IsInstanceOf("prefix_hash")) { fprintf(stderr, "WARNING: prefix_size is non-zero but " "memtablerep != prefix_hash\n"); } - switch (FLAGS_rep_factory) { - case kSkipList: - // no need to do anything - break; -#ifndef ROCKSDB_LITE - case kHashSkipList: - options_.memtable_factory.reset(NewHashSkipListRepFactory(10000)); - break; - case kVectorRep: - options_.memtable_factory.reset(new VectorRepFactory()); - break; -#else - default: - fprintf(stderr, - "RocksdbLite only supports skip list mem table. 
Skip " - "--rep_factory\n"); -#endif // ROCKSDB_LITE - } if (FLAGS_use_full_merge_v1) { options_.merge_operator = MergeOperators::CreateDeprecatedPutOperator(); @@ -2804,7 +2835,7 @@ void StressTest::Open() { } } #else - fprintf(stderr, "Secondary is not supported in RocksDBLite\n"); + fprintf(stderr, "Secondary is not supported in LITE mode\n"); exit(1); #endif } @@ -2846,7 +2877,7 @@ void StressTest::Open() { } } #else - fprintf(stderr, "TTL is not supported in RocksDBLite\n"); + fprintf(stderr, "TTL is not supported in LITE mode\n"); exit(1); #endif } diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index 2a7430bfaa..32519a3c87 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -120,8 +120,6 @@ int db_stress_tool(int argc, char** argv) { } #endif - FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); - // The number of background threads should be at least as much the // max number of concurrent compactions. db_stress_env->SetBackgroundThreads(FLAGS_max_background_compactions, @@ -302,6 +300,11 @@ int db_stress_tool(int argc, char** argv) { std::vector weights; uint64_t scale_factor = FLAGS_key_window_scale_factor; key_gen_ctx.window = scale_factor * 100; + if (scale_factor == 0 || levels == 0) { + fprintf(stderr, + "max_key_len and key_window_scale_factor should be positive"); + exit(1); + } if (!FLAGS_key_len_percent_dist.empty()) { weights = SplitString(FLAGS_key_len_percent_dist); if (weights.size() != levels) { @@ -316,6 +319,10 @@ int db_stress_tool(int argc, char** argv) { uint64_t val = std::stoull(weight); key_gen_ctx.weights.emplace_back(val * scale_factor); total_weight += val; + if (val == 0) { + fprintf(stderr, "key_len_percent_dist cannot contain zero values"); + exit(1); + } } if (total_weight != 100) { fprintf(stderr, "Sum of all weights in key_len_dist should be 100"); @@ -323,6 +330,12 @@ int db_stress_tool(int argc, char** argv) { } } else { uint64_t keys_per_level = key_gen_ctx.window / levels; + if (keys_per_level == 0) { + fprintf( + stderr, + "max_key_len cannot be greater than key_window_scale_factor * 100"); + exit(1); + } for (unsigned int level = 0; level + 1 < levels; ++level) { key_gen_ctx.weights.emplace_back(keys_per_level); } diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index c14487da8a..11581a4746 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -812,9 +812,7 @@ class NonBatchedOpsStressTest : public StressTest { const std::vector& /* rand_keys */, std::unique_ptr& /* lock */) override { assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "TestIngestExternalFile\n"); + fprintf(stderr, "TestIngestExternalFile is not supported in LITE mode\n"); std::terminate(); } #else diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 40413b569f..fc26e7cf64 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -156,7 +156,7 @@ TEST_P(EnvBasicTestWithParam, Basics) { std::vector children; // Check that the directory is empty. 
- ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/non_existent")); + ASSERT_TRUE(env_->FileExists(test_dir_ + "/non_existent").IsNotFound()); ASSERT_TRUE(!env_->GetFileSize(test_dir_ + "/non_existent", &file_size).ok()); ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(0U, children.size()); @@ -194,7 +194,7 @@ TEST_P(EnvBasicTestWithParam, Basics) { ASSERT_TRUE( !env_->RenameFile(test_dir_ + "/non_existent", test_dir_ + "/g").ok()); ASSERT_OK(env_->RenameFile(test_dir_ + "/f1", test_dir_ + "/g")); - ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/f1")); + ASSERT_TRUE(env_->FileExists(test_dir_ + "/f1").IsNotFound()); ASSERT_OK(env_->FileExists(test_dir_ + "/g")); ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size)); ASSERT_EQ(3U, file_size); @@ -218,7 +218,7 @@ TEST_P(EnvBasicTestWithParam, Basics) { // Check that deleting works. ASSERT_NOK(env_->DeleteFile(test_dir_ + "/non_existent")); ASSERT_OK(env_->DeleteFile(test_dir_ + "/g")); - ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g")); + ASSERT_TRUE(env_->FileExists(test_dir_ + "/g").IsNotFound()); ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(0U, children.size()); Status s = env_->GetChildren(test_dir_ + "/non_existent", &children); @@ -324,7 +324,7 @@ TEST_P(EnvMoreTestWithParam, MakeDir) { ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok()); ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/j")); ASSERT_OK(env_->DeleteDir(test_dir_ + "/j")); - ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/j")); + ASSERT_TRUE(env_->FileExists(test_dir_ + "/j").IsNotFound()); } TEST_P(EnvMoreTestWithParam, GetChildren) { diff --git a/env/fs_posix.cc b/env/fs_posix.cc index 17095e92cb..99a171552c 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -164,8 +164,7 @@ class PosixFileSystem : public FileSystem { if (options.use_direct_reads && !options.use_mmap_reads) { #ifdef ROCKSDB_LITE - return IOStatus::IOError(fname, - "Direct I/O not supported in RocksDB lite"); + return IOStatus::IOError(fname, "Direct I/O not supported in LITE mode"); #endif // !ROCKSDB_LITE #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) flags |= O_DIRECT; @@ -219,8 +218,7 @@ class PosixFileSystem : public FileSystem { if (options.use_direct_reads && !options.use_mmap_reads) { #ifdef ROCKSDB_LITE - return IOStatus::IOError(fname, - "Direct I/O not supported in RocksDB lite"); + return IOStatus::IOError(fname, "Direct I/O not supported in LITE mode"); #endif // !ROCKSDB_LITE #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) flags |= O_DIRECT; @@ -296,8 +294,7 @@ class PosixFileSystem : public FileSystem { // offset. 
// More info here: https://linux.die.net/man/2/pwrite #ifdef ROCKSDB_LITE - return IOStatus::IOError(fname, - "Direct I/O not supported in RocksDB lite"); + return IOStatus::IOError(fname, "Direct I/O not supported in LITE mode"); #endif // ROCKSDB_LITE flags |= O_WRONLY; #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) @@ -388,8 +385,7 @@ class PosixFileSystem : public FileSystem { // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX) if (options.use_direct_writes && !options.use_mmap_writes) { #ifdef ROCKSDB_LITE - return IOStatus::IOError(fname, - "Direct I/O not supported in RocksDB lite"); + return IOStatus::IOError(fname, "Direct I/O not supported in LITE mode"); #endif // !ROCKSDB_LITE flags |= O_WRONLY; #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) @@ -1118,6 +1114,19 @@ class PosixFileSystem : public FileSystem { #endif } + // TODO akanksha: Look into flags and see how to provide support for AbortIO + // in posix for IOUring requests. Currently it calls Poll to wait for requests + // to complete the request. + virtual IOStatus AbortIO(std::vector& io_handles) override { + IOStatus s = Poll(io_handles, io_handles.size()); + // If Poll is not supported then it didn't submit any request and it should + // return OK. + if (s.IsNotSupported()) { + return IOStatus::OK(); + } + return s; + } + #if defined(ROCKSDB_IOURING_PRESENT) // io_uring instance std::unique_ptr thread_local_io_urings_; diff --git a/examples/Makefile b/examples/Makefile index 2fd162136c..a99778d2b5 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,5 +1,8 @@ include ../make_config.mk +PROJECT_NAME?=speedb +LIBNAME?=lib$(PROJECT_NAME) + ifndef DISABLE_JEMALLOC ifdef JEMALLOC PLATFORM_CXXFLAGS += -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE @@ -14,42 +17,42 @@ endif CFLAGS += -Wstrict-prototypes -.PHONY: clean librocksdb +.PHONY: clean static_lib all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example -simple_example: librocksdb simple_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +simple_example: static_lib simple_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../$(LIBNAME).a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -column_families_example: librocksdb column_families_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +column_families_example: static_lib column_families_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../$(LIBNAME).a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -compaction_filter_example: librocksdb compaction_filter_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +compaction_filter_example: static_lib compaction_filter_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../$(LIBNAME).a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -compact_files_example: librocksdb compact_files_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +compact_files_example: static_lib compact_files_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../$(LIBNAME).a 
-I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) .c.o: $(CC) $(CFLAGS) -c $< -o $@ -I../include -c_simple_example: librocksdb c_simple_example.o - $(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS) +c_simple_example: static_lib c_simple_example.o + $(CXX) $@.o -o$@ ../$(LIBNAME).a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS) -optimistic_transaction_example: librocksdb optimistic_transaction_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +optimistic_transaction_example: static_lib optimistic_transaction_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../$(LIBNAME).a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -transaction_example: librocksdb transaction_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +transaction_example: static_lib transaction_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../$(LIBNAME).a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -options_file_example: librocksdb options_file_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +options_file_example: static_lib options_file_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../$(LIBNAME).a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -multi_processes_example: librocksdb multi_processes_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +multi_processes_example: static_lib multi_processes_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../$(LIBNAME).a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) clean: rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example -librocksdb: - cd .. && $(MAKE) static_lib +static_lib: + LIBNAME="$(LIBNAME)" $(MAKE) -C .. static_lib diff --git a/examples/optimistic_transaction_example.cc b/examples/optimistic_transaction_example.cc index e0398f66e5..9f1c399ff5 100644 --- a/examples/optimistic_transaction_example.cc +++ b/examples/optimistic_transaction_example.cc @@ -158,6 +158,7 @@ int main() { // Set a new snapshot in the transaction txn->SetSnapshot(); + db->ReleaseSnapshot(read_options.snapshot); read_options.snapshot = db->GetSnapshot(); // Do some reads and writes to key "y" @@ -172,6 +173,7 @@ int main() { assert(s.ok()); delete txn; // Clear snapshot from read options since it is no longer valid + db->ReleaseSnapshot(read_options.snapshot); read_options.snapshot = nullptr; // txn is committed, read the latest values. diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index 6d98f3b534..222166876d 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -229,6 +229,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, // second buffer. 
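The optimistic_transaction_example change above fixes a snapshot leak: read_options.snapshot was overwritten (and later cleared) without releasing the snapshot it still pointed to. A minimal sketch of the rule the fix enforces; the helper name is illustrative:

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Every DB::GetSnapshot() must be paired with DB::ReleaseSnapshot(), so
// release whatever ReadOptions currently holds before replacing it.
void RefreshReadSnapshot(rocksdb::DB* db, rocksdb::ReadOptions* read_options) {
  if (read_options->snapshot != nullptr) {
    db->ReleaseSnapshot(read_options->snapshot);
  }
  read_options->snapshot = db->GetSnapshot();
}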
std::vector handles; handles.emplace_back(io_handle_); + StopWatch sw(clock_, stats_, POLL_WAIT_MICROS); fs_->Poll(handles, 1).PermitUncheckedError(); } @@ -281,7 +282,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) { offset += length; length = 0; - prefetch_size -= length; + prefetch_size = readahead_size; } // Data is overlapping i.e. some of the data is in curr_ buffer and remaining // in second buffer. @@ -310,7 +311,8 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, // sync prefetching and copy the remaining data to third buffer in the end. // swap the buffers. curr_ = curr_ ^ 1; - prefetch_size -= length; + // Update prefetch_size as length has been updated in CopyDataToBuffer. + prefetch_size = length + readahead_size; } // Update second again if swap happened. diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index 78ed3153c7..94d09bba43 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -14,6 +14,7 @@ #include #include "file/readahead_file_info.h" +#include "monitoring/statistics.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" @@ -64,7 +65,8 @@ class FilePrefetchBuffer { FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0, bool enable = true, bool track_min_offset = false, bool implicit_auto_readahead = false, - bool async_io = false, FileSystem* fs = nullptr) + bool async_io = false, FileSystem* fs = nullptr, + SystemClock* clock = nullptr, Statistics* stats = nullptr) : curr_(0), readahead_size_(readahead_size), initial_auto_readahead_size_(readahead_size), @@ -80,7 +82,9 @@ class FilePrefetchBuffer { del_fn_(nullptr), async_read_in_progress_(false), async_io_(async_io), - fs_(fs) { + fs_(fs), + clock_(clock), + stats_(stats) { // If async_io_ is enabled, data is asynchronously filled in second buffer // while curr_ is being consumed. If data is overlapping in two buffers, // data is copied to third buffer to return continuous buffer. @@ -88,12 +92,24 @@ class FilePrefetchBuffer { } ~FilePrefetchBuffer() { - // Wait for any pending async job before destroying the class object. + // Abort any pending async read request before destroying the class object. if (async_read_in_progress_ && fs_ != nullptr) { std::vector handles; handles.emplace_back(io_handle_); - fs_->Poll(handles, 1).PermitUncheckedError(); + Status s = fs_->AbortIO(handles); + assert(s.ok()); } + + // Prefetch buffer bytes discarded. + uint64_t bytes_discarded = 0; + if (bufs_[curr_].buffer_.CurrentSize() != 0) { + bytes_discarded = bufs_[curr_].buffer_.CurrentSize(); + } + if (bufs_[curr_ ^ 1].buffer_.CurrentSize() != 0) { + bytes_discarded += bufs_[curr_ ^ 1].buffer_.CurrentSize(); + } + RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded); + // Release io_handle_. 
if (io_handle_ != nullptr && del_fn_ != nullptr) { del_fn_(io_handle_); @@ -272,5 +288,7 @@ class FilePrefetchBuffer { bool async_read_in_progress_; bool async_io_; FileSystem* fs_; + SystemClock* clock_; + Statistics* stats_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index 20e569568d..c6287961cf 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -1288,6 +1288,10 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) { { HistogramData async_read_bytes; options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + HistogramData prefetched_bytes_discarded; + options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED, + &prefetched_bytes_discarded); + // Not all platforms support iouring. In that case, ReadAsync in posix // won't submit async requests. if (read_async_called) { @@ -1295,6 +1299,7 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) { } else { ASSERT_EQ(async_read_bytes.count, 0); } + ASSERT_GT(prefetched_bytes_discarded.count, 0); } } diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h index bfc7563888..f9f6e5bd07 100644 --- a/file/writable_file_writer.h +++ b/file/writable_file_writer.h @@ -160,7 +160,9 @@ class WritableFileWriter { bool perform_data_verification_; uint32_t buffered_data_crc32c_checksum_; bool buffered_data_with_checksum_; +#ifndef ROCKSDB_LITE Temperature temperature_; +#endif // ROCKSDB_LITE public: WritableFileWriter( @@ -191,8 +193,10 @@ class WritableFileWriter { checksum_finalized_(false), perform_data_verification_(perform_data_verification), buffered_data_crc32c_checksum_(0), - buffered_data_with_checksum_(buffered_data_with_checksum), - temperature_(options.temperature) { + buffered_data_with_checksum_(buffered_data_with_checksum) { +#ifndef ROCKSDB_LITE + temperature_ = options.temperature; +#endif // ROCKSDB_LITE assert(!use_direct_io() || max_buffer_size_ > 0); TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", reinterpret_cast(max_buffer_size_)); diff --git a/fuzz/Makefile b/fuzz/Makefile index b830405049..57c609e571 100644 --- a/fuzz/Makefile +++ b/fuzz/Makefile @@ -7,11 +7,11 @@ ROOT_DIR = $(abspath $(shell pwd)/../) include $(ROOT_DIR)/make_config.mk -PROTOBUF_CFLAGS = `pkg-config --cflags protobuf` -PROTOBUF_LDFLAGS = `pkg-config --libs protobuf` +PROTOBUF_CFLAGS = $(shell pkg-config --cflags protobuf) +PROTOBUF_LDFLAGS = $(shell pkg-config --libs protobuf) -PROTOBUF_MUTATOR_CFLAGS = `pkg-config --cflags libprotobuf-mutator` -PROTOBUF_MUTATOR_LDFLAGS = `pkg-config --libs libprotobuf-mutator` +PROTOBUF_MUTATOR_CFLAGS = $(shell pkg-config --cflags libprotobuf-mutator) +PROTOBUF_MUTATOR_LDFLAGS = $(shell pkg-config --libs libprotobuf-mutator) ROCKSDB_INCLUDE_DIR = $(ROOT_DIR)/include ROCKSDB_LIB_DIR = $(ROOT_DIR) @@ -23,7 +23,7 @@ ifneq ($(FUZZ_ENV), ossfuzz) CC = $(CXX) CCFLAGS += -Wall -fsanitize=address,fuzzer CFLAGS += $(PLATFORM_CXXFLAGS) $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) -LDFLAGS += $(PLATFORM_LDFLAGS) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +LDFLAGS += $(PLATFORM_LDFLAGS) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -l$(LIBNAME:lib%=%) else # OSS-Fuzz sets various environment flags that are used for compilation. 
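The prefetch_test hunk earlier in this patch exercises the two histograms the patch introduces, POLL_WAIT_MICROS and PREFETCHED_BYTES_DISCARDED. A hedged sketch of reading them from a Statistics object after a workload, assuming the patched statistics.h and that statistics were enabled on the Options used to open the DB:

#include <cinttypes>
#include <cstdio>

#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

// Print the prefetch-related histograms; histogramData() fills a
// HistogramData struct with count/average/percentile summaries.
void DumpPrefetchStats(const rocksdb::Options& options) {
  if (options.statistics == nullptr) {
    return;
  }
  rocksdb::HistogramData discarded;
  rocksdb::HistogramData poll_wait;
  options.statistics->histogramData(rocksdb::PREFETCHED_BYTES_DISCARDED,
                                    &discarded);
  options.statistics->histogramData(rocksdb::POLL_WAIT_MICROS, &poll_wait);
  std::printf("prefetched bytes discarded: count=%" PRIu64 ", avg=%.1f\n",
              discarded.count, discarded.average);
  std::printf("poll wait micros: p99=%.1f\n", poll_wait.percentile99);
}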
# These environment flags depend on which type of sanitizer build is being @@ -39,7 +39,7 @@ else CC = $(CXX) CCFLAGS = $(CXXFLAGS) CFLAGS += $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) -LDFLAGS += $(PLATFORM_LDFLAGS) $(LIB_FUZZING_ENGINE) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +LDFLAGS += $(PLATFORM_LDFLAGS) $(LIB_FUZZING_ENGINE) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -l$(LIBNAME:lib%=%) endif .PHONY: gen_proto clean diff --git a/fuzz/db_fuzzer.cc b/fuzz/db_fuzzer.cc index 383a95096d..c66e1b80a6 100644 --- a/fuzz/db_fuzzer.cc +++ b/fuzz/db_fuzzer.cc @@ -1,5 +1,7 @@ #include +#include + #include "rocksdb/db.h" enum OperationType { @@ -42,25 +44,30 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { switch (op) { case kPut: { - std::string key = fuzzed_data.ConsumeRandomLengthString(); - std::string val = fuzzed_data.ConsumeRandomLengthString(); + std::string key = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); + std::string val = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, val); break; } case kGet: { - std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string key = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); std::string value; db->Get(ROCKSDB_NAMESPACE::ReadOptions(), key, &value); break; } case kDelete: { - std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string key = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); db->Delete(ROCKSDB_NAMESPACE::WriteOptions(), key); break; } case kGetProperty: { std::string prop; - std::string property_name = fuzzed_data.ConsumeRandomLengthString(); + std::string property_name = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); db->GetProperty(property_name, &prop); break; } @@ -114,9 +121,12 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { column_families, &handles, &db); if (s.ok()) { - std::string key1 = fuzzed_data.ConsumeRandomLengthString(); - std::string val1 = fuzzed_data.ConsumeRandomLengthString(); - std::string key2 = fuzzed_data.ConsumeRandomLengthString(); + std::string key1 = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); + std::string val1 = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); + std::string key2 = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); s = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), handles[1], key1, val1); std::string value; @@ -137,8 +147,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { break; } case kCompactRange: { - std::string slice_start = fuzzed_data.ConsumeRandomLengthString(); - std::string slice_end = fuzzed_data.ConsumeRandomLengthString(); + std::string slice_start = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); + std::string slice_end = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); ROCKSDB_NAMESPACE::Slice begin(slice_start); ROCKSDB_NAMESPACE::Slice end(slice_end); @@ -147,7 +159,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { break; } case kSeekForPrev: { - std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string key = fuzzed_data.ConsumeRandomLengthString( + fuzzed_data.remaining_bytes()); auto iter = 
db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); iter->SeekForPrev(key); delete iter; @@ -155,6 +168,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { } case OP_COUNT: break; + default: { + assert(false); + } } } diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index c856f6c656..9eca00aaca 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -668,6 +668,17 @@ class FileSystem : public Customizable { return IOStatus::OK(); } + // EXPERIMENTAL + // Abort the read IO requests submitted asynchronously. Underlying FS is + // required to support AbortIO API. AbortIO implementation should ensure that + // the all the read requests related to io_handles should be aborted and + // it shouldn't call the callback for these io_handles. + // + // Default implementation is to return IOStatus::OK. + virtual IOStatus AbortIO(std::vector& /*io_handles*/) { + return IOStatus::OK(); + } + // If you're adding methods here, remember to add them to EnvWrapper too. private: @@ -1500,6 +1511,10 @@ class FileSystemWrapper : public FileSystem { return target_->Poll(io_handles, min_completions); } + virtual IOStatus AbortIO(std::vector& io_handles) override { + return target_->AbortIO(io_handles); + } + protected: std::shared_ptr target_; }; diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 954d15b4a1..3fa67dcf97 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -103,6 +103,11 @@ class FilterPolicy : public Customizable { // family (rare), implementations may return Name(). virtual const char* CompatibilityName() const = 0; + // Utility helper to parse the URI passed to the CreateFromString() + // And extract the value of the bits-per-key passed via that URI + // See CreateFromString() below for more details + static double ExtractBitsPerKeyFromUri(const std::string& uri); + // Creates a new FilterPolicy based on the input value string and returns the // result The value might be an ID, and ID with properties, or an old-style // policy string. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index b9bc15753d..6dc5e97b85 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -672,10 +672,10 @@ struct DBOptions { // LOW priority thread pool. For more information, see // Env::SetBackgroundThreads // - // Default: -1 + // Default: 8 // // Dynamically changeable through SetDBOptions() API. - int max_background_compactions = -1; + int max_background_compactions = 8; // This value represents the maximum number of threads that will // concurrently perform a compaction job by breaking it into multiple, @@ -1513,8 +1513,8 @@ struct ReadOptions { bool pin_data; // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we - // schedule a background job in the flush job queue and delete obsolete files - // in background. + // schedule a background job in the compaction job queue and delete obsolete + // files in background. // Default: false bool background_purge_on_iterator_cleanup; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index dcd3fc3339..9bc7ab196d 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -536,7 +536,12 @@ enum Histograms : uint32_t { // Error handler statistics ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + // Stats related to asynchronous read requests. ASYNC_READ_BYTES, + POLL_WAIT_MICROS, + + // Number of prefetched bytes discarded by RocksDB. 
+ PREFETCHED_BYTES_DISCARDED, HISTOGRAM_ENUM_MAX, }; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index b512b2da90..e7cc454d2b 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -24,10 +24,6 @@ #include #include -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED -#include "port/stack_trace.h" -#endif - #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -46,8 +42,7 @@ class Status { ~Status() { #ifdef ROCKSDB_ASSERT_STATUS_CHECKED if (!checked_) { - fprintf(stderr, "Failed to check Status %p\n", this); - port::PrintStack(); + PrintFailure(); abort(); } #endif // ROCKSDB_ASSERT_STATUS_CHECKED @@ -457,6 +452,9 @@ class Status { // Returns the string "OK" for success. std::string ToString() const; + private: + void PrintFailure(); + protected: Code code_; SubCode subcode_; diff --git a/include/rocksdb/utilities/env_mirror.h b/include/rocksdb/utilities/env_mirror.h index 8e96ac4108..b604baa799 100644 --- a/include/rocksdb/utilities/env_mirror.h +++ b/include/rocksdb/utilities/env_mirror.h @@ -60,13 +60,15 @@ class EnvMirror : public EnvWrapper { std::unique_ptr br; Status as = a_->NewDirectory(name, result); Status bs = b_->NewDirectory(name, &br); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } Status FileExists(const std::string& f) override { Status as = a_->FileExists(f); Status bs = b_->FileExists(f); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } #if defined(_MSC_VER) @@ -79,7 +81,8 @@ class EnvMirror : public EnvWrapper { std::vector ar, br; Status as = a_->GetChildren(dir, &ar); Status bs = b_->GetChildren(dir, &br); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); std::sort(ar.begin(), ar.end()); std::sort(br.begin(), br.end()); if (!as.ok() || ar != br) { @@ -94,32 +97,37 @@ class EnvMirror : public EnvWrapper { Status DeleteFile(const std::string& f) override { Status as = a_->DeleteFile(f); Status bs = b_->DeleteFile(f); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } Status CreateDir(const std::string& d) override { Status as = a_->CreateDir(d); Status bs = b_->CreateDir(d); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } Status CreateDirIfMissing(const std::string& d) override { Status as = a_->CreateDirIfMissing(d); Status bs = b_->CreateDirIfMissing(d); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } Status DeleteDir(const std::string& d) override { Status as = a_->DeleteDir(d); Status bs = b_->DeleteDir(d); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } Status GetFileSize(const std::string& f, uint64_t* s) override { uint64_t asize, bsize; Status as = a_->GetFileSize(f, &asize); Status bs = b_->GetFileSize(f, &bsize); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); assert(!as.ok() || asize == bsize); *s = asize; return as; @@ -130,7 +138,8 @@ class EnvMirror : public EnvWrapper { uint64_t amtime, bmtime; Status as = a_->GetFileModificationTime(fname, &amtime); Status bs = b_->GetFileModificationTime(fname, &bmtime); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000); *file_mtime = amtime; 
return as; @@ -139,14 +148,16 @@ class EnvMirror : public EnvWrapper { Status RenameFile(const std::string& s, const std::string& t) override { Status as = a_->RenameFile(s, t); Status bs = b_->RenameFile(s, t); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } Status LinkFile(const std::string& s, const std::string& t) override { Status as = a_->LinkFile(s, t); Status bs = b_->LinkFile(s, t); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } @@ -160,7 +171,8 @@ class EnvMirror : public EnvWrapper { FileLock *al, *bl; Status as = a_->LockFile(f, &al); Status bs = b_->LockFile(f, &bl); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); if (as.ok()) *l = new FileLockMirror(al, bl); return as; } @@ -169,7 +181,8 @@ class EnvMirror : public EnvWrapper { FileLockMirror* ml = static_cast(l); Status as = a_->UnlockFile(ml->a_); Status bs = b_->UnlockFile(ml->b_); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); delete ml; return as; } diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index a6954fb10f..86f8f3f721 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -13,7 +13,7 @@ // minor or major version number planned for release. #define ROCKSDB_MAJOR 7 #define ROCKSDB_MINOR 2 -#define ROCKSDB_PATCH 0 +#define ROCKSDB_PATCH 2 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index 7fb18196d7..746efc472e 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "rocksdb/cache.h" @@ -35,6 +36,17 @@ class StallInterface { }; class WriteBufferManager final { + public: + // Delay Mechanism (allow_delays_and_stalls==true) definitions + static constexpr uint64_t kStartDelayPercentThreshold = 80U; + + enum class UsageState { kNone, kDelay, kStop }; + + static constexpr uint64_t kNoneDelayedWriteFactor = 0U; + static constexpr uint64_t kMinDelayedWriteFactor = 1U; + static constexpr uint64_t kMaxDelayedWriteFactor = 200U; + static constexpr uint64_t kStopDelayedWriteFactor = kMaxDelayedWriteFactor; + public: // Parameters: // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped. @@ -44,12 +56,23 @@ class WriteBufferManager final { // cost the memory allocated to the cache. It can be used even if _buffer_size // = 0. // - // allow_stall: if set true, it will enable stalling of writes when - // memory_usage() exceeds buffer_size. It will wait for flush to complete and - // memory usage to drop down. + // allow_delays_and_stalls: if set true, it will enable delays and stall as + // described below. + // Delays: if set to true, it will start delaying of writes when + // memory_usage() exceeds the kStartDelayPercentThreshold percent threshold + // of the buffer size. The WBM calculates a delay factor that is increasing + // as memory_usage() increases. When applicable, the WBM will notify its + // registered clients about the applicable delay factor. Clients are + // expected to set their respective delayed write rates accordingly. When + // memory_usage() reaches buffer_size(), the (optional) WBM stall mechanism + // kicks in if enabled. 
(see allow_delays_and_stalls above) + // Stalls: stalling of writes when memory_usage() exceeds buffer_size. It + // will wait for flush to complete and + // memory usage to drop down. explicit WriteBufferManager(size_t _buffer_size, std::shared_ptr cache = {}, - bool allow_stall = false); + bool allow_delays_and_stalls = true); + // No copying allowed WriteBufferManager(const WriteBufferManager&) = delete; WriteBufferManager& operator=(const WriteBufferManager&) = delete; @@ -69,9 +92,29 @@ class WriteBufferManager final { return memory_used_.load(std::memory_order_relaxed); } + size_t GetMemoryUsagePercentageOfBufferSize() const { + if (enabled()) { + return ((100 * memory_usage()) / buffer_size()); + } else { + return 0U; + } + } + // Returns the total memory used by active memtables. size_t mutable_memtable_memory_usage() const { - return memory_active_.load(std::memory_order_relaxed); + const size_t total = memory_usage(); + const size_t inactive = memory_inactive_.load(std::memory_order_acquire); + return ((inactive >= total) ? 0 : (total - inactive)); + } + + // Returns the total inactive memory used by memtables. + size_t immmutable_memtable_memory_usage() const { + return memory_inactive_.load(std::memory_order_relaxed); + } + + // Returns the total memory marked to be freed but not yet actually freed + size_t memtable_memory_being_freed_usage() const { + return memory_being_freed_.load(std::memory_order_relaxed); } size_t dummy_entries_in_cache_usage() const; @@ -81,11 +124,24 @@ class WriteBufferManager final { return buffer_size_.load(std::memory_order_relaxed); } + // Note that the memory_inactive_ and memory_being_freed_ counters + // are NOT maintained when the WBM is disabled. In addition, memory_used_ is + // maintained only when enabled or cache is provided. Therefore, if switching + // from disabled to enabled, these counters will (or may) be invalid or may + // wraparound void SetBufferSize(size_t new_size) { + [[maybe_unused]] auto was_enabled = enabled(); + buffer_size_.store(new_size, std::memory_order_relaxed); mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); + + assert(was_enabled == enabled()); + // Check if stall is active and can be ended. MaybeEndWriteStall(); + if (enabled()) { + UpdateUsageState(memory_usage(), 0 /* mem_changed_size */, new_size); + } } // Below functions should be called by RocksDB internally. @@ -113,11 +169,12 @@ class WriteBufferManager final { // We stall the writes untill memory_usage drops below buffer_size. When the // function returns true, all writer threads (including one checking this // condition) across all DBs will be stalled. Stall is allowed only if user - // pass allow_stall = true during WriteBufferManager instance creation. + // pass allow_delays_and_stalls = true during WriteBufferManager instance + // creation. // // Should only be called by RocksDB internally . bool ShouldStall() const { - if (!allow_stall_ || !enabled()) { + if (!allow_delays_and_stalls_ || !enabled()) { return false; } @@ -140,6 +197,17 @@ class WriteBufferManager final { // when checking the soft limit. void ScheduleFreeMem(size_t mem); + // Freeing 'mem' bytes has actually started. + // The process may complete successfully and FreeMem() will be called to + // notifiy successfull completion, or, aborted, and FreeMemCancelled() will be + // called to notify that. 
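The delay mechanism described above exposes GetUsageStateInfo(), declared in this patch as a pair of UsageState and the current delay factor, which registered clients are expected to translate into a delayed write rate. The sketch below is an illustrative consumer policy using the constants the patch defines; the linear scaling is an assumption for demonstration, not the engine's actual rate calculation:

#include <cstdint>

#include "rocksdb/write_buffer_manager.h"

// Map the WBM usage state onto a write rate: full speed below the delay
// threshold, linearly reduced while delaying, zero when stalled.
uint64_t PickWriteRate(const rocksdb::WriteBufferManager& wbm,
                       uint64_t full_rate_bytes_per_sec) {
  using WBM = rocksdb::WriteBufferManager;
  auto [state, factor] = wbm.GetUsageStateInfo();
  switch (state) {
    case WBM::UsageState::kNone:
      return full_rate_bytes_per_sec;
    case WBM::UsageState::kDelay:
      // factor ranges over [kMinDelayedWriteFactor, kMaxDelayedWriteFactor].
      return full_rate_bytes_per_sec *
             (WBM::kMaxDelayedWriteFactor - factor + 1) /
             WBM::kMaxDelayedWriteFactor;
    case WBM::UsageState::kStop:
      return 0;  // writes should be stalled
  }
  return full_rate_bytes_per_sec;
}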
+ void FreeMemBegin(size_t mem); + + // Freeing 'mem' bytes was aborted and that memory is no longer in the process + // of being freed + void FreeMemAborted(size_t mem); + + // Freeing 'mem' bytes completed successfully void FreeMem(size_t mem); // Add the DB instance to the queue and block the DB. @@ -152,12 +220,46 @@ class WriteBufferManager final { void RemoveDBFromQueue(StallInterface* wbm_stall); + std::string GetPrintableOptions() const; + + public: + bool IsDelayAllowed() const { return allow_delays_and_stalls_; } + std::pair GetUsageStateInfo() const { + return ParseCodedUsageState(GetCodedUsageState()); + } + + private: + // The usage + delay factor are coded in a single (atomic) uint64_t value as + // follows: kNone - as 0 (kNoneCodedUsageState) kStop - as 1 + max delay + // factor (kStopCodedUsageState) kDelay - as the delay factor itself, which + // will actually be used for the delay token + static constexpr uint64_t kNoneCodedUsageState = 0U; + static constexpr uint64_t kStopCodedUsageState = kMaxDelayedWriteFactor + 1; + + void UpdateUsageState(size_t new_memory_used, ssize_t mem_changed_size, + size_t quota); + + uint64_t CalcNewCodedUsageState(size_t new_memory_used, + ssize_t memory_changed_size, size_t quota, + uint64_t old_coded_usage_state); + + uint64_t GetCodedUsageState() const { + return coded_usage_state_.load(std::memory_order_relaxed); + } + + static uint64_t CalcCodedUsageState(UsageState usage_state, + uint64_t delay_factor); + static std::pair ParseCodedUsageState( + uint64_t coded_usage_state); + private: std::atomic buffer_size_; std::atomic mutable_limit_; - std::atomic memory_used_; - // Memory that hasn't been scheduled to free. - std::atomic memory_active_; + std::atomic memory_used_ = 0U; + // Memory that has been scheduled to free. + std::atomic memory_inactive_ = 0U; + // Memory that in the process of being freed + std::atomic memory_being_freed_ = 0U; std::shared_ptr cache_res_mgr_; // Protects cache_res_mgr_ std::mutex cache_res_mgr_mu_; @@ -165,12 +267,14 @@ class WriteBufferManager final { std::list queue_; // Protects the queue_ and stall_active_. std::mutex mu_; - bool allow_stall_; + bool allow_delays_and_stalls_ = true; // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall() // while holding mu_, but it can be read without a lock. std::atomic stall_active_; + std::atomic coded_usage_state_ = kNoneCodedUsageState; - void ReserveMemWithCache(size_t mem); - void FreeMemWithCache(size_t mem); + // Return the new memory usage + size_t ReserveMemWithCache(size_t mem); + size_t FreeMemWithCache(size_t mem); }; } // namespace ROCKSDB_NAMESPACE diff --git a/issue_template.md b/issue_template.md deleted file mode 100644 index ca52f5ead0..0000000000 --- a/issue_template.md +++ /dev/null @@ -1,7 +0,0 @@ -> Note: Please use Issues only for bug reports. For questions, discussions, feature requests, etc. 
post to dev group: https://groups.google.com/forum/#!forum/rocksdb or https://www.facebook.com/groups/rocksdb.dev - -### Expected behavior - -### Actual behavior - -### Steps to reproduce the behavior diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 8eada17e88..a6c9f9afe9 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -321,7 +321,7 @@ elseif(${CMAKE_VERSION} VERSION_LESS "3.11.4") # Old CMake message("Using an old CMAKE (${CMAKE_VERSION}) - JNI headers generated in separate step") add_jar( - rocksdbjni_classes + ${PROJECT_NAME}jni_classes SOURCES ${JAVA_MAIN_CLASSES} ${JAVA_TEST_CLASSES} @@ -332,12 +332,12 @@ else () # Java 1.8 or newer prepare the JAR... message("Preparing Jar for JDK ${Java_VERSION_STRING}") add_jar( - rocksdbjni_classes + ${PROJECT_NAME}jni_classes SOURCES ${JAVA_MAIN_CLASSES} ${JAVA_TEST_CLASSES} INCLUDE_JARS ${JAVA_TESTCLASSPATH} - GENERATE_NATIVE_HEADERS rocksdbjni_headers DESTINATION ${JNI_OUTPUT_DIR} + GENERATE_NATIVE_HEADERS ${PROJECT_NAME}jni_headers DESTINATION ${JNI_OUTPUT_DIR} ) endif() @@ -517,9 +517,9 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4") ) create_javah( - TARGET rocksdbjni_headers + TARGET ${PROJECT_NAME}jni_headers CLASSES ${NATIVE_JAVA_CLASSES} - CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} + CLASSPATH ${PROJECT_NAME}jni_classes ${JAVA_TESTCLASSPATH} OUTPUT_DIR ${JNI_OUTPUT_DIR} ) endif() @@ -528,15 +528,15 @@ if(NOT MSVC) set_property(TARGET ${ROCKSDB_STATIC_LIB} PROPERTY POSITION_INDEPENDENT_CODE ON) endif() -set(ROCKSDBJNI_STATIC_LIB rocksdbjni${ARTIFACT_SUFFIX}) +set(ROCKSDBJNI_STATIC_LIB ${PROJECT_NAME}jni${ARTIFACT_SUFFIX}) add_library(${ROCKSDBJNI_STATIC_LIB} ${JNI_NATIVE_SOURCES}) -add_dependencies(${ROCKSDBJNI_STATIC_LIB} rocksdbjni_headers) -target_link_libraries(${ROCKSDBJNI_STATIC_LIB} ${ROCKSDB_STATIC_LIB} ${ROCKSDB_LIB}) +add_dependencies(${ROCKSDBJNI_STATIC_LIB} ${PROJECT_NAME}jni_headers) +target_link_libraries(${ROCKSDBJNI_STATIC_LIB} ${ROCKS_STATIC_LIB} ${ROCKS_LIB}) if(NOT MINGW) - set(ROCKSDBJNI_SHARED_LIB rocksdbjni-shared${ARTIFACT_SUFFIX}) + set(ROCKSDBJNI_SHARED_LIB ${PROJECT_NAME}jni-shared${ARTIFACT_SUFFIX}) add_library(${ROCKSDBJNI_SHARED_LIB} SHARED ${JNI_NATIVE_SOURCES}) - add_dependencies(${ROCKSDBJNI_SHARED_LIB} rocksdbjni_headers) + add_dependencies(${ROCKSDBJNI_SHARED_LIB} ${PROJECT_NAME}jni_headers) target_link_libraries(${ROCKSDBJNI_SHARED_LIB} ${ROCKSDB_STATIC_LIB} ${ROCKSDB_LIB}) set_target_properties( diff --git a/java/Makefile b/java/Makefile index f580fda277..60e89e8261 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,3 +1,5 @@ +PROJECT_NAME?=speedb + NATIVE_JAVA_CLASSES = \ org.rocksdb.AbstractCompactionFilter\ org.rocksdb.AbstractCompactionFilterFactory\ @@ -96,10 +98,6 @@ NATIVE_JAVA_TEST_CLASSES = \ org.rocksdb.WriteBatchTest\ org.rocksdb.WriteBatchTestInternalHelper -ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) -ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) -ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) - NATIVE_INCLUDE = ./include ARCH := $(shell getconf LONG_BIT) SHA256_CMD ?= sha256sum @@ -340,32 +338,32 @@ java: java-version sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java - $(AM_V_at)@rm -rf /tmp/rocksdbjni - $(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found - $(JAVA_CMD) 
$(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni - $(AM_V_at)@rm -rf /tmp/rocksdbjni - $(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni_not_found + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/$(PROJECT_NAME)jni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni_not_found column_family_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java - $(AM_V_at)@rm -rf /tmp/rocksdbjni - $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni - $(AM_V_at)@rm -rf /tmp/rocksdbjni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/$(PROJECT_NAME)jni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni transaction_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) $(AM_V_at)$(JAVAC_CMD) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/TransactionSample.java - $(AM_V_at)@rm -rf /tmp/rocksdbjni - $(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/rocksdbjni - $(AM_V_at)@rm -rf /tmp/rocksdbjni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni + $(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/$(PROJECT_NAME)jni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni optimistic_transaction_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) $(AM_V_at)$(JAVAC_CMD) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/OptimisticTransactionSample.java - $(AM_V_at)@rm -rf /tmp/rocksdbjni - $(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni - $(AM_V_at)@rm -rf /tmp/rocksdbjni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni + $(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/$(PROJECT_NAME)jni + $(AM_V_at)@rm -rf /tmp/$(PROJECT_NAME)jni $(JAVA_TEST_LIBDIR): mkdir -p "$(JAVA_TEST_LIBDIR)" @@ -437,12 +435,12 @@ java_test: java resolve_test_deps $(AM_V_at) $(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ $(TEST_SOURCES) -test: java java_test run_test +test: run_test -run_test: +run_test: java_test $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(ALL_JAVA_TESTS) -run_plugin_test: +run_plugin_test: java_test $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(ROCKSDB_PLUGIN_JAVA_TESTS) db_bench: java diff --git a/java/crossbuild/build-linux-alpine.sh b/java/crossbuild/build-linux-alpine.sh index 561d34141e..900ddc26c1 100755 --- a/java/crossbuild/build-linux-alpine.sh +++ b/java/crossbuild/build-linux-alpine.sh @@ -66,5 +66,5 @@ cd /tmp &&\ cd /rocksdb make jclean clean PORTABLE=1 make -j8 rocksdbjavastatic -cp 
/rocksdb/java/target/librocksdbjni-* /rocksdb-build -cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/libspeedbjni-* /rocksdb-build +cp /rocksdb/java/target/speedbjni-* /rocksdb-build diff --git a/java/crossbuild/build-linux-centos.sh b/java/crossbuild/build-linux-centos.sh index 176e3456ce..263d7fd8c8 100755 --- a/java/crossbuild/build-linux-centos.sh +++ b/java/crossbuild/build-linux-centos.sh @@ -34,5 +34,5 @@ export PATH=$JAVA_HOME:/usr/local/bin:$PATH cd /rocksdb scl enable devtoolset-2 'make clean-not-downloaded' scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic' -cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build -cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/libspeedbjni-* /rocksdb-build +cp /rocksdb/java/target/speedbjni-* /rocksdb-build diff --git a/java/crossbuild/build-linux.sh b/java/crossbuild/build-linux.sh index 74178adb5d..cd862fb95a 100755 --- a/java/crossbuild/build-linux.sh +++ b/java/crossbuild/build-linux.sh @@ -9,7 +9,7 @@ export JAVA_HOME=$(echo /usr/lib/jvm/java-7-openjdk*) cd /rocksdb make jclean clean make -j 4 rocksdbjavastatic -cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build -cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/libspeedbjni-* /rocksdb-build +cp /rocksdb/java/target/speedbjni-* /rocksdb-build sudo shutdown -h now diff --git a/java/crossbuild/docker-build-linux-alpine.sh b/java/crossbuild/docker-build-linux-alpine.sh index e3e852efea..64adaa8608 100755 --- a/java/crossbuild/docker-build-linux-alpine.sh +++ b/java/crossbuild/docker-build-linux-alpine.sh @@ -14,4 +14,4 @@ cd /rocksdb-local-build make clean-not-downloaded PORTABLE=1 make -j2 rocksdbjavastatic -cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target +cp java/target/libspeedbjni-linux*.so java/target/speedbjni-*-linux*.jar java/target/speedbjni-*-linux*.jar.sha1 /rocksdb-java-target diff --git a/java/crossbuild/docker-build-linux-centos.sh b/java/crossbuild/docker-build-linux-centos.sh index 16581dec74..d665d6a257 100755 --- a/java/crossbuild/docker-build-linux-centos.sh +++ b/java/crossbuild/docker-build-linux-centos.sh @@ -34,5 +34,5 @@ else PORTABLE=1 make -j2 rocksdbjavastatic fi -cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target +cp java/target/libspeedbjni-linux*.so java/target/speedbjni-*-linux*.jar java/target/speedbjni-*-linux*.jar.sha1 /rocksdb-java-target diff --git a/java/jdb_bench.sh b/java/jdb_bench.sh index 5dfc385e3b..61cf503de2 100755 --- a/java/jdb_bench.sh +++ b/java/jdb_bench.sh @@ -6,8 +6,8 @@ then PLATFORM=32 fi -ROCKS_JAR=`find target -name rocksdbjni*.jar` +SPEEDB_JAR=`find target -name speedbjni*.jar` echo "Running benchmark in $PLATFORM-Bit mode." 
# shellcheck disable=SC2068 -java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=target -cp "${ROCKS_JAR}:benchmark/target/classes" org.rocksdb.benchmark.DbBenchmark $@ +java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=target -cp "${SPEEDB_JAR}:benchmark/target/classes" org.rocksdb.benchmark.DbBenchmark $@ diff --git a/java/pom.xml.template b/java/pom.xml.template index 4abff4768e..a5a5710aa3 100644 --- a/java/pom.xml.template +++ b/java/pom.xml.template @@ -3,15 +3,15 @@ 4.0.0 org.rocksdb - rocksdbjni - ${ROCKSDB_JAVA_VERSION} + speedbjni + ${LIB_JAVA_VERSION} - RocksDB JNI - RocksDB fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files + Speedb JNI + Speedb fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files for Mac OSX, and a .dll for Windows x64. - https://rocksdb.org - 2012 + https://speedb.io + 2022 @@ -27,20 +27,20 @@ - scm:git:https://github.com/facebook/rocksdb.git - scm:git:https://github.com/facebook/rocksdb.git - scm:git:https://github.com/facebook/rocksdb.git + scm:git:https://github.com/speedb-io/speedb.git + scm:git:https://github.com/speedb-io/speedb.git + scm:git:https://github.com/speedb-io/speedb.git - Facebook - https://www.facebook.com + Speedb + https://www.speedb.io - Facebook - help@facebook.com + Speedb + hello@speedb.io America/New_York architect @@ -48,16 +48,6 @@ - - - rocksdb - Google Groups - rocksdb-subscribe@googlegroups.com - rocksdb-unsubscribe@googlegroups.com - rocksdb@googlegroups.com - https://groups.google.com/forum/#!forum/rocksdb - - - 1.7 1.7 @@ -123,14 +113,7 @@ Xenu - String fileContents = new File(project.basedir.absolutePath + '/../include/rocksdb/version.h').getText('UTF-8') - matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/) - String major_version = matcher.getAt(0).getAt(1) - matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/) - String minor_version = matcher.getAt(0).getAt(1) - matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/) - String patch_version = matcher.getAt(0).getAt(1) - String version = String.format('%s.%s.%s', major_version, minor_version, patch_version) + String version = "${LIB_JAVA_VERSION}" // Set version to be used in pom.properties project.version = version // Set version to be set as jar name diff --git a/java/rocksjni/native_comparator_wrapper_test.cc b/java/rocksjni/native_comparator_wrapper_test.cc index ac33ca22d9..8cb4b76040 100644 --- a/java/rocksjni/native_comparator_wrapper_test.cc +++ b/java/rocksjni/native_comparator_wrapper_test.cc @@ -15,20 +15,20 @@ namespace ROCKSDB_NAMESPACE { class NativeComparatorWrapperTestStringComparator : public Comparator { - const char* Name() const { + const char* Name() const override { return "NativeComparatorWrapperTestStringComparator"; } - int Compare(const Slice& a, const Slice& b) const { + int Compare(const Slice& a, const Slice& b) const override { return a.ToString().compare(b.ToString()); } void FindShortestSeparator(std::string* /*start*/, - const Slice& /*limit*/) const { + const Slice& /*limit*/) const override { return; } - void FindShortSuccessor(std::string* /*key*/) const { return; } + void FindShortSuccessor(std::string* /*key*/) const override { return; } }; } // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index a94b39065b..4d0d55c124 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5589,6 +5589,10 @@ class HistogramTypeJni { 
return 0x32; case ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES: return 0x33; + case ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS: + return 0x34; + case ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED: + return 0x35; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. return 0x1F; @@ -5708,6 +5712,10 @@ class HistogramTypeJni { ERROR_HANDLER_AUTORESUME_RETRY_COUNT; case 0x33: return ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES; + case 0x34: + return ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS; + case 0x35: + return ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java index b97cf28b91..ee5f59e19e 100644 --- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java @@ -16,14 +16,14 @@ public class NativeLibraryLoader { private static final NativeLibraryLoader instance = new NativeLibraryLoader(); private static boolean initialized = false; - private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb"); - private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final String sharedLibraryName = Environment.getSharedLibraryName("speedb"); + private static final String jniLibraryName = Environment.getJniLibraryName("speedb"); private static final /* @Nullable */ String fallbackJniLibraryName = - Environment.getFallbackJniLibraryName("rocksdb"); - private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); + Environment.getFallbackJniLibraryName("speedb"); + private static final String jniLibraryFileName = Environment.getJniLibraryFileName("speedb"); private static final /* @Nullable */ String fallbackJniLibraryFileName = - Environment.getFallbackJniLibraryFileName("rocksdb"); - private static final String tempFilePrefix = "librocksdbjni"; + Environment.getFallbackJniLibraryFileName("speedb"); + private static final String tempFilePrefix = "libspeedbjni"; private static final String tempFileSuffix = Environment.getJniLibraryExtension(); /** diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 3bdab157d2..d51f45db7c 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -120,8 +120,7 @@ public static void loadLibrary(final List paths) { UnsatisfiedLinkError err = null; for (final String path : paths) { try { - System.load(path + "/" + - Environment.getJniLibraryFileName("rocksdbjni")); + System.load(path + "/" + Environment.getJniLibraryFileName("speedbjni")); success = true; break; } catch (final UnsatisfiedLinkError e) { diff --git a/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java b/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java index ab60081a07..6ba0eb3ca9 100644 --- a/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java +++ b/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java @@ -24,8 +24,8 @@ public class NativeLibraryLoaderTest { public void tempFolder() throws IOException { NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp( temporaryFolder.getRoot().getAbsolutePath()); - final Path path = 
Paths.get(temporaryFolder.getRoot().getAbsolutePath(), - Environment.getJniLibraryFileName("rocksdb")); + final Path path = Paths.get( + temporaryFolder.getRoot().getAbsolutePath(), Environment.getJniLibraryFileName("speedb")); assertThat(Files.exists(path)).isTrue(); assertThat(Files.isReadable(path)).isTrue(); } diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index 422bed40c6..4193bcb445 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -1428,7 +1428,7 @@ public void getLiveFiles() throws RocksDBException { assertThat(livefiles.manifestFileSize).isEqualTo(59); assertThat(livefiles.files.size()).isEqualTo(3); assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); - assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000005"); + assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000004"); assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000007"); } } diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index 19e7ea43f5..7810aeedb1 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -210,7 +210,7 @@ TEST_F(AutoRollLoggerTest, RollLogFileByTime) { InitTestDb(); // -- Test the existence of file during the server restart. - ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); + ASSERT_TRUE(default_env->FileExists(kLogFile).IsNotFound()); AutoRollLogger logger(default_env->GetFileSystem(), nsc, kTestDir, "", log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); @@ -554,7 +554,7 @@ TEST_F(AutoRollLoggerTest, Close) { ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1; } - ASSERT_EQ(logger.Close(), Status::OK()); + ASSERT_OK(logger.Close()); std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); size_t lines = std::count(std::istreambuf_iterator(inFile), diff --git a/logging/env_logger_test.cc b/logging/env_logger_test.cc index b06e785889..692e71675b 100644 --- a/logging/env_logger_test.cc +++ b/logging/env_logger_test.cc @@ -56,11 +56,11 @@ const std::string EnvLoggerTest::kLogFile = test::PerThreadDBPath("log_file"); TEST_F(EnvLoggerTest, EmptyLogFile) { auto logger = CreateLogger(); - ASSERT_EQ(logger->Close(), Status::OK()); + ASSERT_OK(logger->Close()); // Check the size of the log file. uint64_t file_size; - ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_OK(env_->GetFileSize(kLogFile, &file_size)); ASSERT_EQ(file_size, 0); DeleteLogFile(); } @@ -74,7 +74,7 @@ TEST_F(EnvLoggerTest, LogMultipleLines) { // Flush the logs. logger->Flush(); - ASSERT_EQ(logger->Close(), Status::OK()); + ASSERT_OK(logger->Close()); // Validate whether the log file has 'kNumIter' number of lines. ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); @@ -89,7 +89,7 @@ TEST_F(EnvLoggerTest, Overwrite) { const int kNumIter = 10; WriteLogs(logger, kSampleMessage, kNumIter); - ASSERT_EQ(logger->Close(), Status::OK()); + ASSERT_OK(logger->Close()); // Validate whether the log file has 'kNumIter' number of lines. ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); @@ -101,10 +101,10 @@ TEST_F(EnvLoggerTest, Overwrite) { // File should be empty. 
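The logging test changes above standardize on the status helpers (ASSERT_OK, Status::IsNotFound) instead of comparing whole Status objects. A minimal gtest-style sketch of that idiom, assuming a throwaway path and linking against gtest_main; none of this snippet is part of the diff itself:

#include <gtest/gtest.h>
#include "rocksdb/env.h"

// Link against gtest_main (or supply your own main) to run this sketch.
TEST(StatusIdiomSketch, FileExistence) {
  ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default();
  const std::string missing = "/tmp/speedb_sketch_missing_file";  // hypothetical path
  // The predicate check reads better and reports the actual status text on
  // failure, which is why the tests above switch to this form.
  ASSERT_TRUE(env->FileExists(missing).IsNotFound());
}
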
uint64_t file_size; - ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_OK(env_->GetFileSize(kLogFile, &file_size)); ASSERT_EQ(file_size, 0); ASSERT_EQ(logger->GetLogFileSize(), 0); - ASSERT_EQ(logger->Close(), Status::OK()); + ASSERT_OK(logger->Close()); } DeleteLogFile(); } @@ -116,7 +116,7 @@ TEST_F(EnvLoggerTest, Close) { const int kNumIter = 10; WriteLogs(logger, kSampleMessage, kNumIter); - ASSERT_EQ(logger->Close(), Status::OK()); + ASSERT_OK(logger->Close()); // Validate whether the log file has 'kNumIter' number of lines. ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); @@ -145,7 +145,7 @@ TEST_F(EnvLoggerTest, ConcurrentLogging) { th.join(); } - ASSERT_EQ(logger->Close(), Status::OK()); + ASSERT_OK(logger->Close()); // Verfiy the log file. ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), diff --git a/memory/allocator.h b/memory/allocator.h index 002ad5f1d8..8eeae590d1 100644 --- a/memory/allocator.h +++ b/memory/allocator.h @@ -42,16 +42,30 @@ class AllocTracker { // Call when we're finished allocating memory so we can free it from // the write buffer's limit. void DoneAllocating(); - + void FreeMemStarted(); + void FreeMemAborted(); void FreeMem(); - bool is_freed() const { return write_buffer_manager_ == nullptr || freed_; } + bool HasMemoryFreeingStarted() const { + return (state_ == State::kFreeMemStarted); + } + + bool IsMemoryFreed() const { return (state_ == State::kFreed); } + + private: + enum class State { kAllocating, kDoneAllocating, kFreeMemStarted, kFreed }; + + private: + bool ShouldUpdateWriteBufferManager() const { + return ((write_buffer_manager_ != nullptr) && + (write_buffer_manager_->enabled() || + write_buffer_manager_->cost_to_cache())); + } private: - WriteBufferManager* write_buffer_manager_; - std::atomic bytes_allocated_; - bool done_allocating_; - bool freed_; + WriteBufferManager* write_buffer_manager_ = nullptr; + State state_ = State::kAllocating; + std::atomic bytes_allocated_ = 0U; }; } // namespace ROCKSDB_NAMESPACE diff --git a/memory/arena.cc b/memory/arena.cc index bcdad5c76f..c2b6f1201a 100644 --- a/memory/arena.cc +++ b/memory/arena.cc @@ -68,7 +68,7 @@ Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size) Arena::~Arena() { if (tracker_ != nullptr) { - assert(tracker_->is_freed()); + assert(tracker_->IsMemoryFreed()); tracker_->FreeMem(); } for (const auto& block : blocks_) { diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index fe21343471..2d4a9e43ca 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -15,48 +15,87 @@ namespace ROCKSDB_NAMESPACE { AllocTracker::AllocTracker(WriteBufferManager* write_buffer_manager) - : write_buffer_manager_(write_buffer_manager), - bytes_allocated_(0), - done_allocating_(false), - freed_(false) {} + : write_buffer_manager_(write_buffer_manager), bytes_allocated_(0) {} AllocTracker::~AllocTracker() { FreeMem(); } void AllocTracker::Allocate(size_t bytes) { assert(write_buffer_manager_ != nullptr); - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { - bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed); - write_buffer_manager_->ReserveMem(bytes); + assert(state_ == State::kAllocating); + + if (state_ == State::kAllocating) { + if (ShouldUpdateWriteBufferManager()) { + bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed); + write_buffer_manager_->ReserveMem(bytes); + } } } void AllocTracker::DoneAllocating() { - if (write_buffer_manager_ != 
nullptr && !done_allocating_) { - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { + assert(write_buffer_manager_ != nullptr); + assert(state_ == State::kAllocating); + + if (state_ == State::kAllocating) { + if (ShouldUpdateWriteBufferManager()) { write_buffer_manager_->ScheduleFreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { assert(bytes_allocated_.load(std::memory_order_relaxed) == 0); } - done_allocating_ = true; + state_ = State::kDoneAllocating; + } +} + +void AllocTracker::FreeMemStarted() { + assert(write_buffer_manager_ != nullptr); + assert(state_ == State::kDoneAllocating); + + if (state_ == State::kDoneAllocating) { + if (ShouldUpdateWriteBufferManager()) { + write_buffer_manager_->FreeMemBegin( + bytes_allocated_.load(std::memory_order_relaxed)); + } + state_ = State::kFreeMemStarted; + } +} + +void AllocTracker::FreeMemAborted() { + assert(write_buffer_manager_ != nullptr); + // May be called without actually starting to free memory + assert((state_ == State::kDoneAllocating) || + (state_ == State::kFreeMemStarted)); + + if (state_ == State::kFreeMemStarted) { + if (ShouldUpdateWriteBufferManager()) { + write_buffer_manager_->FreeMemAborted( + bytes_allocated_.load(std::memory_order_relaxed)); + } + state_ = State::kDoneAllocating; } } void AllocTracker::FreeMem() { - if (!done_allocating_) { + if (state_ == State::kAllocating) { DoneAllocating(); } - if (write_buffer_manager_ != nullptr && !freed_) { - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { + + // This is necessary so that the WBM will not decrease the memory being + // freed twice in case memory freeing was aborted and then freed via this + // call + if (state_ == State::kDoneAllocating) { + FreeMemStarted(); + } + + if (state_ == State::kFreeMemStarted) { + if (ShouldUpdateWriteBufferManager()) { write_buffer_manager_->FreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { assert(bytes_allocated_.load(std::memory_order_relaxed) == 0); } - freed_ = true; } + + state_ = State::kFreed; } + } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 1eaa7658f0..ae3201c062 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -578,10 +578,9 @@ int main(int argc, char** argv) { std::unique_ptr factory; if (FLAGS_memtablerep == "skiplist") { + // Needed because of a different name/default than CreateFromString factory.reset(new ROCKSDB_NAMESPACE::SkipListFactory); #ifndef ROCKSDB_LITE - } else if (FLAGS_memtablerep == "vector") { - factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory); } else if (FLAGS_memtablerep == "hashskiplist" || FLAGS_memtablerep == "prefix_hash") { factory.reset(ROCKSDB_NAMESPACE::NewHashSkipListRepFactory( @@ -601,12 +600,14 @@ int main(int argc, char** argv) { } else { ROCKSDB_NAMESPACE::ConfigOptions config_options; config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::MemTableRepFactory::CreateFromString( config_options, FLAGS_memtablerep, &factory); - if (!s.ok()) { - fprintf(stdout, "Unknown memtablerep: %s\n", s.ToString().c_str()); + if (!s.ok() || !factory) { + fprintf(stdout, "Unknown memtablerep[%s]: %s\n", + FLAGS_memtablerep.c_str(), s.ToString().c_str()); exit(1); } } diff --git a/memtable/stl_wrappers.h b/memtable/stl_wrappers.h index e9f8f214ce..ef274b03e5 100644 --- a/memtable/stl_wrappers.h +++ 
b/memtable/stl_wrappers.h @@ -27,6 +27,9 @@ struct Compare : private Base { inline bool operator()(const char* a, const char* b) const { return compare_(a, b) < 0; } + inline bool operator()(const char* a, const Slice& b) const { + return compare_(a, b) < 0; + } }; } diff --git a/memtable/write_buffer_manager.cc b/memtable/write_buffer_manager.cc index d539d2ed24..75cec7e0a8 100644 --- a/memtable/write_buffer_manager.cc +++ b/memtable/write_buffer_manager.cc @@ -20,13 +20,14 @@ namespace ROCKSDB_NAMESPACE { WriteBufferManager::WriteBufferManager(size_t _buffer_size, std::shared_ptr cache, - bool allow_stall) + bool allow_delays_and_stalls) : buffer_size_(_buffer_size), mutable_limit_(buffer_size_ * 7 / 8), memory_used_(0), - memory_active_(0), + memory_inactive_(0), + memory_being_freed_(0U), cache_res_mgr_(nullptr), - allow_stall_(allow_stall), + allow_delays_and_stalls_(allow_delays_and_stalls), stall_active_(false) { #ifndef ROCKSDB_LITE if (cache) { @@ -58,25 +59,31 @@ std::size_t WriteBufferManager::dummy_entries_in_cache_usage() const { } void WriteBufferManager::ReserveMem(size_t mem) { + auto is_enabled = enabled(); + size_t new_memory_used = 0U; + if (cache_res_mgr_ != nullptr) { - ReserveMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_add(mem, std::memory_order_relaxed); + new_memory_used = ReserveMemWithCache(mem); + } else if (is_enabled) { + auto old_memory_used = + memory_used_.fetch_add(mem, std::memory_order_relaxed); + new_memory_used = old_memory_used + mem; } - if (enabled()) { - memory_active_.fetch_add(mem, std::memory_order_relaxed); + if (is_enabled) { + UpdateUsageState(new_memory_used, mem, buffer_size()); } } // Should only be called from write thread -void WriteBufferManager::ReserveMemWithCache(size_t mem) { +size_t WriteBufferManager::ReserveMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE assert(cache_res_mgr_ != nullptr); // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. 
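The manager now tracks inactive memory and memory that is in the middle of being freed alongside total usage. A simplified, standalone model of how those three counters presumably relate; the names mirror the diff, but this is a toy, not the real class:

// Simplified model of the WriteBufferManager counters introduced above.
// Assumed invariants: being_freed <= inactive <= used.
#include <atomic>
#include <cassert>
#include <cstddef>
#include <iostream>

struct WbmCountersModel {
  std::atomic<size_t> used{0};         // all memtable memory reserved
  std::atomic<size_t> inactive{0};     // memory of memtables marked immutable
  std::atomic<size_t> being_freed{0};  // inactive memory whose flush is running

  void Reserve(size_t n) { used += n; }
  void ScheduleFree(size_t n) { inactive += n; }      // memtable became immutable
  void FreeBegin(size_t n) { being_freed += n; }      // flush of that memtable started
  void FreeAborted(size_t n) { being_freed -= n; }    // flush failed, memory still held
  void Free(size_t n) { being_freed -= n; inactive -= n; used -= n; }

  // Memory still belonging to mutable memtables (presumably what ShouldFlush consults).
  size_t MutableUsage() const { return used - inactive; }
};

int main() {
  WbmCountersModel m;
  m.Reserve(8 << 20);
  m.ScheduleFree(5 << 20);
  m.FreeBegin(5 << 20);
  std::cout << "mutable usage: " << m.MutableUsage() << "\n";  // 3 MiB
  m.Free(5 << 20);
  assert(m.MutableUsage() == (3 << 20));
  return 0;
}
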
std::lock_guard lock(cache_res_mgr_mu_); - size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) + mem; + size_t old_mem_used = memory_used_.load(std::memory_order_relaxed); + size_t new_mem_used = old_mem_used + mem; memory_used_.store(new_mem_used, std::memory_order_relaxed); Status s = cache_res_mgr_->UpdateCacheReservation(new_mem_used); @@ -86,34 +93,75 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { // [TODO] We'll need to improve it in the future and figure out what to do on // error s.PermitUncheckedError(); + + return new_mem_used; #else (void)mem; + return 0U; #endif // ROCKSDB_LITE } void WriteBufferManager::ScheduleFreeMem(size_t mem) { if (enabled()) { - memory_active_.fetch_sub(mem, std::memory_order_relaxed); + memory_inactive_.fetch_add(mem, std::memory_order_relaxed); + } +} + +void WriteBufferManager::FreeMemBegin(size_t mem) { + if (enabled()) { + memory_being_freed_.fetch_add(mem, std::memory_order_relaxed); + } +} + +// Freeing 'mem' bytes was aborted and that memory is no longer in the process +// of being freed +void WriteBufferManager::FreeMemAborted(size_t mem) { + if (enabled()) { + [[maybe_unused]] const auto curr_memory_being_freed = + memory_being_freed_.fetch_sub(mem, std::memory_order_relaxed); + assert(curr_memory_being_freed >= mem); } } void WriteBufferManager::FreeMem(size_t mem) { + const auto is_enabled = enabled(); + size_t new_memory_used = 0U; + if (cache_res_mgr_ != nullptr) { - FreeMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_sub(mem, std::memory_order_relaxed); + new_memory_used = FreeMemWithCache(mem); + } else if (is_enabled) { + auto old_memory_used = + memory_used_.fetch_sub(mem, std::memory_order_relaxed); + assert(old_memory_used >= mem); + new_memory_used = old_memory_used - mem; + } + + if (is_enabled) { + [[maybe_unused]] const auto curr_memory_inactive = + memory_inactive_.fetch_sub(mem, std::memory_order_relaxed); + [[maybe_unused]] const auto curr_memory_being_freed = + memory_being_freed_.fetch_sub(mem, std::memory_order_relaxed); + + assert(curr_memory_inactive >= mem); + assert(curr_memory_being_freed >= mem); + + UpdateUsageState(new_memory_used, -mem, buffer_size()); } + // Check if stall is active and can be ended. MaybeEndWriteStall(); } -void WriteBufferManager::FreeMemWithCache(size_t mem) { +size_t WriteBufferManager::FreeMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE assert(cache_res_mgr_ != nullptr); // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. std::lock_guard lock(cache_res_mgr_mu_); - size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem; + + const auto old_mem_used = memory_used_.load(std::memory_order_relaxed); + assert(old_mem_used >= mem); + size_t new_mem_used = old_mem_used - mem; memory_used_.store(new_mem_used, std::memory_order_relaxed); Status s = cache_res_mgr_->UpdateCacheReservation(new_mem_used); @@ -122,14 +170,17 @@ void WriteBufferManager::FreeMemWithCache(size_t mem) { // [TODO] We'll need to improve it in the future and figure out what to do on // error s.PermitUncheckedError(); + + return new_mem_used; #else (void)mem; + return 0U; #endif // ROCKSDB_LITE } void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { assert(wbm_stall != nullptr); - assert(allow_stall_); + assert(allow_delays_and_stalls_); // Allocate outside of the lock. 
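ScheduleFreeMem, FreeMemBegin, FreeMemAborted and FreeMem turn memory freeing into an explicit protocol. A hedged sketch of the call order a flush path would likely follow; the flow around the calls is illustrative, only the WriteBufferManager methods themselves come from the diff:

#include <iostream>
#include "rocksdb/write_buffer_manager.h"

int main() {
  using ROCKSDB_NAMESPACE::WriteBufferManager;
  // 64 MiB quota, no cache charging, delays/stalls allowed.
  WriteBufferManager wbm(64 << 20, nullptr, true /* allow_delays_and_stalls */);

  const size_t kMemtableBytes = 8 << 20;
  wbm.ReserveMem(kMemtableBytes);       // memtable filled up
  wbm.ScheduleFreeMem(kMemtableBytes);  // memtable marked immutable
  wbm.FreeMemBegin(kMemtableBytes);     // its flush was picked up

  bool flush_succeeded = true;          // illustrative outcome, not real flush code
  if (!flush_succeeded) {
    wbm.FreeMemAborted(kMemtableBytes); // memory is still held by the memtable
  } else {
    wbm.FreeMem(kMemtableBytes);        // memtable destroyed, memory released
  }
  std::cout << "usage now: " << wbm.memory_usage() << "\n";
  return 0;
}
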
std::list new_node = {wbm_stall}; @@ -154,7 +205,7 @@ void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { void WriteBufferManager::MaybeEndWriteStall() { // Cannot early-exit on !enabled() because SetBufferSize(0) needs to unblock // the writers. - if (!allow_stall_) { + if (!allow_delays_and_stalls_) { return; } @@ -186,7 +237,7 @@ void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { // Deallocate the removed nodes outside of the lock. std::list cleanup; - if (enabled() && allow_stall_) { + if (enabled() && allow_delays_and_stalls_) { std::unique_lock lock(mu_); for (auto it = queue_.begin(); it != queue_.end();) { auto next = std::next(it); @@ -199,4 +250,160 @@ void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { wbm_stall->Signal(); } +std::string WriteBufferManager::GetPrintableOptions() const { + std::string ret; + const int kBufferSize = 200; + char buffer[kBufferSize]; + + // The assumed width of the callers display code + int field_width = 85; + + snprintf(buffer, kBufferSize, "%*s: %" ROCKSDB_PRIszt "\n", field_width, + "wbm.size", buffer_size()); + ret.append(buffer); + + snprintf(buffer, kBufferSize, "%*s: %d\n", field_width, + "wbm.allow_delays_and_stalls", IsDelayAllowed()); + ret.append(buffer); + + return ret; +} + +namespace { + +uint64_t CalcDelayFactor(size_t quota, size_t updated_memory_used, + size_t usage_start_delay_threshold) { + assert(updated_memory_used >= usage_start_delay_threshold); + double extra_used_memory = updated_memory_used - usage_start_delay_threshold; + double max_used_memory = quota - usage_start_delay_threshold; + + auto delay_factor = + (WriteBufferManager::kMaxDelayedWriteFactor * extra_used_memory) / + max_used_memory; + if (delay_factor < 1U) { + delay_factor = 1U; + } + return delay_factor; +} + +} // Unnamed Namespace + +uint64_t WriteBufferManager::CalcNewCodedUsageState( + size_t new_memory_used, ssize_t memory_changed_size, size_t quota, + uint64_t old_coded_usage_state) { + auto [old_usage_state, old_delay_factor] = + ParseCodedUsageState(old_coded_usage_state); + + auto new_usage_state = old_usage_state; + auto new_delay_factor = old_delay_factor; + auto usage_start_delay_threshold = + (WriteBufferManager::kStartDelayPercentThreshold * quota) / 100; + auto change_steps = quota / 100; + + if (new_memory_used < usage_start_delay_threshold) { + new_usage_state = WriteBufferManager::UsageState::kNone; + } else if (new_memory_used >= quota) { + new_usage_state = WriteBufferManager::UsageState::kStop; + } else { + new_usage_state = WriteBufferManager::UsageState::kDelay; + } + + auto calc_new_delay_factor = false; + + if (new_usage_state != old_usage_state) { + if (new_usage_state == WriteBufferManager::UsageState::kDelay) { + calc_new_delay_factor = true; + } + } else if (new_usage_state == WriteBufferManager::UsageState::kDelay) { + if (memory_changed_size == 0) { + calc_new_delay_factor = true; + } else { + auto old_memory_used = new_memory_used - memory_changed_size; + // Calculate & notify only if the change is more than one "step" + if ((old_memory_used / change_steps) != + (new_memory_used / change_steps)) { + calc_new_delay_factor = true; + } + } + } + + if (calc_new_delay_factor) { + new_delay_factor = + CalcDelayFactor(quota, new_memory_used, usage_start_delay_threshold); + } + + return CalcCodedUsageState(new_usage_state, new_delay_factor); +} + +uint64_t WriteBufferManager::CalcCodedUsageState(UsageState usage_state, + uint64_t delay_factor) { + switch (usage_state) { + case 
UsageState::kNone: + return kNoneCodedUsageState; + case UsageState::kDelay: + assert((delay_factor > kNoneCodedUsageState) && + (delay_factor <= kStopCodedUsageState)); + + if (delay_factor <= kNoneCodedUsageState) { + return kNoneCodedUsageState + 1; + } else if (delay_factor > kStopCodedUsageState) { + delay_factor = kStopCodedUsageState; + } + return delay_factor; + case UsageState::kStop: + return kStopCodedUsageState; + default: + assert(0); + // We should never get here (BUG). + return kNoneCodedUsageState; + } +} + +auto WriteBufferManager::ParseCodedUsageState(uint64_t coded_usage_state) + -> std::pair { + if (coded_usage_state <= kNoneCodedUsageState) { + return {UsageState::kNone, kNoneDelayedWriteFactor}; + } else if (coded_usage_state < kStopCodedUsageState) { + return {UsageState::kDelay, coded_usage_state}; + } else { + return {UsageState::kStop, kStopDelayedWriteFactor}; + } +} + +void WriteBufferManager::UpdateUsageState(size_t new_memory_used, + ssize_t memory_changed_size, + size_t quota) { + assert(enabled()); + if (allow_delays_and_stalls_ == false) { + return; + } + + auto done = false; + auto old_coded_usage_state = coded_usage_state_.load(); + auto new_coded_usage_state = old_coded_usage_state; + while (done == false) { + new_coded_usage_state = CalcNewCodedUsageState( + new_memory_used, memory_changed_size, quota, old_coded_usage_state); + + if (old_coded_usage_state != new_coded_usage_state) { + // Try to update the usage state with the usage state calculated by the + // current thread. Failure (has_update_succeeded == false) means one or + // more threads have updated the current state, rendering our own + // calculation irrelevant. In case has_update_succeeded==false, + // old_coded_usage_state will be the value of the state that was updated + // by the other thread(s). + done = coded_usage_state_.compare_exchange_strong(old_coded_usage_state, + new_coded_usage_state); + + if (done == false) { + // Retry. However, + new_memory_used = memory_usage(); + memory_changed_size = 0U; + } + } else { + done = true; + } + } +} + } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 709a723e3c..5487a4acd0 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -8,6 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" + +#include "rocksdb/cache.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { @@ -16,6 +18,17 @@ class WriteBufferManagerTest : public testing::Test {}; #ifndef ROCKSDB_LITE const size_t kSizeDummyEntry = 256 * 1024; +namespace { +void BeginAndFree(WriteBufferManager& wbf, size_t size) { + wbf.FreeMemBegin(size); + wbf.FreeMem(size); +} + +void ScheduleBeginAndFreeMem(WriteBufferManager& wbf, size_t size) { + wbf.ScheduleFreeMem(size); + BeginAndFree(wbf, size); +} +} // namespace TEST_F(WriteBufferManagerTest, ShouldFlush) { // A write buffer manager of size 10MB std::unique_ptr wbf( @@ -46,7 +59,7 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { // 15 MB total, 8MB mutable. ASSERT_TRUE(wbf->ShouldFlush()); - wbf->FreeMem(7 * 1024 * 1024); + BeginAndFree(*wbf, 7 * 1024 * 1024); // 8MB total, 8MB mutable. ASSERT_FALSE(wbf->ShouldFlush()); @@ -59,7 +72,7 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { // 8MB total, 6MB mutable. 
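To make the delay arithmetic concrete, here is a standalone re-implementation of the CalcDelayFactor ramp. kStartDelayPercent and kMaxDelayFactor below are assumed stand-ins for the WriteBufferManager header constants, which are not part of this hunk:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>

// Assumed values for illustration only; the real constants are
// WriteBufferManager::kStartDelayPercentThreshold / kMaxDelayedWriteFactor.
constexpr uint64_t kStartDelayPercent = 70;
constexpr uint64_t kMaxDelayFactor = 200;

uint64_t CalcDelayFactorSketch(size_t quota, size_t used) {
  const size_t start = kStartDelayPercent * quota / 100;
  assert(used >= start && used < quota);
  // Linear ramp between the start-delay threshold and the quota, clamped to
  // at least 1, mirroring the clamp in CalcDelayFactor above.
  const double extra = static_cast<double>(used - start);
  const double span = static_cast<double>(quota - start);
  const auto factor = static_cast<uint64_t>(kMaxDelayFactor * extra / span);
  return factor < 1 ? 1 : factor;
}

int main() {
  const size_t quota = 10000;
  std::cout << CalcDelayFactorSketch(quota, 7000) << "\n";  // at the threshold -> 1
  std::cout << CalcDelayFactorSketch(quota, 8500) << "\n";  // halfway -> ~kMaxDelayFactor / 2
  std::cout << CalcDelayFactorSketch(quota, 9999) << "\n";  // just below quota -> ~kMaxDelayFactor
  return 0;
}
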
ASSERT_TRUE(wbf->ShouldFlush()); - wbf->FreeMem(2 * 1024 * 1024); + BeginAndFree(*wbf, 2 * 1024 * 1024); // 6MB total, 6MB mutable. ASSERT_FALSE(wbf->ShouldFlush()); @@ -72,7 +85,7 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { ASSERT_TRUE(wbf->ShouldFlush()); wbf->ScheduleFreeMem(1 * 1024 * 1024); - wbf->FreeMem(1 * 1024 * 1024); + BeginAndFree(*wbf, 1 * 1024 * 1024); // 7MB total, 7MB mutable. ASSERT_FALSE(wbf->ShouldFlush()); } @@ -116,7 +129,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { // Free 1MB, memory_used_ = 10061KB // It will not cause any change in cache cost // since memory_used_ > dummy_entries_in_cache_usage * (3/4) - wbf->FreeMem(1 * 1024 * 1024); + ScheduleBeginAndFreeMem(*wbf, 1 * 1024 * 1024); ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); @@ -147,7 +160,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { // since memory_used_ < dummy_entries_in_cache_usage * (3/4) // and floor((dummy_entries_in_cache_usage - memory_used_) % kSizeDummyEntry) // = 80 - wbf->FreeMem(20 * 1024 * 1024); + BeginAndFree(*wbf, 20 * 1024 * 1024); ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 124 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 124 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), @@ -158,7 +171,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { // Free 16KB, memory_used_ = 31549KB // It will not release any dummy entry since memory_used_ >= // dummy_entries_in_cache_usage * (3/4) - wbf->FreeMem(16 * 1024); + ScheduleBeginAndFreeMem(*wbf, 16 * 1024); ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 124 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 124 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), @@ -169,7 +182,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { // since memory_used_ < dummy_entries_in_cache_usage * (3/4) // and floor((dummy_entries_in_cache_usage - memory_used_) % kSizeDummyEntry) // = 80 - wbf->FreeMem(20 * 1024 * 1024); + ScheduleBeginAndFreeMem(*wbf, 20 * 1024 * 1024); ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); @@ -177,7 +190,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { // Free 1MB, memory_used_ = 10045KB // It will not cause any change in cache cost // since memory_used_ > dummy_entries_in_cache_usage * (3/4) - wbf->FreeMem(1 * 1024 * 1024); + ScheduleBeginAndFreeMem(*wbf, 1 * 1024 * 1024); ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); @@ -215,7 +228,7 @@ TEST_F(WriteBufferManagerTest, NoCapCacheCost) { // Free 9MB, memory_used_ = 1024KB // It will free 36 dummy entries - wbf->FreeMem(9 * 1024 * 1024); + ScheduleBeginAndFreeMem(*wbf, 9 * 1024 * 1024); ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead); @@ -224,7 +237,7 @@ TEST_F(WriteBufferManagerTest, NoCapCacheCost) { // It will not cause any change // since memory_used_ > dummy_entries_in_cache_usage * 3/4 for (int i = 0; i < 40; i++) { - wbf->FreeMem(4 * 1024); + ScheduleBeginAndFreeMem(*wbf, 4 * 1024); } ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); 
ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); @@ -257,7 +270,7 @@ TEST_F(WriteBufferManagerTest, CacheFull) { ASSERT_LT(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry); // Free 15MB after encoutering cache full, memory_used_ = 5120KB - wbf->FreeMem(15 * 1024 * 1024); + ScheduleBeginAndFreeMem(*wbf, 15 * 1024 * 1024); ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 20 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 20 * kSizeDummyEntry); ASSERT_LT(cache->GetPinnedUsage(), @@ -283,7 +296,7 @@ TEST_F(WriteBufferManagerTest, CacheFull) { // memory_used_ decreases to 22528KB, 16384KB, 11776KB. // In total, it releases 74 dummy entries for (int i = 0; i < 40; i++) { - wbf->FreeMem(512 * 1024); + ScheduleBeginAndFreeMem(*wbf, 512 * 1024); } ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 46 * kSizeDummyEntry); @@ -293,6 +306,165 @@ TEST_F(WriteBufferManagerTest, CacheFull) { } #endif // ROCKSDB_LITE + +#define VALIDATE_USAGE_STATE(memory_change_size, expected_state, \ + expected_factor) \ + ValidateUsageState(__LINE__, memory_change_size, expected_state, \ + expected_factor) + +class WriteBufferManagerTestWithParams + : public WriteBufferManagerTest, + public ::testing::WithParamInterface> { + public: + void SetUp() override { + wbm_enabled_ = std::get<0>(GetParam()); + cost_cache_ = std::get<1>(GetParam()); + allow_delays_and_stalls_ = std::get<2>(GetParam()); + } + + bool wbm_enabled_; + bool cost_cache_; + bool allow_delays_and_stalls_; +}; +// Test that the write buffer manager sends the expected usage notifications +TEST_P(WriteBufferManagerTestWithParams, UsageNotifications) { + constexpr size_t kQuota = 10 * 1000; + constexpr size_t kStepSize = kQuota / 100; + constexpr size_t kDelayThreshold = + WriteBufferManager::kStartDelayPercentThreshold * kQuota / 100; + constexpr size_t kMaxUsed = kQuota - kDelayThreshold; + + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + + std::unique_ptr wbf; + + auto wbm_quota = (wbm_enabled_ ? 
kQuota : 0U); + if (cost_cache_) { + wbf.reset( + new WriteBufferManager(wbm_quota, cache, allow_delays_and_stalls_)); + } else { + wbf.reset( + new WriteBufferManager(wbm_quota, nullptr, allow_delays_and_stalls_)); + } + ASSERT_EQ(wbf->enabled(), wbm_enabled_); + + size_t expected_usage = 0U; + auto ExpectedDelayFactor = [&](uint64_t extra_used) { + return (extra_used * WriteBufferManager::kMaxDelayedWriteFactor) / kMaxUsed; + }; + + auto ValidateUsageState = [&](unsigned long line, size_t memory_change_size, + WriteBufferManager::UsageState expected_state, + uint64_t expected_factor) { + const auto location_str = + "write_buffer_manager_test.cc:" + std::to_string(line) + "\n"; + + if (wbm_enabled_ || cost_cache_) { + expected_usage += memory_change_size; + } + ASSERT_EQ(wbf->memory_usage(), expected_usage) << location_str; + + if (wbm_enabled_ && allow_delays_and_stalls_) { + auto [actual_state, actual_delay_factor] = wbf->GetUsageStateInfo(); + ASSERT_EQ(actual_state, expected_state) << location_str; + ASSERT_EQ(actual_delay_factor, expected_factor) << location_str; + } + }; + + // Initial state + VALIDATE_USAGE_STATE(0, WriteBufferManager::UsageState::kNone, + WriteBufferManager::kNoneDelayedWriteFactor); + + auto FreeMem = [&, this](size_t mem) { + wbf->ScheduleFreeMem(mem); + wbf->FreeMemBegin(mem); + wbf->FreeMem(mem); + }; + + // Jump straight to quota + wbf->ReserveMem(kQuota); + VALIDATE_USAGE_STATE(kQuota, WriteBufferManager::UsageState::kStop, + WriteBufferManager::kStopDelayedWriteFactor); + + // And back to 0 again + FreeMem(kQuota); + VALIDATE_USAGE_STATE(-kQuota, WriteBufferManager::UsageState::kNone, + WriteBufferManager::kNoneDelayedWriteFactor); + + // Small reservations, below soft limit + wbf->ReserveMem(1000); + VALIDATE_USAGE_STATE(1000, WriteBufferManager::UsageState::kNone, + WriteBufferManager::kNoneDelayedWriteFactor); + + wbf->ReserveMem(2000); + VALIDATE_USAGE_STATE(2000, WriteBufferManager::UsageState::kNone, + WriteBufferManager::kNoneDelayedWriteFactor); + + FreeMem(3000); + VALIDATE_USAGE_STATE(-3000, WriteBufferManager::UsageState::kNone, + WriteBufferManager::kNoneDelayedWriteFactor); + + // 0 => soft limit + wbf->ReserveMem(kDelayThreshold); + VALIDATE_USAGE_STATE(kDelayThreshold, WriteBufferManager::UsageState::kDelay, + 1U); + + // A bit more, but still within the same "step" => same delay factor + wbf->ReserveMem(kStepSize - 1); + VALIDATE_USAGE_STATE(kStepSize - 1, WriteBufferManager::UsageState::kDelay, + 1U); + + // Cross the step => Delay factor updated + wbf->ReserveMem(1); + VALIDATE_USAGE_STATE(1, WriteBufferManager::UsageState::kDelay, + ExpectedDelayFactor(kStepSize)); + + // Free all => None + FreeMem(expected_usage); + VALIDATE_USAGE_STATE(-expected_usage, WriteBufferManager::UsageState::kNone, + WriteBufferManager::kNoneDelayedWriteFactor); + + // None -> Stop (usage == quota) + wbf->ReserveMem(kQuota); + VALIDATE_USAGE_STATE(kQuota, WriteBufferManager::UsageState::kStop, + WriteBufferManager::kMaxDelayedWriteFactor); + + // Increasing the quota, usage as is => Now in the none + wbf->SetBufferSize(wbm_quota * 2); + VALIDATE_USAGE_STATE(0, WriteBufferManager::UsageState::kNone, + WriteBufferManager::kNoneDelayedWriteFactor); + + // Restoring the quota + wbf->SetBufferSize(wbm_quota); + VALIDATE_USAGE_STATE(0, WriteBufferManager::UsageState::kStop, + WriteBufferManager::kMaxDelayedWriteFactor); + + // 1 byte below quota => Delay with max factor + FreeMem(1); + VALIDATE_USAGE_STATE(-1, WriteBufferManager::UsageState::kDelay, + 
ExpectedDelayFactor(kMaxUsed - 1)); + + // An entire step below => delay factor updated + FreeMem(kStepSize); + VALIDATE_USAGE_STATE(-kStepSize, WriteBufferManager::UsageState::kDelay, + ExpectedDelayFactor(kMaxUsed - 1 - kStepSize)); + + // Again in the top "step" + wbf->ReserveMem(1); + VALIDATE_USAGE_STATE(1, WriteBufferManager::UsageState::kDelay, + ExpectedDelayFactor(kMaxUsed - kStepSize)); + + // And back to 0 to wrap it up + FreeMem(expected_usage); + VALIDATE_USAGE_STATE(-expected_usage, WriteBufferManager::UsageState::kNone, + WriteBufferManager::kNoneDelayedWriteFactor); +} + +INSTANTIATE_TEST_CASE_P(WriteBufferManagerTestWithParams, + WriteBufferManagerTestWithParams, + ::testing::Combine(testing::Bool(), testing::Bool(), + testing::Bool())); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 566feb1893..388acaf4d9 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -284,6 +284,9 @@ const std::vector> HistogramsNameMap = { {ERROR_HANDLER_AUTORESUME_RETRY_COUNT, "rocksdb.error.handler.autoresume.retry.count"}, {ASYNC_READ_BYTES, "rocksdb.async.read.bytes"}, + {POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"}, + {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"}, + }; std::shared_ptr CreateDBStatistics() { diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index 1fe5503cbe..59e7be3d96 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -604,14 +604,10 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { dbfull()->TEST_WaitForStatsDumpRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flush default cf - // LogNumbers: default: 16, stats: 10, pikachu: 5 - // Since in recovery process, cfd_stats column is created after WAL is - // created, synced and MANIFEST is persisted, its log number which depends on - // logfile_number_ will be different. Since "pikachu" is never flushed, thus - // its log_number should be the smallest of the three. + // LogNumbers: default: 14, stats: 4, pikachu: 4 ASSERT_OK(Flush()); - ASSERT_LT(cfd_test->GetLogNumber(), cfd_stats->GetLogNumber()); - ASSERT_LT(cfd_test->GetLogNumber(), cfd_default->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); ASSERT_OK(Put("foo1", "v1")); ASSERT_OK(Put("bar1", "v1")); diff --git a/options/db_options.cc b/options/db_options.cc index 2d3902dd86..005acef836 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -826,8 +826,12 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.db_write_buffer_size: %" ROCKSDB_PRIszt, db_write_buffer_size); - ROCKS_LOG_HEADER(log, " Options.write_buffer_manager: %p", - write_buffer_manager.get()); + ROCKS_LOG_HEADER( + log, " Options.write_buffer_manager: %p%s%s", + write_buffer_manager.get(), (write_buffer_manager.get() ? "\n" : ""), + (write_buffer_manager.get() + ? 
write_buffer_manager->GetPrintableOptions().c_str() + : "")); ROCKS_LOG_HEADER(log, " Options.access_hint_on_compaction_start: %d", static_cast(access_hint_on_compaction_start)); ROCKS_LOG_HEADER( diff --git a/options/options_parser.cc b/options/options_parser.cc index 426e300137..6ed31b8ed5 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -29,7 +29,7 @@ namespace ROCKSDB_NAMESPACE { static const std::string option_file_header = - "# This is a RocksDB option file.\n" + "# This is a Speedb option file.\n" "#\n" "# For detailed file format spec, please refer to the example file\n" "# in examples/rocksdb_option_file_example.ini\n" @@ -497,11 +497,11 @@ Status RocksDBOptionsParser::EndSection( Status RocksDBOptionsParser::ValidityCheck() { if (!has_db_options_) { return Status::Corruption( - "A RocksDB Option file must have a single DBOptions section"); + "An Options file must have a single DBOptions section"); } if (!has_default_cf_options_) { return Status::Corruption( - "A RocksDB Option file must have a single CFOptions:default section"); + "An Options file must have a single CFOptions:default section"); } return Status::OK(); diff --git a/options/options_test.cc b/options/options_test.cc index 58070b3ff2..7319cdfe1b 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -3439,7 +3439,7 @@ TEST_F(OptionsParserTest, ParseVersion) { const std::vector invalid_versions = { "a.b.c", "3.2.2b", "3.-12", "3. 1", // only digits and dots are allowed "1.2.3.4", - "1.2.3" // can only contains at most one dot. + "1.2.3", // can only contains at most one dot. "0", // options_file_version must be at least one "3..2", ".", ".1.2", // must have at least one digit before each dot @@ -3603,7 +3603,7 @@ TEST_F(OptionsParserTest, DumpAndParse) { DBOptions base_db_opt; std::vector base_cf_opts; std::vector cf_names = {"default", "cf1", "cf2", "cf3", - "c:f:4:4:4" + "c:f:4:4:4", "p\\i\\k\\a\\chu\\\\\\", "###rocksdb#1-testcf#2###"}; const int num_cf = static_cast(cf_names.size()); diff --git a/plugin/speedb/CMakeLists.txt b/plugin/speedb/CMakeLists.txt new file mode 100644 index 0000000000..5d4f8ed24d --- /dev/null +++ b/plugin/speedb/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (C) 2022 Speedb Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set(speedb_SOURCES + speedb_registry.cc + memtable/hash_spd_rep.cc + paired_filter/speedb_paired_bloom.cc + paired_filter/speedb_paired_bloom_internal.cc) + +set(speedb_FUNC register_SpeedbPlugins) diff --git a/plugin/speedb/memtable/hash_spd_rep.cc b/plugin/speedb/memtable/hash_spd_rep.cc new file mode 100644 index 0000000000..f3f11d5ee5 --- /dev/null +++ b/plugin/speedb/memtable/hash_spd_rep.cc @@ -0,0 +1,990 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
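Once the plugin is compiled in, the memtable factory it registers should be reachable through the same MemTableRepFactory::CreateFromString path that memtablerep_bench uses above. A sketch with a hypothetical factory id; the real name is whatever register_SpeedbPlugins registers, which is not shown in this diff:

#include <iostream>
#include <memory>
#include "rocksdb/convenience.h"
#include "rocksdb/memtablerep.h"

int main() {
  ROCKSDB_NAMESPACE::ConfigOptions config_options;
  config_options.ignore_unsupported_options = false;
  config_options.ignore_unknown_options = false;

  std::unique_ptr<ROCKSDB_NAMESPACE::MemTableRepFactory> factory;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::MemTableRepFactory::CreateFromString(
          config_options, "HashSpdRepFactory" /* hypothetical id */, &factory);
  if (!s.ok() || !factory) {
    std::cerr << "factory not found: " << s.ToString() << "\n";
    return 1;
  }
  std::cout << "created factory: " << factory->Name() << "\n";
  return 0;
}
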
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ROCKSDB_LITE + +#include "plugin/speedb/memtable/hash_spd_rep.h" + +#include +#include +#include // std::condition_variable +#include +#include + +#include "db/memtable.h" +#include "memory/arena.h" +#include "memtable/stl_wrappers.h" +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/utilities/options_type.h" +#include "util/hash.h" +#include "util/heap.h" +#include "util/murmurhash.h" + +namespace ROCKSDB_NAMESPACE { +namespace { + +enum class IterOption { kNone, kIter, kImmutable }; + +struct SortItem { + // Accessors/mutators for links. Wrapped in methods so we can + // add the appropriate barriers as necessary. + + SortItem* Next() { return next_.load(); } + void SetNext(SortItem* item) { next_.store(item); } + + char* Key() { return key_; } + + bool IsIterPoint() { return iter_op_ != IterOption::kNone; } + + bool IsSorted() { return sorted_; } + + bool IsImmutable() { return iter_op_ == IterOption::kImmutable; } + + void SetSorted() { sorted_ = true; } + + void SetSortSetInfo(void* sort_set_info) { sort_set_info_ = sort_set_info; } + + void* GetSortSetInfo() { return sort_set_info_; } + + SortItem(IterOption iter_op = IterOption::kNone) + : next_(nullptr), iter_op_(iter_op), sorted_(false) {} + + private: + std::atomic next_; + IterOption iter_op_; + bool sorted_; + void* sort_set_info_; + + // Prohibit copying due to the below + SortItem(const SortItem&) = delete; + SortItem& operator=(const SortItem&) = delete; + + public: + char key_[1]; +}; + +struct BucketHeader { + port::Mutex mutex_; // this mutex probably wont cause delay + std::list items_; + + BucketHeader() { items_.clear(); } + + bool InternalContains(const MemTableRep::KeyComparator& comparator, + const char* check_key) { + if (items_.size() == 0) { + return false; + } + + std::list::iterator iter; + + for (iter = items_.begin(); iter != items_.end(); ++iter) { + const char* key = (*iter)->key_; + + if (comparator(check_key, key) == 0) { + return true; + } + } + return false; + } + + bool Contains(const MemTableRep::KeyComparator& comparator, + const char* check_key) { + MutexLock l(&mutex_); + return InternalContains(comparator, check_key); + } + + bool Add(SortItem* sort_item, const MemTableRep::KeyComparator& comparator, + bool check_exist) { + MutexLock l(&mutex_); + if (check_exist) { + if (InternalContains(comparator, sort_item->key_)) return false; + } + + items_.push_front(sort_item); + return true; + } +}; + +enum class SeekOption { + kInitForward, + kInitBackward, + kSwitchForward, + kSwitchBackward, +}; + +class SortHeapItem { + public: + SortHeapItem() : add_vector_(nullptr), is_init_(false) {} + + ~SortHeapItem() {} + + // Move constructor. + SortHeapItem(SortHeapItem&& other) noexcept + : add_vector_(other.add_vector_), + curr_iter_(other.curr_iter_), + idx_(other.idx_), + is_init_(other.is_init_) {} + + // Move assignment operator. 
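BucketHeader above keeps point lookups cheap by locking a single bucket and scanning a short list. A self-contained toy version of the same scheme, using std::mutex and std::string keys instead of port::Mutex and encoded memtable keys, purely for illustration:

#include <functional>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>

class ToyBucketedSet {
 public:
  explicit ToyBucketedSet(size_t num_buckets)
      : num_buckets_(num_buckets), buckets_(new Bucket[num_buckets]) {}

  // Mirrors BucketHeader::Add with check_exist == true.
  bool Add(const std::string& key) {
    Bucket& b = GetBucket(key);
    std::lock_guard<std::mutex> l(b.mu);
    for (const auto& k : b.items) {
      if (k == key) return false;
    }
    b.items.push_front(key);
    return true;
  }

  // Mirrors BucketHeader::Contains: lock one bucket, scan its short list.
  bool Contains(const std::string& key) {
    Bucket& b = GetBucket(key);
    std::lock_guard<std::mutex> l(b.mu);
    for (const auto& k : b.items) {
      if (k == key) return true;
    }
    return false;
  }

 private:
  struct Bucket {
    std::mutex mu;
    std::list<std::string> items;
  };

  Bucket& GetBucket(const std::string& key) {
    return buckets_[std::hash<std::string>{}(key) % num_buckets_];
  }

  size_t num_buckets_;
  std::unique_ptr<Bucket[]> buckets_;
};

int main() {
  ToyBucketedSet set(1024);
  set.Add("k1");
  std::cout << set.Contains("k1") << " " << set.Contains("k2") << "\n";  // 1 0
  return 0;
}
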
+ SortHeapItem& operator=(SortHeapItem&& other) noexcept { + if (this != &other) { + // Copy the data pointer and its length from the + // source object. + add_vector_ = other.add_vector_; + curr_iter_ = other.curr_iter_; + idx_ = other.idx_; + is_init_ = other.is_init_; + } + return *this; + } + + char* Key() { return *curr_iter_; } + + char* Get() const { return *curr_iter_; } + + uint32_t GetIdx() { return idx_; } + + void Init(void* add_vector, std::vector::iterator curr_iter, + uint32_t idx) { + if (is_init_) return; + add_vector_ = add_vector; + curr_iter_ = curr_iter; + idx_ = idx; + is_init_ = true; + } + + public: + void* add_vector_; + std::vector::iterator curr_iter_; + uint32_t idx_; + bool is_init_; +}; + +class IteratorComparator { + public: + IteratorComparator(const MemTableRep::KeyComparator& comparator, + bool up_direction) + : comparator_(comparator), up_direction_(up_direction) {} + + bool operator()(SortHeapItem* a, SortHeapItem* b) const { + return ((up_direction_) ? (comparator_(a->Get(), b->Get()) > 0) + : (comparator_(a->Get(), b->Get()) < 0)); + } + + void SetDirection(bool up_direction) { up_direction_ = up_direction; } + + private: + const MemTableRep::KeyComparator& comparator_; + bool up_direction_; +}; + +typedef BinaryHeap IterHeap; + +class IterHeapInfo { + public: + IterHeapInfo(const MemTableRep::KeyComparator& comparator) + : comparator_(comparator), + iter_heap_(new IterHeap(IteratorComparator(comparator, true))) {} + + ~IterHeapInfo() { iter_heap_.get()->clear(); } + + void Init(uint32_t iter_sort_items_num) { + sort_items_.reset(new SortHeapItem[iter_sort_items_num]); + } + + void Reset(bool up_iter_direction) { + iter_heap_.get()->clear(); + iter_heap_.reset( + new IterHeap(IteratorComparator(comparator_, up_iter_direction))); + } + + char* Key() const { + char* requested_key = nullptr; + if (iter_heap_.get()->size() != 0) { + requested_key = iter_heap_.get()->top()->Key(); + } + return requested_key; + } + + bool Valid() const { return iter_heap_.get()->size() != 0; } + + SortHeapItem* Get() { + if (!Valid()) return nullptr; + uint32_t sort_item_idx = iter_heap_.get()->top()->GetIdx(); + return (&sort_items_.get()[sort_item_idx]); + } + + SortHeapItem* Get(uint32_t idx) { return (&sort_items_.get()[idx]); } + + void Update(bool valid, SortHeapItem* sort_item) { + if (valid) { + iter_heap_.get()->replace_top(sort_item); + } else { + iter_heap_.get()->pop(); + } + } + + void Insert(SortHeapItem* sort_item) { iter_heap_.get()->push(sort_item); } + + private: + const MemTableRep::KeyComparator& comparator_; + std::unique_ptr sort_items_; + std::unique_ptr iter_heap_; +}; + +class SortVector { + public: + SortVector(uint32_t size_limit) + : iter_point_(IterOption::kIter), size_limit_(size_limit) { + items_.reserve(size_limit); + smallest_key_ = nullptr; + largest_key_ = nullptr; + } + + ~SortVector() {} + + // Move constructor. 
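IterHeapInfo above drives a k-way merge across the per-vector cursors. The same idea expressed with std::priority_queue, as a minimal illustration of why a heap always yields the next smallest key across all sorted runs; this is not the plugin's code:

#include <iostream>
#include <queue>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::vector<std::string>> runs = {
      {"a", "d", "g"}, {"b", "e"}, {"c", "f", "h"}};

  using Cursor = std::pair<size_t, size_t>;  // (run index, position in run)
  auto greater = [&](const Cursor& x, const Cursor& y) {
    return runs[x.first][x.second] > runs[y.first][y.second];
  };
  std::priority_queue<Cursor, std::vector<Cursor>, decltype(greater)> heap(greater);

  for (size_t i = 0; i < runs.size(); ++i) {
    if (!runs[i].empty()) heap.push({i, 0});
  }
  // Pop the globally smallest key, then advance that run's cursor
  // (the equivalent of IterHeapInfo::Update(valid, sort_item)).
  while (!heap.empty()) {
    auto [run, pos] = heap.top();
    heap.pop();
    std::cout << runs[run][pos] << " ";
    if (pos + 1 < runs[run].size()) heap.push({run, pos + 1});
  }
  std::cout << "\n";  // a b c d e f g h
  return 0;
}
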
+ SortVector(SortVector&& other) noexcept : iter_point_(IterOption::kIter) { + items_.reserve(other.size_limit_); + } + + bool IsEmpty(); + + SortItem* GetIterPoint(); + + bool Sort(const MemTableRep::KeyComparator& comparator); + + bool Insert(char* key); + + bool SeekForward(const MemTableRep::KeyComparator& comparator, + const char* seek_key, SortHeapItem* sort_item); + bool SeekBackward(const MemTableRep::KeyComparator& comparator, + const char* seek_key, SortHeapItem* sort_item); + + bool SeekSwitchForward(const MemTableRep::KeyComparator& comparator, + const char* seek_key, SortHeapItem* sort_item); + bool SeekSwitchBackward(const MemTableRep::KeyComparator& comparator, + const char* seek_key, SortHeapItem* sort_item); + + bool Seek(const MemTableRep::KeyComparator& comparator, const char* seek_key, + SeekOption seek_op, SortHeapItem* sort_item, uint32_t idx); + + bool Next(SortHeapItem* sort_item); + + bool Prev(SortHeapItem* sort_item); + + private: + SortItem iter_point_; + std::vector items_; + char* smallest_key_; + char* largest_key_; + uint32_t size_limit_; + port::Mutex mutex_; +}; + +// SortVector implemntation + +SortItem* SortVector::GetIterPoint() { return &iter_point_; } + +bool SortVector::Insert(char* key) { + items_.push_back(key); + return (items_.size() == size_limit_) ? false : true; +} + +bool SortVector::Sort(const MemTableRep::KeyComparator& comparator) { + std::sort(items_.begin(), items_.end(), stl_wrappers::Compare(comparator)); + smallest_key_ = items_.front(); + largest_key_ = items_.back(); + + return (items_.size() != 0) ? true : false; +} + +bool SortVector::SeekForward(const MemTableRep::KeyComparator& comparator, + const char* seek_key, SortHeapItem* sort_item) { + if (seek_key == nullptr) { + sort_item->curr_iter_ = items_.begin(); + } else { + if (comparator(largest_key_, seek_key) >= 0) { + sort_item->curr_iter_ = + std::lower_bound(items_.begin(), items_.end(), seek_key, + stl_wrappers::Compare(comparator)); + } + } + return (sort_item->curr_iter_ == items_.end()) ? false : true; +} + +bool SortVector::SeekBackward(const MemTableRep::KeyComparator& comparator, + const char* seek_key, SortHeapItem* sort_item) { + if (seek_key == nullptr) { + sort_item->curr_iter_ = std::prev(items_.end()); + } else { + if (comparator(smallest_key_, seek_key) <= 0) { + sort_item->curr_iter_ = + std::lower_bound(items_.begin(), items_.end(), seek_key, + stl_wrappers::Compare(comparator)); + if (comparator(*sort_item->curr_iter_, seek_key) > 0) { + // need to backward the curr iter + --sort_item->curr_iter_; + } + } + } + return (sort_item->curr_iter_ == items_.end()) ? false : true; +} + +bool SortVector::SeekSwitchForward(const MemTableRep::KeyComparator& comparator, + const char* seek_key, + SortHeapItem* sort_item) { + if (comparator(largest_key_, seek_key) <= 0) { + // this addvector shouldnt be part of the iterator heap + sort_item->curr_iter_ = items_.end(); + } else { + if (sort_item->curr_iter_ != items_.end()) { + ++sort_item->curr_iter_; + } else { + sort_item->curr_iter_ = + std::upper_bound(items_.begin(), items_.end(), seek_key, + stl_wrappers::Compare(comparator)); + } + } + return (sort_item->curr_iter_ == items_.end()) ? 
false : true; +} + +bool SortVector::SeekSwitchBackward( + const MemTableRep::KeyComparator& comparator, const char* seek_key, + SortHeapItem* sort_item) { + if (comparator(smallest_key_, seek_key) >= 0) { + // this addvector shouldnt be part of the iterator heap + sort_item->curr_iter_ = items_.end(); + } else { + if (sort_item->curr_iter_ != items_.end()) { + --sort_item->curr_iter_; + } else { + sort_item->curr_iter_ = + std::lower_bound(items_.begin(), items_.end(), seek_key, + stl_wrappers::Compare(comparator)); + sort_item->curr_iter_ = (sort_item->curr_iter_ == items_.begin()) + ? items_.end() + : --sort_item->curr_iter_; + } + } + return (sort_item->curr_iter_ == items_.end()) ? false : true; +} + +bool SortVector::Seek(const MemTableRep::KeyComparator& comparator, + const char* seek_key, SeekOption seek_op, + SortHeapItem* sort_item, uint32_t idx) { + if (items_.size() == 0) return false; + sort_item->Init(this, this->items_.end(), idx); + bool valid = false; + switch (seek_op) { + case SeekOption::kInitForward: + valid = SeekForward(comparator, seek_key, sort_item); + break; + case SeekOption::kInitBackward: + valid = SeekBackward(comparator, seek_key, sort_item); + break; + case SeekOption::kSwitchForward: + valid = SeekSwitchForward(comparator, seek_key, sort_item); + break; + case SeekOption::kSwitchBackward: + valid = SeekSwitchBackward(comparator, seek_key, sort_item); + break; + } + return valid; +} + +bool SortVector::Next(SortHeapItem* sort_item) { + sort_item->curr_iter_++; + return (sort_item->curr_iter_ != items_.end()); +} + +bool SortVector::Prev(SortHeapItem* sort_item) { + if (sort_item->curr_iter_ == items_.begin()) { + sort_item->curr_iter_ = items_.end(); + } else { + sort_item->curr_iter_--; + } + return (sort_item->curr_iter_ != items_.end()); +} + +struct IterSortSettingInfo { + std::list>::iterator iter_anchor_; + std::shared_ptr iter_sort_vector_; + uint32_t iter_size_; +}; + +class SortVectorContainer { + public: + explicit SortVectorContainer(const MemTableRep::KeyComparator& comparator, + uint32_t switch_vector_limit) + : items_count_(0), + comparator_(comparator), + switch_vector_limit_(switch_vector_limit), + immutable_(false), + anchor_item_(IterOption::kIter), + immutable_item_(IterOption::kImmutable), + sort_thread_terminate(false) { + last_item_.store(&anchor_item_); + last_sorted_item_.store(&anchor_item_); + sort_thread_ = std::thread(&SortVectorContainer::SortThread, this); + } + + ~SortVectorContainer() { + { + std::unique_lock lck(sort_thread_mutex_); + sort_thread_terminate.store(true); + sort_thread_cv_.notify_one(); + } + sort_thread_.join(); + empty_iter_sort_vectors_.clear(); + } + + void Insert(SortItem* new_item); + + void InitIterator(IterSortSettingInfo* sort_set_info); + + void SeekIter(std::list>::iterator iter_anchor, + IterHeapInfo* iter_heap_info, const char* seek_key, + SeekOption seek_op); + + bool Next(SortHeapItem* sort_item); + + bool Prev(SortHeapItem* sort_item); + + void AdvanceAndSort(std::shared_ptr sort_vector); + + void Sort(); + void SortThread(); + void Immutable(); + + public: + // an atomic add item private the ability add without any lock + std::atomic last_item_; + // an atomic item count allow us know when to create new sort vector + std::atomic items_count_; + port::Mutex mutex_; + + std::list> sort_vectors_; + // this vector list is becuase we might did query on a quite memtable + // BEFORE the memtable was immutable so no need to add a new sort vector to + // heap. 
It needs to be implemented better. + std::list<std::shared_ptr<SortVector>> empty_iter_sort_vectors_; + const MemTableRep::KeyComparator& comparator_; + + uint32_t switch_vector_limit_; + std::atomic<bool> immutable_; + SortItem anchor_item_; + + port::RWMutex rwlock_; // protects against becoming immutable and getting an + // iterator at the same time + SortItem immutable_item_; + std::thread sort_thread_; + std::mutex sort_thread_mutex_; + std::atomic<bool> sort_thread_terminate; + std::condition_variable sort_thread_cv_; + std::mutex notify_sorted_mutex_; + std::condition_variable notify_sorted_cv_; + + std::atomic<SortItem*> last_sorted_item_; +}; + +// SortVectorContainer implementation + +void SortVectorContainer::Insert(SortItem* new_item) { + uint32_t items_count = items_count_.fetch_add(1); + SortItem* prev_item = last_item_.exchange(new_item); + prev_item->SetNext(new_item); + + if ((items_count % switch_vector_limit_) == 0) { + // notify the sort thread to create a new vector + std::unique_lock<std::mutex> lck(sort_thread_mutex_); + sort_thread_cv_.notify_one(); + } +} + +void SortVectorContainer::InitIterator(IterSortSettingInfo* sort_set_info) { + SortItem* sort_item; + bool immutable = false; + + { + ReadLock rl(&rwlock_); // see Immutable function + immutable = immutable_.load(); + if (immutable) { + sort_item = &immutable_item_; + } else { + sort_set_info->iter_sort_vector_ = + std::make_shared<SortVector>(switch_vector_limit_); + sort_item = sort_set_info->iter_sort_vector_->GetIterPoint(); + sort_item->SetSortSetInfo(static_cast<void*>(sort_set_info)); + SortItem* prev_item = last_item_.exchange(sort_item); + prev_item->SetNext(sort_item); + { + std::unique_lock<std::mutex> lck(sort_thread_mutex_); + sort_thread_cv_.notify_one(); + } + } + } + + { + std::unique_lock<std::mutex> notify_lck(notify_sorted_mutex_); + while (!sort_item->IsSorted()) notify_sorted_cv_.wait(notify_lck); + } + if (immutable) { + // we are sorted and set!

+ sort_set_info->iter_anchor_ = sort_vectors_.begin(); + sort_set_info->iter_size_ = static_cast<uint32_t>(sort_vectors_.size()); + } else { + // the info was set in the sort item + } + return; +} + +void SortVectorContainer::SeekIter( + std::list<std::shared_ptr<SortVector>>::iterator iter_anchor, + IterHeapInfo* iter_heap_info, const char* seek_key, SeekOption seek_op) { + std::list<std::shared_ptr<SortVector>>::iterator iter; + uint32_t idx; + for (iter = iter_anchor, idx = 0; iter != sort_vectors_.end(); + ++iter, ++idx) { + SortHeapItem* sort_item = iter_heap_info->Get(idx); + + bool valid = (*iter)->Seek(comparator_, seek_key, seek_op, sort_item, idx); + + if (valid) iter_heap_info->Insert(sort_item); + } +} + +bool SortVectorContainer::Next(SortHeapItem* sort_item) { + return (static_cast<SortVector*>(sort_item->add_vector_))->Next(sort_item); +} + +bool SortVectorContainer::Prev(SortHeapItem* sort_item) { + return (static_cast<SortVector*>(sort_item->add_vector_))->Prev(sort_item); +} + +void SortVectorContainer::AdvanceAndSort( + std::shared_ptr<SortVector> sort_vector) { + sort_vectors_.front()->Sort(comparator_); + std::shared_ptr<SortVector> push_sort_vector = sort_vector; + if (!push_sort_vector) + push_sort_vector = std::make_shared<SortVector>(switch_vector_limit_); + sort_vectors_.push_front(push_sort_vector); +} + +void SortVectorContainer::Sort() { sort_vectors_.front()->Sort(comparator_); } + +void SortVectorContainer::SortThread() { + bool should_exit = false; + SortItem* last_loop_item = last_sorted_item_.load(); + // create the first vector + sort_vectors_.push_front(std::make_shared<SortVector>(switch_vector_limit_)); + + while (!should_exit) { + { + std::unique_lock<std::mutex> lck(sort_thread_mutex_); + while (!last_loop_item->Next() && !sort_thread_terminate.load()) + sort_thread_cv_.wait(lck); + } + // go over the items list - create a vector if needed and sort + while (last_loop_item->Next()) { + std::list<std::shared_ptr<SortVector>>::iterator last_sort_iter = + sort_vectors_.begin(); + if (last_loop_item->Next()->IsIterPoint()) { // an iter item + if (last_loop_item->Next()->IsImmutable()) { + // this is the last item!
need to sort last vector and exit + Sort(); + last_loop_item->Next()->SetSorted(); + } else { + IterSortSettingInfo* sort_set_info = + static_cast( + last_loop_item->Next()->GetSortSetInfo()); + sort_set_info->iter_size_ = + static_cast(sort_vectors_.size()); + + if (!last_loop_item->IsIterPoint()) { + // sort the previous vector and create new one + AdvanceAndSort(sort_set_info->iter_sort_vector_); + } else { + // need to add to the empty_iter_sort_vectors_ + // TBD AYELET + empty_iter_sort_vectors_.push_back( + std::make_shared(switch_vector_limit_)); + } + sort_set_info->iter_anchor_ = last_sort_iter; + last_loop_item->Next()->SetSorted(); + } + last_sorted_item_.store(last_loop_item->Next()); + { + // notify waiters iterators + std::unique_lock notify_lck(notify_sorted_mutex_); + notify_sorted_cv_.notify_all(); + } + } else { + if (!(*last_sort_iter)->Insert(last_loop_item->Next()->Key())) { + // we reach limit vector size sort and create new vector + AdvanceAndSort(nullptr); + } + } + last_loop_item = last_loop_item->Next(); + } + if (sort_thread_terminate.load() || last_loop_item->IsImmutable()) { + should_exit = true; // thread should be terminated + } + } +} + +void SortVectorContainer::Immutable() { + { + // make sure that no iter requests being performed + WriteLock wl(&rwlock_); + SortItem* prev_item = last_item_.exchange(&immutable_item_); + prev_item->SetNext(&immutable_item_); + immutable_.store(true); + } + { + std::unique_lock lck(sort_thread_mutex_); + sort_thread_cv_.notify_one(); + } +} + +class HashLocklessRep : public MemTableRep { + public: + HashLocklessRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, size_t bucket_size, + uint32_t add_vector_limit_size); + + KeyHandle Allocate(const size_t len, char** buf) override; + + void Insert(KeyHandle handle) override; + + bool InsertKey(KeyHandle handle) override; + + void InsertWithHintConcurrently(KeyHandle handle, void** hint) override; + + bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override; + + void InsertConcurrently(KeyHandle handle) override; + + bool InsertKeyConcurrently(KeyHandle handle) override; + + void MarkReadOnly() override; + bool Contains(const char* key) const override; + + size_t ApproximateMemoryUsage() override; + + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override; + + ~HashLocklessRep() override {} + + MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; + + private: + size_t bucket_size_; + + std::unique_ptr buckets_; + + const MemTableRep::KeyComparator& compare_; + + std::shared_ptr sort_vectors_cont_; + + size_t GetHash(const char* key) const { + Slice slice = UserKey(key); + return MurmurHash(slice.data(), static_cast(slice.size()), 0) % + bucket_size_; + } + bool InsertWithCheck(KeyHandle handle); + + BucketHeader* GetBucket(size_t i) const { return &buckets_.get()[i]; } + + BucketHeader* GetBucket(const char* key) const { + return GetBucket(GetHash(key)); + } + + class Iterator : public MemTableRep::Iterator { + public: + // Initialize an iterator over the specified list. + // The returned iterator is not valid. 
+ explicit Iterator(std::shared_ptr sort_vectors_cont, + const MemTableRep::KeyComparator& comparator) + : sort_vectors_cont_(sort_vectors_cont), + iter_heap_info_(comparator), + up_iter_direction_(true) { + IterSortSettingInfo sort_set_info; + sort_vectors_cont_->InitIterator(&sort_set_info); + iter_anchor_ = sort_set_info.iter_anchor_; + iter_sort_items_num_ = sort_set_info.iter_size_; + // allocate iter_heap_info + iter_heap_info_.Init(iter_sort_items_num_); + } + + ~Iterator() override {} + + // Returns true if the iterator is positioned at a valid node. + bool Valid() const override { return iter_heap_info_.Valid(); } + + // Returns the key at the current position. + const char* key() const override { return iter_heap_info_.Key(); } + + void InternalSeek(const char* seek_key, SeekOption seek_op) { + return sort_vectors_cont_->SeekIter(iter_anchor_, &iter_heap_info_, + seek_key, seek_op); + } + + void Reset(bool up_iter_direction) { + up_iter_direction_ = up_iter_direction; + iter_heap_info_.Reset(up_iter_direction_); + } + + void ReverseDirection(bool up_iter_direction) { + char* seek_key = iter_heap_info_.Key(); + Reset(up_iter_direction); + InternalSeek(seek_key, (up_iter_direction) ? SeekOption::kSwitchForward + : SeekOption::kSwitchBackward); + } + + void Advance() { + SortHeapItem* sort_item = iter_heap_info_.Get(); + bool valid = (up_iter_direction_) ? sort_vectors_cont_->Next(sort_item) + : sort_vectors_cont_->Prev(sort_item); + iter_heap_info_.Update(valid, sort_item); + } + // Advances to the next position. + void Next() override { + if (!up_iter_direction_) { + ReverseDirection(true); + } else { + Advance(); + } + } + + // Advances to the previous position. + void Prev() override { + if (up_iter_direction_) { + ReverseDirection(false); + } else { + Advance(); + } + } + + // Advance to the first entry with a key >= target + void Seek(const Slice& user_key, const char* memtable_key) override { + Reset(true); + InternalSeek(memtable_key ? memtable_key : EncodeKey(&tmp_, user_key), + SeekOption::kInitForward); + } + + // Retreat to the last entry with a key <= target + void SeekForPrev(const Slice& user_key, const char* memtable_key) override { + Reset(false); + InternalSeek(memtable_key ? memtable_key : EncodeKey(&tmp_, user_key), + SeekOption::kInitBackward); + } + + // Position at the first entry in list. + // Final state of iterator is Valid() if list is not empty. + void SeekToFirst() override { + Reset(true); + InternalSeek(nullptr, SeekOption::kInitForward); + } + + // Position at the last entry in list. + // Final state of iterator is Valid() if list is not empty. + void SeekToLast() override { + Reset(false); + InternalSeek(nullptr, SeekOption::kInitBackward); + } + + private: + std::shared_ptr sort_vectors_cont_; + std::list>::iterator iter_anchor_; + uint32_t iter_sort_items_num_; + IterHeapInfo iter_heap_info_; + bool up_iter_direction_; + + protected: + std::string tmp_; // For passing to EncodeKey + }; + + class IteratorEmpty : public MemTableRep::Iterator { + public: + IteratorEmpty() {} + + ~IteratorEmpty() override {} + + // Returns true if the iterator is positioned at a valid node. + bool Valid() const override { return false; } + + // Returns the key at the current position. + const char* key() const override { return nullptr; } + + // Advances to the next position. + void Next() override { return; } + + // Advances to the previous position. 
+ void Prev() override { return; } + + // Advance to the first entry with a key >= target + void Seek(const Slice& /* internal_key */, + const char* /* memtable_key */) override { + return; + } + + // Retreat to the last entry with a key <= target + void SeekForPrev(const Slice& /* internal_key */, + const char* /* memtable_key */) override { + return; + } + + // Position at the first entry in list. + // Final state of iterator is Valid() if list is not empty. + void SeekToFirst() override { return; } + + // Position at the last entry in list. + // Final state of iterator is Valid() if list is not empty. + void SeekToLast() override { return; } + }; +}; + +HashLocklessRep::HashLocklessRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, size_t bucket_size, + uint32_t add_list_limit_size) + : MemTableRep(allocator), bucket_size_(bucket_size), compare_(compare) { + sort_vectors_cont_ = + std::make_shared(compare, add_list_limit_size); + buckets_.reset(new BucketHeader[bucket_size]); +} + +KeyHandle HashLocklessRep::Allocate(const size_t len, char** buf) { + char* mem = allocator_->AllocateAligned(sizeof(SortItem) + len); + SortItem* sort_item = new (mem) SortItem(); + *buf = sort_item->key_; + return static_cast(sort_item); +} + +void HashLocklessRep::Insert(KeyHandle handle) { + SortItem* sort_item = static_cast(handle); + BucketHeader* bucket = GetBucket(sort_item->key_); + bucket->Add(sort_item, this->compare_, false); + // insert to later sorter list + sort_vectors_cont_->Insert(sort_item); + + return; +} + +bool HashLocklessRep::InsertWithCheck(KeyHandle handle) { + SortItem* sort_item = static_cast(handle); + BucketHeader* bucket = GetBucket(sort_item->key_); + + if (!bucket->Add(sort_item, this->compare_, true)) { + return false; + } + + // insert to later sorter list + sort_vectors_cont_->Insert(sort_item); + + return true; +} + +bool HashLocklessRep::InsertKey(KeyHandle handle) { + return InsertWithCheck(handle); +} + +void HashLocklessRep::InsertWithHintConcurrently(KeyHandle handle, void**) { + Insert(handle); +} + +bool HashLocklessRep::InsertKeyWithHintConcurrently(KeyHandle handle, void**) { + return InsertWithCheck(handle); +} + +void HashLocklessRep::InsertConcurrently(KeyHandle handle) { Insert(handle); } + +bool HashLocklessRep::InsertKeyConcurrently(KeyHandle handle) { + return InsertWithCheck(handle); +} + +bool HashLocklessRep::Contains(const char* key) const { + BucketHeader* bucket = GetBucket(key); + + return bucket->Contains(this->compare_, key); +} + +void HashLocklessRep::MarkReadOnly() { sort_vectors_cont_->Immutable(); } + +size_t HashLocklessRep::ApproximateMemoryUsage() { + // Memory is always allocated from the allocator. 
+ return 0; +} + +void HashLocklessRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + BucketHeader* bucket = GetBucket(k.memtable_key().data()); + MutexLock l(&bucket->mutex_); + + for (auto iter = bucket->items_.begin(); iter != bucket->items_.end(); + ++iter) { + if (!callback_func(callback_args, (*iter)->key_)) { + break; + } + } +} + +MemTableRep::Iterator* HashLocklessRep::GetIterator(Arena* arena) { + bool empty_iter = (sort_vectors_cont_->items_count_.load() == 0); + if (!sort_vectors_cont_->immutable_.load()) empty_iter = true; + if (arena != nullptr) { + void* mem; + if (empty_iter) { + mem = arena->AllocateAligned(sizeof(IteratorEmpty)); + return new (mem) IteratorEmpty(); + } else { + mem = arena->AllocateAligned(sizeof(Iterator)); + return new (mem) Iterator(sort_vectors_cont_, compare_); + } + } + if (empty_iter) { + return new IteratorEmpty(); + } else { + return new Iterator(sort_vectors_cont_, compare_); + } +} + +static std::unordered_map hash_spd_factory_info = { +#ifndef ROCKSDB_LITE + {"bucket_count", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}}, +#endif +}; +} // namespace + +HashSpdRepFactory::HashSpdRepFactory(size_t bucket_count) + : bucket_count_(bucket_count) { + RegisterOptions("", &bucket_count_, &hash_spd_factory_info); +} + +MemTableRep* HashSpdRepFactory::CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* /*transform*/, Logger* /*logger*/) { + return new HashLocklessRep(compare, allocator, bucket_count_, 10000); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/plugin/speedb/memtable/hash_spd_rep.h b/plugin/speedb/memtable/hash_spd_rep.h new file mode 100644 index 0000000000..740d8d628b --- /dev/null +++ b/plugin/speedb/memtable/hash_spd_rep.h @@ -0,0 +1,45 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/memtablerep.h" + +namespace ROCKSDB_NAMESPACE { + +class HashSpdRepFactory : public MemTableRepFactory { + public: + explicit HashSpdRepFactory(size_t bucket_count = 1000000); + ~HashSpdRepFactory() override {} + + using MemTableRepFactory::CreateMemTableRep; + MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, + const SliceTransform* transform, + Logger* logger) override; + bool IsInsertConcurrentlySupported() const override { return true; } + bool CanHandleDuplicatedKey() const override { return true; } + + static const char* kClassName() { return "speedb.HashSpdRepFactory"; } + const char* Name() const override { return kClassName(); } + + private: + size_t bucket_count_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/plugin/speedb/paired_filter/speedb_db_bloom_filter_test.cc b/plugin/speedb/paired_filter/speedb_db_bloom_filter_test.cc new file mode 100644 index 0000000000..633dcc2de9 --- /dev/null +++ b/plugin/speedb/paired_filter/speedb_db_bloom_filter_test.cc @@ -0,0 +1,2724 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "db/db_test_util.h" +#include "options/options_helper.h" +#include "plugin/speedb/paired_filter/speedb_paired_bloom.h" +#include "port/stack_trace.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/format.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +std::shared_ptr Create(double bits_per_key, + const std::string& name) { + if (name == SpdbPairedBloomFilterPolicy::kClassName()) { + return std::make_shared(bits_per_key); + } else { + return nullptr; + } +} +const std::string kSpdbPairedBloom = SpdbPairedBloomFilterPolicy::kClassName(); + +} // namespace + +// DB tests related to Speedb's Paired Block Bloom Filter. 
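A minimal usage sketch of the filter policy exercised by the tests below. It is only a sketch, not part of the patch itself: it relies solely on APIs this file already uses (SpdbPairedBloomFilterPolicy, BlockBasedTableOptions, NewBlockBasedTableFactory); the 20 bits-per-key value mirrors the tests, and the DB path is a placeholder assumption.

#include "plugin/speedb/paired_filter/speedb_paired_bloom.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::BlockBasedTableOptions bbto;
  // Paired block Bloom filter at ~20 bits per key, as used by the tests below.
  bbto.filter_policy.reset(new rocksdb::SpdbPairedBloomFilterPolicy(20));
  // Optional: partitioned filters require the two-level index search.
  bbto.partition_filters = true;
  bbto.index_type =
      rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/spdb_bloom_example", &db);
  if (s.ok()) {
    s = db->Put(rocksdb::WriteOptions(), "key", "value");
    delete db;
  }
  return s.ok() ? 0 : 1;
}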
+ +class SpdbDBBloomFilterTest : public DBTestBase { + public: + SpdbDBBloomFilterTest() + : DBTestBase("speedb_db_bloom_filter_test", /*env_do_fsync=*/true) {} +}; + +class SpdbDBBloomFilterTestWithParam + : public DBTestBase, + public testing::WithParamInterface> { + protected: + bool partition_filters_; + + public: + SpdbDBBloomFilterTestWithParam() + : DBTestBase("speedb_db_bloom_filter_test", /*env_do_fsync=*/true) {} + + ~SpdbDBBloomFilterTestWithParam() override {} + + void SetUp() override { partition_filters_ = std::get<0>(GetParam()); } +}; + +class SpdbDBBloomFilterTestDefFormatVersion + : public SpdbDBBloomFilterTestWithParam {}; + +class SliceTransformLimitedDomainGeneric : public SliceTransform { + static constexpr size_t kPrefixLen = 5U; + + const char* Name() const override { + return "SliceTransformLimitedDomainGeneric"; + } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), kPrefixLen); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? + return src.size() >= kPrefixLen; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? + return dst.size() == kPrefixLen; + } +}; + +// KeyMayExist can lead to a few false positives, but not false negatives. +// To make test deterministic, use a much larger number of bits per key-20 than +// bits in the key, so that false positives are eliminated +TEST_P(SpdbDBBloomFilterTestDefFormatVersion, KeyMayExist) { + do { + ReadOptions ropts; + std::string value; + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + options_override.partition_filters = partition_filters_; + options_override.metadata_block_size = 32; + Options options = CurrentOptions(options_override); + if (partition_filters_) { + auto* table_options = + options.table_factory->GetOptions(); + if (table_options != nullptr && + table_options->index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // In the current implementation partitioned filters depend on + // partitioned indexes + continue; + } + } + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + + ASSERT_OK(Put(1, "a", "b")); + bool value_found = false; + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(value_found); + ASSERT_EQ("b", value); + + ASSERT_OK(Flush(1)); + value.clear(); + + uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(!value_found); + // assert that no new files were opened and no new blocks were + // read into block cache. 
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Delete(1, "a")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Delete(1, "c")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // KeyMayExist function only checks data in block caches, which is not used + // by plain table format. + } while ( + ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); +} + +TEST_P(SpdbDBBloomFilterTestWithParam, + GetFilterByPrefixBloomCustomPrefixExtractor) { + Options options = last_options_; + options.prefix_extractor = + std::make_shared(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + if (partition_filters_) { + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + ASSERT_OK(dbfull()->Flush(fo)); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ( + 1, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ( + 2, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ro.total_order_seek = true; + // NOTE: total_order_seek no 
longer affects Get() + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ( + 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + // No bloom on extractor changed +#ifndef ROCKSDB_LITE + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ( + 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); +#endif // ROCKSDB_LITE + + get_perf_context()->Reset(); +} + +TEST_P(SpdbDBBloomFilterTestWithParam, GetFilterByPrefixBloom) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + if (partition_filters_) { + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + ASSERT_OK(dbfull()->Flush(fo)); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + + ro.total_order_seek = true; + // NOTE: total_order_seek no longer affects Get() + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ( + 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + // No bloom on extractor changed +#ifndef ROCKSDB_LITE + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ( + 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); +#endif // ROCKSDB_LITE + + get_perf_context()->Reset(); +} + +TEST_P(SpdbDBBloomFilterTestWithParam, WholeKeyFilterProp) { + for (bool partition_filters : {true, false}) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + bbto.whole_key_filtering = false; + if (partition_filters) { + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; 
+ + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(dbfull()->Flush(fo)); + + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Reopen with whole key filtering enabled and prefix extractor + // NULL. Bloom filter should be off for both of whole key and + // prefix bloom. + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor.reset(); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + // Write DB with only full key filtering. + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Reopen with both of whole key off and prefix extractor enabled. + // Still no bloom filter should be used. + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Try to create a DB with mixed files: + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + options.prefix_extractor.reset(); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + // Try to create a DB with mixed files. + ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); + // In this case needs insert some keys to make sure files are + // not filtered out by key ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(Flush()); + + // Now we have two files: + // File 1: An older file with prefix bloom. + // File 2: A newer file with whole bloom filter. 
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + + // Reopen with the same setting: only whole key is used + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + + // Restart with both filters are allowed + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + // File 1 will has it filtered out. + // File 2 will not, as prefix `foo` exists in the file. + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + + // Restart with only prefix bloom is allowed. 
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + uint64_t bloom_filter_useful_all_levels = 0; + for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { + if (kv.second.bloom_filter_useful > 0) { + bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; + } + } + ASSERT_EQ(12, bloom_filter_useful_all_levels); + get_perf_context()->Reset(); + } +} + +TEST_P(SpdbDBBloomFilterTestWithParam, BloomFilter) { + do { + Options options = CurrentOptions(); + env_->count_random_reads_ = true; + options.env = env_; + // ChangeCompactOptions() only changes compaction style, which does not + // trigger reset of table_factory + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + const auto kBpk = 20U; + const auto bytes_per_key = kBpk / 8; + table_options.filter_policy = Create(kBpk, kSpdbPairedBloom); + ASSERT_FALSE(table_options.filter_policy == nullptr); + table_options.partition_filters = partition_filters_; + if (partition_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + ASSERT_GE(table_options.format_version, 5U); + // value delta encoding challenged more with index interval > 1 + table_options.index_block_restart_interval = 8; + table_options.metadata_block_size = 32; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({"pikachu"}, options); + + // Populate multiple layers + const int N = 10000; + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + Compact(1, "a", "z"); + for (int i = 0; i < N; i += 100) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Flush(1)); + + // Prevent auto compactions triggered by seeks + env_->delay_sstable_sync_.store(true, std::memory_order_release); + + // Lookup present keys. Should rarely read from small sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + int reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d present => %d reads\n", N, reads); + ASSERT_GE(reads, N); + if (partition_filters_) { + // Without block cache, we read an extra partition filter per each + // level*read and a partition index per each read + ASSERT_LE(reads, 4 * N + 2 * N / 100); + } else { + ASSERT_LE(reads, N + 2 * N / 100); + } + + // Lookup present keys. Should rarely read from either sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing")); + } + reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d missing => %d reads\n", N, reads); + if (partition_filters_) { + // With partitioned filter we read one extra filter per level per each + // missed read. 
+ ASSERT_LE(reads, 2 * N + 3 * N / 100); + } else { + ASSERT_LE(reads, 3 * N / 100); + } + +#ifndef ROCKSDB_LITE + // Sanity check some table properties + std::map props; + ASSERT_TRUE(db_->GetMapProperty( + handles_[1], DB::Properties::kAggregatedTableProperties, &props)); + uint64_t nkeys = N + N / 100; + uint64_t filter_size = ParseUint64(props["filter_size"]); + // TODO: Our Filter has a min size of 8192 bytes (64 X 128) => The upper + // limit depends on the number of filters + // => Adapt the caclulation + // // // EXPECT_LE(filter_size, + // // // (partition_filters_ ? 12 : 11) * nkeys / /*bits / byte*/ + // 8); Always Bloom + EXPECT_GE(filter_size, static_cast(bytes_per_key * nkeys)); + + uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]); + EXPECT_EQ(num_filter_entries, nkeys); + + // // // fprintf(stderr, "filter_size:%d, num_filter_entries:%d, + // nkeys:%d\n", (int)filter_size, (int)num_filter_entries, (int)nkeys); +#endif // ROCKSDB_LITE + + env_->delay_sstable_sync_.store(false, std::memory_order_release); + Close(); + } while (ChangeCompactOptions()); +} + +namespace { + +class AlwaysTrueBitsBuilder : public FilterBitsBuilder { + public: + void AddKey(const Slice&) override {} + size_t EstimateEntriesAdded() override { return 0U; } + Slice Finish(std::unique_ptr* /* buf */) override { + // Interpreted as "always true" filter (0 probes over 1 byte of + // payload, 5 bytes metadata) + return Slice("\0\0\0\0\0\0", 6); + } + using FilterBitsBuilder::Finish; + size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; } +}; + +class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy { + public: + explicit AlwaysTrueFilterPolicy(bool skip) : skip_(skip) {} + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override { + if (skip_) { + return nullptr; + } else { + return new AlwaysTrueBitsBuilder(); + } + } + + private: + bool skip_; +}; + +} // namespace + +TEST_P(SpdbDBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { + constexpr int maxKey = 10; + auto PutFn = [&]() { + int i; + // Put + for (i = 0; i < maxKey; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + Flush(); + }; + auto GetFn = [&]() { + int i; + // Get OK + for (i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(Key(i))); + } + // Get NotFound + for (; i < maxKey * 2; i++) { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + }; + auto PutAndGetFn = [&]() { + PutFn(); + GetFn(); + }; +#ifndef ROCKSDB_LITE + std::map props; + const auto& kAggTableProps = DB::Properties::kAggregatedTableProperties; +#endif // ROCKSDB_LITE + + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.partition_filters = partition_filters_; + if (partition_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + + // Test 1: bits per key < 0.5 means skip filters -> no filter + // constructed or read. 
+ table_options.filter_policy = Create(0.4, kSpdbPairedBloom); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify no filter access nor contruction + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); + +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_EQ(props["filter_size"], "0"); +#endif // ROCKSDB_LITE + + // Test 2: use custom API to skip filters -> no filter constructed + // or read. + table_options.filter_policy.reset( + new AlwaysTrueFilterPolicy(/* skip */ true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify no filter access nor construction + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); + +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_EQ(props["filter_size"], "0"); +#endif // ROCKSDB_LITE + + // Control test: using an actual filter with 100% FP rate -> the filter + // is constructed and checked on read. + table_options.filter_policy.reset( + new AlwaysTrueFilterPolicy(/* skip */ false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify filter is accessed (and constructed) + EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), + maxKey * 2); + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + maxKey); +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_NE(props["filter_size"], "0"); +#endif // ROCKSDB_LITE + + // Test 3 (options test): Able to read existing filters with longstanding + // generated options file entry `filter_policy=rocksdb.BuiltinBloomFilter` + ASSERT_OK(FilterPolicy::CreateFromString(ConfigOptions(), + "rocksdb.BuiltinBloomFilter", + &table_options.filter_policy)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + GetFn(); + + // Verify filter is accessed + EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), + maxKey * 2); + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + maxKey); + + // But new filters are not generated (configuration details unknown) + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify no filter access nor construction + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); + +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_EQ(props["filter_size"], "0"); +#endif // ROCKSDB_LITE +} + +INSTANTIATE_TEST_CASE_P(DBBloomFilterTestWithParam, + SpdbDBBloomFilterTestWithParam, + ::testing::Values(false, true)); + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +INSTANTIATE_TEST_CASE_P(FormatDef, SpdbDBBloomFilterTestDefFormatVersion, + ::testing::Values(false, true)); + +INSTANTIATE_TEST_CASE_P(FormatDef, SpdbDBBloomFilterTestWithParam, + ::testing::Values(false, true)); + +INSTANTIATE_TEST_CASE_P(FormatLatest, SpdbDBBloomFilterTestWithParam, + ::testing::Values(false, true)); +#endif // 
!defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_F(SpdbDBBloomFilterTest, BloomFilterRate) { + while (ChangeFilterOptions()) { + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + Options options = CurrentOptions(options_override); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); + ASSERT_GE( + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful, + maxKey * 0.98); + get_perf_context()->Reset(); + } +} + +namespace { +struct CompatibilityConfig { + std::shared_ptr policy; + bool partitioned; + uint32_t format_version; + + void SetInTableOptions(BlockBasedTableOptions* table_options) { + table_options->filter_policy = policy; + table_options->partition_filters = partitioned; + if (partitioned) { + table_options->index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } else { + table_options->index_type = + BlockBasedTableOptions::IndexType::kBinarySearch; + } + table_options->format_version = format_version; + } +}; +// // // // High bits per key -> almost no FPs +// // // std::shared_ptr kCompatibilityBloomPolicy{ +// // // NewBloomFilterPolicy(20)}; +// // // // bloom_before_level=-1 -> always use Ribbon +// // // std::shared_ptr kCompatibilityRibbonPolicy{ +// // // NewRibbonFilterPolicy(20, -1)}; + +// // // std::vector kCompatibilityConfigs = { +// // // {Create(20, kDeprecatedBlock), false, +// // // BlockBasedTableOptions().format_version}, +// // // {kCompatibilityBloomPolicy, false, +// BlockBasedTableOptions().format_version}, +// // // {kCompatibilityBloomPolicy, true, +// BlockBasedTableOptions().format_version}, +// // // {kCompatibilityBloomPolicy, false, /* legacy Bloom */ 4U}, +// // // {kCompatibilityRibbonPolicy, false, +// // // BlockBasedTableOptions().format_version}, +// // // {kCompatibilityRibbonPolicy, true, +// BlockBasedTableOptions().format_version}, +// // // }; +} // namespace + +// // // TEST_F(SpdbDBBloomFilterTest, BloomFilterCompatibility) { +// // // Options options = CurrentOptions(); +// // // options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); +// // // options.level0_file_num_compaction_trigger = +// // // static_cast(kCompatibilityConfigs.size()) + 1; +// // // options.max_open_files = -1; + +// // // Close(); + +// // // // Create one file for each kind of filter. Each file covers a +// distinct key +// // // // range. 
+// // // for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) { +// // // BlockBasedTableOptions table_options; +// // // kCompatibilityConfigs[i].SetInTableOptions(&table_options); +// // // ASSERT_TRUE(table_options.filter_policy != nullptr); +// // // options.table_factory.reset(NewBlockBasedTableFactory(table_options)); +// // // Reopen(options); + +// // // std::string prefix = ToString(i) + "_"; +// // // ASSERT_OK(Put(prefix + "A", "val")); +// // // ASSERT_OK(Put(prefix + "Z", "val")); +// // // ASSERT_OK(Flush()); +// // // } + +// // // // Test filter is used between each pair of {reader,writer} +// configurations, +// // // // because any built-in FilterPolicy should be able to read filters +// from any +// // // // other built-in FilterPolicy +// // // for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) { +// // // BlockBasedTableOptions table_options; +// // // kCompatibilityConfigs[i].SetInTableOptions(&table_options); +// // // options.table_factory.reset(NewBlockBasedTableFactory(table_options)); +// // // Reopen(options); +// // // for (size_t j = 0; j < kCompatibilityConfigs.size(); ++j) { +// // // std::string prefix = ToString(j) + "_"; +// // // ASSERT_EQ("val", Get(prefix + "A")); // Filter positive +// // // ASSERT_EQ("val", Get(prefix + "Z")); // Filter positive +// // // // Filter negative, with high probability +// // // ASSERT_EQ("NOT_FOUND", Get(prefix + "Q")); +// // // // FULL_POSITIVE does not include block-based filter case (j == +// 0) +// // // EXPECT_EQ(TestGetAndResetTickerCount(options, +// BLOOM_FILTER_FULL_POSITIVE), +// // // j == 0 ? 0 : 2); +// // // EXPECT_EQ(TestGetAndResetTickerCount(options, +// BLOOM_FILTER_USEFUL), 1); +// // // } +// // // } +// // // } + +/* + * A cache wrapper that tracks peaks and increments of filter + * construction cache reservation. 
+ * p0 + * / \ p1 + * / \ /\ + * / \/ \ + * a / b \ + * peaks = {p0, p1} + * increments = {p1-a, p2-b} + */ +class FilterConstructResPeakTrackingCache : public CacheWrapper { + public: + explicit FilterConstructResPeakTrackingCache(std::shared_ptr target) + : CacheWrapper(std::move(target)), + cur_cache_res_(0), + cache_res_peak_(0), + cache_res_increment_(0), + last_peak_tracked_(false), + cache_res_increments_sum_(0) {} + + using Cache::Insert; + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + Status s = target_->Insert(key, value, charge, deleter, handle, priority); + if (deleter == kNoopDeleterForFilterConstruction) { + if (last_peak_tracked_) { + cache_res_peak_ = 0; + cache_res_increment_ = 0; + last_peak_tracked_ = false; + } + cur_cache_res_ += charge; + cache_res_peak_ = std::max(cache_res_peak_, cur_cache_res_); + cache_res_increment_ += charge; + } + return s; + } + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override { + auto deleter = GetDeleter(handle); + if (deleter == kNoopDeleterForFilterConstruction) { + if (!last_peak_tracked_) { + cache_res_peaks_.push_back(cache_res_peak_); + cache_res_increments_sum_ += cache_res_increment_; + last_peak_tracked_ = true; + } + cur_cache_res_ -= GetCharge(handle); + } + bool is_successful = target_->Release(handle, erase_if_last_ref); + return is_successful; + } + + std::deque GetReservedCachePeaks() { return cache_res_peaks_; } + + std::size_t GetReservedCacheIncrementSum() { + return cache_res_increments_sum_; + } + + private: + static const Cache::DeleterFn kNoopDeleterForFilterConstruction; + + std::size_t cur_cache_res_; + std::size_t cache_res_peak_; + std::size_t cache_res_increment_; + bool last_peak_tracked_; + std::deque cache_res_peaks_; + std::size_t cache_res_increments_sum_; +}; + +const Cache::DeleterFn + FilterConstructResPeakTrackingCache::kNoopDeleterForFilterConstruction = + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::TEST_GetNoopDeleterForRole(); + +// To align with the type of hash entry being reserved in implementation. +using FilterConstructionReserveMemoryHash = uint64_t; + +class DBFilterConstructionReserveMemoryTestWithParam + : public DBTestBase, + public testing::WithParamInterface> { + public: + DBFilterConstructionReserveMemoryTestWithParam() + : DBTestBase("db_bloom_filter_tests", + /*env_do_fsync=*/true), + num_key_(0), + reserve_table_builder_memory_(std::get<0>(GetParam())), + partition_filters_(std::get<1>(GetParam())), + detect_filter_construct_corruption_(std::get<2>(GetParam())) { + if (!reserve_table_builder_memory_) { + // For these cases, we only interested in whether filter construction + // cache resevation happens instead of its accuracy. Therefore we don't + // need many keys. + num_key_ = 5; + } else if (partition_filters_) { + // For PartitionFilter case, since we set + // table_options.metadata_block_size big enough such that each partition + // trigger at least 1 dummy entry reservation each for hash entries and + // final filter, we need a large number of keys to ensure we have at least + // two partitions. 
+ num_key_ = 18 * + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / + sizeof(FilterConstructionReserveMemoryHash); + } else { + // For Bloom Filter + FullFilter case, since we design the num_key_ to + // make hash entry cache reservation be a multiple of dummy entries, the + // correct behavior of charging final filter on top of it will trigger at + // least another dummy entry insertion. Therefore we can assert that + // behavior and we don't need a large number of keys to verify we + // indeed charge the final filter for cache reservation, even though final + // filter is a lot smaller than hash entries. + num_key_ = 1 * + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / + sizeof(FilterConstructionReserveMemoryHash); + + // TODO: Add support for this test for our filter !!!!!!!!!!!!!!!!!! + } + } + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions table_options; + + // We set cache capacity big enough to prevent cache full for convenience in + // calculation. + constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024; + + table_options.reserve_table_builder_memory = reserve_table_builder_memory_; + table_options.filter_policy = Create(10, kSpdbPairedBloom); + table_options.partition_filters = partition_filters_; + if (table_options.partition_filters) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + // We set table_options.metadata_block_size big enough so that each + // partition trigger at least 1 dummy entry insertion each for hash + // entries and final filter. + table_options.metadata_block_size = 409000; + } + table_options.detect_filter_construct_corruption = + detect_filter_construct_corruption_; + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + cache_ = std::make_shared( + (NewLRUCache(lo))); + table_options.block_cache = cache_; + + return table_options; + } + + std::size_t GetNumKey() { return num_key_; } + + bool ReserveTableBuilderMemory() { return reserve_table_builder_memory_; } + + bool PartitionFilters() { return partition_filters_; } + + std::shared_ptr + GetFilterConstructResPeakTrackingCache() { + return cache_; + } + + private: + std::size_t num_key_; + bool reserve_table_builder_memory_; + bool partition_filters_; + std::shared_ptr cache_; + bool detect_filter_construct_corruption_; +}; + +INSTANTIATE_TEST_CASE_P(DBFilterConstructionReserveMemoryTestWithParam, + DBFilterConstructionReserveMemoryTestWithParam, + ::testing::Values(std::make_tuple(true, false, false), + std::make_tuple(true, false, true), + std::make_tuple(true, true, false), + std::make_tuple(true, true, true))); + +// TODO: Speed up this test, and reduce disk space usage (~700MB) +// The current test inserts many keys (on the scale of dummy entry size) +// in order to make small memory user (e.g, final filter, partitioned hash +// entries/filter/banding) , which is proportional to the number of +// keys, big enough so that its cache reservation triggers dummy entry insertion +// and becomes observable in the test. +// +// However, inserting that many keys slows down this test and leaves future +// developers an opportunity to speed it up. +// +// Possible approaches & challenges: +// 1. 
Use sync point during cache reservation of filter construction +// +// Benefit: It does not rely on triggering dummy entry insertion +// but the sync point to verify small memory user is charged correctly. +// +// Challenge: this approach is intrusive. +// +// 2. Make dummy entry size configurable and set it small in the test +// +// Benefit: It increases the precision of cache reservation and therefore +// small memory usage can still trigger insertion of dummy entry. +// +// Challenge: change CacheReservationManager related APIs and a hack +// might be needed to control the size of dummmy entry of +// CacheReservationManager used in filter construction for testing +// since CacheReservationManager is not exposed at the high level. +// +TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { + // // // Options options = CurrentOptions(); + // // // // We set write_buffer_size big enough so that in the case where + // there is + // // // // filter construction cache reservation, flush won't be triggered + // before we + // // // // manually trigger it for clean testing + // // // options.write_buffer_size = 640 << 20; + // // // BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + // // // + // options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + // // // std::shared_ptr cache = + // // // GetFilterConstructResPeakTrackingCache(); + // // // options.create_if_missing = true; + // // // // Disable auto compaction to prevent its unexpected side effect + // // // // to the number of keys per partition designed by us in the test + // // // options.disable_auto_compactions = true; + // // // DestroyAndReopen(options); + // // // int num_key = static_cast(GetNumKey()); + // // // for (int i = 0; i < num_key; i++) { + // // // ASSERT_OK(Put(Key(i), Key(i))); + // // // } + + // // // ASSERT_EQ(cache->GetReservedCacheIncrementSum(), 0) + // // // << "Flush was triggered too early in the test case with filter " + // // // "construction cache reservation - please make sure no flush + // triggered " + // // // "during the key insertions above"; + + // // // ASSERT_OK(Flush()); + + // // // bool reserve_table_builder_memory = ReserveTableBuilderMemory(); + // // // std::string policy = kSpdbPairedBloom; + // // // bool partition_filters = PartitionFilters(); + // // // bool detect_filter_construct_corruption = + // // // table_options.detect_filter_construct_corruption; + + // // // std::deque filter_construction_cache_res_peaks = + // // // cache->GetReservedCachePeaks(); + // // // std::size_t filter_construction_cache_res_increments_sum = + // // // cache->GetReservedCacheIncrementSum(); + + // // // if (!reserve_table_builder_memory) { + // // // EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0); + // // // return; + // // // } + + // // // const std::size_t kDummyEntrySize = CacheReservationManagerImpl< + // // // CacheEntryRole::kFilterConstruction>::GetDummyEntrySize(); + + // // // const std::size_t predicted_hash_entries_cache_res = + // // // num_key * sizeof(FilterConstructionReserveMemoryHash); + // // // ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0) + // // // << "It's by this test's design that + // predicted_hash_entries_cache_res is " + // // // "a multipe of dummy entry"; + + // // // const std::size_t predicted_hash_entries_cache_res_dummy_entry_num = + // // // predicted_hash_entries_cache_res / kDummyEntrySize; + // // // const std::size_t predicted_final_filter_cache_res = + // // // static_cast( + // 
// // std::ceil(1.0 * + // predicted_hash_entries_cache_res_dummy_entry_num / 6 * 1)) * + // kDummyEntrySize; + // // // const std::size_t predicted_banding_cache_res = + // // // static_cast( + // // // std::ceil(predicted_hash_entries_cache_res_dummy_entry_num + // * 2.5)) * + // // // kDummyEntrySize; + +#if 0 + if (policy == kFastLocalBloom) { + /* kFastLocalBloom + FullFilter + * p0 + * / \ + * b / \ + * / \ + * / \ + * 0/ \ + * hash entries = b - 0, final filter = p0 - b + * p0 = hash entries + final filter + * + * The test is designed in a way such that the reservation for b is a + * multiple of dummy entries so that reservation for (p0 - b) + * will trigger at least another dummy entry insertion. + * + * kFastLocalBloom + FullFilter + + * detect_filter_construct_corruption + * The peak p0 stays the same as + * (kFastLocalBloom + FullFilter) but just lasts + * longer since we release hash entries reservation later. + * + * kFastLocalBloom + PartitionedFilter + * p1 + * / \ + * p0 b'/ \ + * / \ / \ + * b / \ / \ + * / \ / \ + * / a \ + * 0/ \ + * partitioned hash entries1 = b - 0, partitioned hash entries1 = b' - a + * parittioned final filter1 = p0 - b, parittioned final filter2 = p1 - b' + * + * (increment p0 - 0) + (increment p1 - a) + * = partitioned hash entries1 + partitioned hash entries2 + * + parittioned final filter1 + parittioned final filter2 + * = hash entries + final filter + * + * kFastLocalBloom + PartitionedFilter + + * detect_filter_construct_corruption + * The peak p0, p1 stay the same as + * (kFastLocalBloom + PartitionedFilter) but just + * last longer since we release hash entries reservation later. + * + */ + if (!partition_filters) { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1) + << "Filter construction cache reservation should have only 1 peak in " + "case: kFastLocalBloom + FullFilter"; + std::size_t filter_construction_cache_res_peak = + filter_construction_cache_res_peaks[0]; + EXPECT_GT(filter_construction_cache_res_peak, + predicted_hash_entries_cache_res) + << "The testing number of hash entries is designed to make hash " + "entries cache reservation be multiples of dummy entries" + " so the correct behavior of charging final filter on top of it" + " should've triggered at least another dummy entry insertion"; + + std::size_t predicted_filter_construction_cache_res_peak = + predicted_hash_entries_cache_res + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 0.9); + EXPECT_LE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 1.1); + return; + } else { + EXPECT_GE(filter_construction_cache_res_peaks.size(), 2) + << "Filter construction cache reservation should have multiple peaks " + "in case: kFastLocalBloom + " + "PartitionedFilter"; + std::size_t predicted_filter_construction_cache_res_increments_sum = + predicted_hash_entries_cache_res + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_increments_sum, + predicted_filter_construction_cache_res_increments_sum * 0.9); + EXPECT_LE(filter_construction_cache_res_increments_sum, + predicted_filter_construction_cache_res_increments_sum * 1.1); + return; + } + } +#endif +} + +class DBFilterConstructionCorruptionTestWithParam + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBFilterConstructionCorruptionTestWithParam() + : DBTestBase("db_bloom_filter_tests", + /*env_do_fsync=*/true) {} + + BlockBasedTableOptions 
GetBlockBasedTableOptions() { + BlockBasedTableOptions table_options; + table_options.detect_filter_construct_corruption = std::get<0>(GetParam()); + table_options.filter_policy = Create(20, kSpdbPairedBloom); + table_options.partition_filters = std::get<1>(GetParam()); + if (table_options.partition_filters) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + // We set table_options.metadata_block_size small enough so we can + // trigger filter partitioning with GetNumKey() amount of keys + table_options.metadata_block_size = 10; + } + + return table_options; + } + + // Return an appropriate amount of keys for testing + // to generate a long filter (i.e, size >= 8 + kMetadataLen) + std::size_t GetNumKey() { return 5000; } +}; + +INSTANTIATE_TEST_CASE_P(DBFilterConstructionCorruptionTestWithParam, + DBFilterConstructionCorruptionTestWithParam, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(true, false), + std::make_tuple(true, true))); + +TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + int num_key = static_cast(GetNumKey()); + Status s; + + // Case 1: No corruption in filter construction + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + s = Flush(); + EXPECT_TRUE(s.ok()); + + // Case 2: Corruption of hash entries in filter construction + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) { + std::deque* hash_entries_to_corrupt = + (std::deque*)arg; + assert(!hash_entries_to_corrupt->empty()); + *(hash_entries_to_corrupt->begin()) = + *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 }; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + if (table_options.detect_filter_construct_corruption) { + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE( + s.ToString().find("Filter's hash entries checksum mismatched") != + std::string::npos); + } else { + EXPECT_TRUE(s.ok()); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries"); + + // Case 3: Corruption of filter content in filter construction + DestroyAndReopen(options); + + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperFilter", [&](void* arg) { + std::pair*, std::size_t>* TEST_arg_pair = + (std::pair*, std::size_t>*)arg; + std::size_t filter_size = TEST_arg_pair->second; + // 5 is the kMetadataLen and + assert(filter_size >= 8 + 5); + std::unique_ptr* filter_content_to_corrupt = + TEST_arg_pair->first; + std::memset(filter_content_to_corrupt->get(), '\0', 8); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + if (table_options.detect_filter_construct_corruption) { + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(s.ToString().find("Corrupted filter content") != + std::string::npos); + } else { + EXPECT_TRUE(s.ok()); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + 
"TamperFilter"); +} + +// RocksDB lite does not support dynamic options +#ifndef ROCKSDB_LITE +TEST_P(DBFilterConstructionCorruptionTestWithParam, + DynamicallyTurnOnAndOffDetectConstructCorruption) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + // We intend to turn on + // table_options.detect_filter_construct_corruption dynamically + // therefore we override this test parmater's value + table_options.detect_filter_construct_corruption = false; + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + + int num_key = static_cast(GetNumKey()); + Status s; + + DestroyAndReopen(options); + + // Case 1: !table_options.detect_filter_construct_corruption + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) { + std::deque* hash_entries_to_corrupt = + (std::deque*)arg; + assert(!hash_entries_to_corrupt->empty()); + *(hash_entries_to_corrupt->begin()) = + *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 }; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries"); + + ASSERT_FALSE(table_options.detect_filter_construct_corruption); + EXPECT_TRUE(s.ok()); + + // Case 2: dynamically turn on + // table_options.detect_filter_construct_corruption + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{detect_filter_construct_corruption=true;}"}})); + + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + SyncPoint::GetInstance()->SetCallBack( + "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) { + std::deque* hash_entries_to_corrupt = + (std::deque*)arg; + assert(!hash_entries_to_corrupt->empty()); + *(hash_entries_to_corrupt->begin()) = + *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 }; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + s = Flush(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries"); + + auto updated_table_options = + db_->GetOptions().table_factory->GetOptions(); + EXPECT_TRUE(updated_table_options->detect_filter_construct_corruption); + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(s.ToString().find("Filter's hash entries checksum mismatched") != + std::string::npos); + + // Case 3: dynamically turn off + // table_options.detect_filter_construct_corruption + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{detect_filter_construct_corruption=false;}"}})); + updated_table_options = + db_->GetOptions().table_factory->GetOptions(); + EXPECT_FALSE(updated_table_options->detect_filter_construct_corruption); +} +#endif // ROCKSDB_LITE + +namespace { +// // // // NOTE: This class is referenced by HISTORY.md as a model for a +// wrapper +// // // // FilterPolicy selecting among configurations based on context. 
+// // // class LevelAndStyleCustomFilterPolicy : public FilterPolicy { +// // // public: +// // // explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int +// bpk_l0_other, +// // // int bpk_otherwise) +// // // : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)), +// // // policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)), +// // // policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {} + +// // // const char* Name() const override { +// // // return "LevelAndStyleCustomFilterPolicy"; +// // // } + +// // // // OK to use built-in policy name because we are deferring to a +// // // // built-in builder. We aren't changing the serialized format. +// // // const char* CompatibilityName() const override { +// // // return policy_fifo_->CompatibilityName(); +// // // } + +// // // FilterBitsBuilder* GetBuilderWithContext( +// // // const FilterBuildingContext& context) const override { +// // // if (context.compaction_style == kCompactionStyleFIFO) { +// // // return policy_fifo_->GetBuilderWithContext(context); +// // // } else if (context.level_at_creation == 0) { +// // // return policy_l0_other_->GetBuilderWithContext(context); +// // // } else { +// // // return policy_otherwise_->GetBuilderWithContext(context); +// // // } +// // // } + +// // // FilterBitsReader* GetFilterBitsReader(const Slice& contents) const +// override { +// // // // OK to defer to any of them; they all can parse built-in filters +// // // // from any settings. +// // // return policy_fifo_->GetFilterBitsReader(contents); +// // // } + +// // // private: +// // // const std::unique_ptr policy_fifo_; +// // // const std::unique_ptr policy_l0_other_; +// // // const std::unique_ptr policy_otherwise_; +// // // }; + +// // // static std::map +// // // table_file_creation_reason_to_string{ +// // // {TableFileCreationReason::kCompaction, "kCompaction"}, +// // // {TableFileCreationReason::kFlush, "kFlush"}, +// // // {TableFileCreationReason::kMisc, "kMisc"}, +// // // {TableFileCreationReason::kRecovery, "kRecovery"}, +// // // }; + +// // // class TestingContextCustomFilterPolicy +// // // : public LevelAndStyleCustomFilterPolicy { +// // // public: +// // // explicit TestingContextCustomFilterPolicy(int bpk_fifo, int +// bpk_l0_other, +// // // int bpk_otherwise) +// // // : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, +// bpk_otherwise) { +// // // } + +// // // FilterBitsBuilder* GetBuilderWithContext( +// // // const FilterBuildingContext& context) const override { +// // // test_report_ += "cf="; +// // // test_report_ += context.column_family_name; +// // // test_report_ += ",s="; +// // // test_report_ += +// // // OptionsHelper::compaction_style_to_string[context.compaction_style]; +// // // test_report_ += ",n="; +// // // test_report_ += ROCKSDB_NAMESPACE::ToString(context.num_levels); +// // // test_report_ += ",l="; +// // // test_report_ += +// ROCKSDB_NAMESPACE::ToString(context.level_at_creation); +// // // test_report_ += ",b="; +// // // test_report_ += +// ROCKSDB_NAMESPACE::ToString(int{context.is_bottommost}); +// // // test_report_ += ",r="; +// // // test_report_ += +// table_file_creation_reason_to_string[context.reason]; +// // // test_report_ += "\n"; + +// // // return +// LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); +// // // } + +// // // std::string DumpTestReport() { +// // // std::string rv; +// // // std::swap(rv, test_report_); +// // // return rv; +// // // } + +// // // private: +// // // mutable std::string test_report_; +// // // }; +} // 
namespace + +// // // TEST_F(SpdbDBBloomFilterTest, ContextCustomFilterPolicy) { +// // // auto policy = std::make_shared(15, +// 8, 5); +// // // Options options; +// // // for (bool fifo : {true, false}) { +// // // options = CurrentOptions(); +// // // options.max_open_files = fifo ? -1 : options.max_open_files; +// // // options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); +// // // options.compaction_style = +// // // fifo ? kCompactionStyleFIFO : kCompactionStyleLevel; + +// // // BlockBasedTableOptions table_options; +// // // table_options.filter_policy = policy; +// // // table_options.format_version = 5; +// // // options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + +// // // TryReopen(options); +// // // CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); + +// // // const int maxKey = 10000; +// // // for (int i = 0; i < maxKey / 2; i++) { +// // // ASSERT_OK(Put(1, Key(i), Key(i))); +// // // } +// // // // Add a large key to make the file contain wide range +// // // ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); +// // // Flush(1); +// // // EXPECT_EQ(policy->DumpTestReport(), +// // // fifo ? +// "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" +// // // : +// "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); + +// // // for (int i = maxKey / 2; i < maxKey; i++) { +// // // ASSERT_OK(Put(1, Key(i), Key(i))); +// // // } +// // // Flush(1); +// // // EXPECT_EQ(policy->DumpTestReport(), +// // // fifo ? +// "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" +// // // : +// "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); + +// // // // Check that they can be found +// // // for (int i = 0; i < maxKey; i++) { +// // // ASSERT_EQ(Key(i), Get(1, Key(i))); +// // // } +// // // // Since we have two tables / two filters, we might have Bloom +// checks on +// // // // our queries, but no more than one "useful" per query on a found +// key. +// // // EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), +// maxKey); + +// // // // Check that we have two filters, each about +// // // // fifo: 0.12% FP rate (15 bits per key) +// // // // level: 2.3% FP rate (8 bits per key) +// // // for (int i = 0; i < maxKey; i++) { +// // // ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); +// // // } +// // // { +// // // auto useful_count = +// // // TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); +// // // EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975)); +// // // EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 
0.9995 : 0.98)); +// // // } + +// // // if (!fifo) { // FIFO only has L0 +// // // // Full compaction +// // // ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], +// nullptr, +// // // nullptr)); +// // // EXPECT_EQ(policy->DumpTestReport(), +// // // "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n"); + +// // // // Check that we now have one filter, about 9.2% FP rate (5 bits +// per key) +// // // for (int i = 0; i < maxKey; i++) { +// // // ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); +// // // } +// // // { +// // // auto useful_count = +// // // TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); +// // // EXPECT_GE(useful_count, maxKey * 0.90); +// // // EXPECT_LE(useful_count, maxKey * 0.91); +// // // } +// // // } else { +// // // #ifndef ROCKSDB_LITE +// // // // Also try external SST file +// // // { +// // // std::string file_path = dbname_ + "/external.sst"; +// // // SstFileWriter sst_file_writer(EnvOptions(), options, +// handles_[1]); +// // // ASSERT_OK(sst_file_writer.Open(file_path)); +// // // ASSERT_OK(sst_file_writer.Put("key", "value")); +// // // ASSERT_OK(sst_file_writer.Finish()); +// // // } +// // // // Note: kCompactionStyleLevel is default, ignored if num_levels +// == -1 +// // // EXPECT_EQ(policy->DumpTestReport(), +// // // "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n"); +// // // #endif +// // // } + +// // // // Destroy +// // // ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); +// // // ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); +// // // handles_[1] = nullptr; +// // // } +// // // } + +class SliceTransformLimitedDomain : public SliceTransform { + const char* Name() const override { return "SliceTransformLimitedDomain"; } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), 5); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? + return src.size() >= 5 && src[0] == 'x'; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? 
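+    // e.g. the prefix "x1111" produced by Transform("x1111_AAAA") is in
+    // range, while "zzzzz" is not, since it does not start with 'x'.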
+ return dst.size() == 5 && dst[0] == 'x'; + } +}; + +TEST_F(SpdbDBBloomFilterTest, PrefixExtractorFullFilter) { + BlockBasedTableOptions bbto; + // Full Filter Block + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + bbto.whole_key_filtering = false; + + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("x1111_AAAA", "val1")); + ASSERT_OK(Put("x1112_AAAA", "val2")); + ASSERT_OK(Put("x1113_AAAA", "val3")); + ASSERT_OK(Put("x1114_AAAA", "val4")); + // Not in domain, wont be added to filter + ASSERT_OK(Put("zzzzz_AAAA", "val5")); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("x1111_AAAA"), "val1"); + ASSERT_EQ(Get("x1112_AAAA"), "val2"); + ASSERT_EQ(Get("x1113_AAAA"), "val3"); + ASSERT_EQ(Get("x1114_AAAA"), "val4"); + // Was not added to filter but rocksdb will try to read it from the filter + ASSERT_EQ(Get("zzzzz_AAAA"), "val5"); +} + +TEST_F(SpdbDBBloomFilterTest, PrefixExtractorBlockFilter) { + BlockBasedTableOptions bbto; + // Full Filter Block + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("x1113_AAAA", "val3")); + ASSERT_OK(Put("x1114_AAAA", "val4")); + // Not in domain, wont be added to filter + ASSERT_OK(Put("zzzzz_AAAA", "val1")); + ASSERT_OK(Put("zzzzz_AAAB", "val2")); + ASSERT_OK(Put("zzzzz_AAAC", "val3")); + ASSERT_OK(Put("zzzzz_AAAD", "val4")); + + ASSERT_OK(Flush()); + + std::vector iter_res; + auto iter = db_->NewIterator(ReadOptions()); + // Seek to a key that was not in Domain + for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) { + iter_res.emplace_back(iter->value().ToString()); + } + + std::vector expected_res = {"val1", "val2", "val3", "val4"}; + ASSERT_EQ(iter_res, expected_res); + delete iter; +} + +TEST_F(SpdbDBBloomFilterTest, MemtableWholeKeyBloomFilter) { + // regression test for #2743. 
the range delete tombstones in memtable should + // be added even when Get() skips searching due to its prefix bloom filter + const int kMemtableSize = 1 << 20; // 1MB + const int kMemtablePrefixFilterSize = 1 << 13; // 8KB + const int kPrefixLen = 4; + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + Options options = CurrentOptions(options_override); + options.memtable_prefix_bloom_size_ratio = + static_cast(kMemtablePrefixFilterSize) / kMemtableSize; + options.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen)); + options.write_buffer_size = kMemtableSize; + options.memtable_whole_key_filtering = false; + Reopen(options); + std::string key1("AAAABBBB"); + std::string key2("AAAACCCC"); // not in DB + std::string key3("AAAADDDD"); + std::string key4("AAAAEEEE"); + std::string value1("Value1"); + std::string value3("Value3"); + std::string value4("Value4"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + + // check memtable bloom stats + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + // same prefix, bloom filter false positive + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + + // enable whole key bloom filter + options.memtable_whole_key_filtering = true; + Reopen(options); + // check memtable bloom stats + ASSERT_OK(Put(key3, value3, WriteOptions())); + ASSERT_EQ("NOT_FOUND", Get(key2)); + // whole key bloom filter kicks in and determines it's a miss + ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + + // verify whole key filtering does not depend on prefix_extractor + options.prefix_extractor.reset(); + Reopen(options); + // check memtable bloom stats + ASSERT_OK(Put(key4, value4, WriteOptions())); + ASSERT_EQ("NOT_FOUND", Get(key2)); + // whole key bloom filter kicks in and determines it's a miss + ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); +} + +TEST_F(SpdbDBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) { + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + Options options = CurrentOptions(options_override); + options.memtable_prefix_bloom_size_ratio = 0.015; + options.memtable_whole_key_filtering = true; + Reopen(options); + std::string key1("AA"); + std::string key2("BB"); + std::string key3("CC"); + std::string key4("DD"); + std::string key_not("EE"); + std::string value1("Value1"); + std::string value2("Value2"); + std::string value3("Value3"); + std::string value4("Value4"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key2, value2, WriteOptions())); + ASSERT_OK(Flush()); + ASSERT_OK(Put(key3, value3, WriteOptions())); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put(key4, value4, WriteOptions())); + + // Delete key2 and key3 + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ")); + + // Read without snapshot + auto results = MultiGet({key_not, key1, key2, key3, key4}); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], "NOT_FOUND"); + ASSERT_EQ(results[3], "NOT_FOUND"); + ASSERT_EQ(results[4], value4); + + // Also check Get + ASSERT_EQ(Get(key1), value1); + ASSERT_EQ(Get(key2), "NOT_FOUND"); + ASSERT_EQ(Get(key3), "NOT_FOUND"); + ASSERT_EQ(Get(key4), value4); + + // Read with snapshot + results 
= MultiGet({key_not, key1, key2, key3, key4}, snapshot); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], value2); + ASSERT_EQ(results[3], value3); + ASSERT_EQ(results[4], "NOT_FOUND"); + + // Also check Get + ASSERT_EQ(Get(key1, snapshot), value1); + ASSERT_EQ(Get(key2, snapshot), value2); + ASSERT_EQ(Get(key3, snapshot), value3); + ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND"); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(SpdbDBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { + constexpr size_t kPrefixSize = 8; + const std::string kKey = "key"; + assert(kKey.size() < kPrefixSize); + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + Options options = CurrentOptions(options_override); + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize)); + options.memtable_prefix_bloom_size_ratio = 0.25; + Reopen(options); + ASSERT_OK(Put(kKey, "v")); + ASSERT_EQ("v", Get(kKey)); + std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + iter->Seek(kKey); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(kKey, iter->key()); + iter->SeekForPrev(kKey); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(kKey, iter->key()); +} + +#ifndef ROCKSDB_LITE +namespace { +static const std::string kPlainTable = "test_PlainTableBloom"; +} // namespace + +class BloomStatsTestWithParam + : public SpdbDBBloomFilterTest, + public testing::WithParamInterface> { + public: + BloomStatsTestWithParam() { + partition_filters_ = std::get<1>(GetParam()); + + options_.create_if_missing = true; + options_.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4)); + options_.memtable_prefix_bloom_size_ratio = + 8.0 * 1024.0 / static_cast(options_.write_buffer_size); + BlockBasedTableOptions table_options; + if (partition_filters_) { + table_options.partition_filters = partition_filters_; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + table_options.filter_policy = Create(10, kSpdbPairedBloom); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options_.env = env_; + + get_perf_context()->Reset(); + DestroyAndReopen(options_); + } + + ~BloomStatsTestWithParam() override { + get_perf_context()->Reset(); + Destroy(options_); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + bool partition_filters_; + Options options_; +}; + +// 1 Insert 2 K-V pairs into DB +// 2 Call Get() for both keys - expext memtable bloom hit stat to be 2 +// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1 +// 4 Call Flush() to create SST +// 5 Call Get() for both keys - expext SST bloom hit stat to be 2 +// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1 +// Test both: block and plain SST +TEST_P(BloomStatsTestWithParam, BloomStatsTest) { + std::string key1("AAAA"); + std::string key2("RXDB"); // not in DB + std::string key3("ZBRA"); + std::string value1("Value1"); + std::string value3("Value3"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key3, value3, WriteOptions())); + + // check memtable bloom stats + ASSERT_EQ(value1, Get(key1)); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + ASSERT_EQ(value3, Get(key3)); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(1, 
get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + + // sanity checks + ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count); + + Flush(); + + // sanity checks + ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count); + + // check SST bloom stats + ASSERT_EQ(value1, Get(key1)); + ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count); + ASSERT_EQ(value3, Get(key3)); + ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count); + + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); +} + +// Same scenario as in BloomStatsTest but using an iterator +TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { + std::string key1("AAAA"); + std::string key2("RXDB"); // not in DB + std::string key3("ZBRA"); + std::string value1("Value1"); + std::string value3("Value3"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key3, value3, WriteOptions())); + + std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + + // check memtable bloom stats + iter->Seek(key1); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value1, iter->value().ToString()); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + + iter->Seek(key3); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value3, iter->value().ToString()); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + + iter->Seek(key2); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + + Flush(); + + iter.reset(dbfull()->NewIterator(ReadOptions())); + + // Check SST bloom stats + iter->Seek(key1); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value1, iter->value().ToString()); + ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count); + + iter->Seek(key3); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value3, iter->value().ToString()); + // The seek doesn't check block-based bloom filter because last index key + // starts with the same prefix we're seeking to. + uint64_t expected_hits = 2; + ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); + + iter->Seek(key2); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); + ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); +} + +// // INSTANTIATE_TEST_CASE_P( +// // BloomStatsTestWithParam, BloomStatsTestWithParam, +// // ::testing::Values(false, true)); + +namespace { +void PrefixScanInit(SpdbDBBloomFilterTest* dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. 
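+  // (For the "03______:" prefix used by the PrefixScan test below, those two
+  // files are the GROUP 1 files covering [2,3] and [3,4]; the GROUP 0 and
+  // GROUP 2 files span the key range but only hold keys with other prefixes,
+  // so the prefix filter screens them out.)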
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + ASSERT_OK(dbtest->Flush()); + ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr)); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } +} +} // namespace + +TEST_F(SpdbDBBloomFilterTest, PrefixScan) { + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + ASSERT_EQ(key.difference_offset(prefix), 8); + ASSERT_EQ(prefix.difference_offset(key), 8); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + assert(!options.unordered_write); + // It is incompatible with allow_concurrent_memtable_write=false + options.allow_concurrent_memtable_write = false; + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // 11 RAND I/Os + DestroyAndReopen(options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (!iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while +} + +// TODO: The filter builder is created always with OFFM = false, both for us and +// rocksdb. Is that how it's supposed to be? 
+TEST_F(SpdbDBBloomFilterTest, OptimizeFiltersForHits) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; + options.arena_block_size = 4 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 256 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.compression = kNoCompression; + options.compaction_style = kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + BlockBasedTableOptions bbto; + bbto.cache_index_and_filter_blocks = true; + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.optimize_filters_for_hits = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + CreateAndReopenWithCF({"mypikachu"}, options); + + int numkeys = 200000; + + // Generate randomly shuffled keys, so the updates are almost + // random. + std::vector keys; + keys.reserve(numkeys); + for (int i = 0; i < numkeys; i += 2) { + keys.push_back(i); + } + RandomShuffle(std::begin(keys), std::end(keys)); + int num_inserted = 0; + for (int key : keys) { + ASSERT_OK(Put(1, Key(key), "val")); + if (++num_inserted % 1000 == 0) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + } + ASSERT_OK(Put(1, Key(0), "val")); + ASSERT_OK(Put(1, Key(numkeys), "val")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + if (NumTableFilesAtLevel(0, 1) == 0) { + // No Level 0 file. Create one. + ASSERT_OK(Put(1, Key(0), "val")); + ASSERT_OK(Put(1, Key(numkeys), "val")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + + for (int i = 1; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND"); + } + + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + + // Now we have three sorted run, L0, L5 and L6 with most files in L6 have + // no bloom filter. Most keys be checked bloom filters twice. 
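+  // (About 100000 non-existent keys were read above; each miss can be
+  // answered by the filters of the L0 and L5 files, while most of the L6
+  // files carry no filter, so roughly 2 useful filter checks per key are
+  // expected, with some slack for false positives and extra L0 files.)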
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); + ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); + uint64_t bloom_filter_useful_all_levels = 0; + for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { + if (kv.second.bloom_filter_useful > 0) { + bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; + } + } + ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2); + ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2); + + for (int i = 0; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + // Part 2 (read path): rewrite last level with blooms, then verify they get + // cached only if !optimize_filters_for_hits + options.disable_auto_compactions = true; + options.num_levels = 9; + options.optimize_filters_for_hits = false; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + MoveFilesToLevel(7 /* level */, 1 /* column family index */); + + std::string value = Get(1, Key(0)); + uint64_t prev_cache_filter_hits = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + value = Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_hits + 1, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Now that we know the filter blocks exist in the last level files, see if + // filter caching is skipped for this optimization + options.optimize_filters_for_hits = true; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + value = Get(1, Key(0)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(2 /* index and data block */, + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // Check filter block ignored for files preloaded during DB::Open() + options.max_open_files = -1; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + uint64_t prev_cache_filter_misses = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Check filter block ignored for file trivially-moved to bottom level + bbto.block_cache.reset(); + options.max_open_files = 100; // setting > -1 makes it not preload all files + options.statistics = CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + ASSERT_OK(Put(1, Key(numkeys + 1), "val")); + ASSERT_OK(Flush(1)); + + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CompactRangeOptions 
compact_options;
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kSkip;
+  compact_options.change_level = true;
+  compact_options.target_level = 7;
+  ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+                  .IsNotSupported());
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  prev_cache_filter_misses =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  value = Get(1, Key(numkeys + 1));
+  ASSERT_EQ(prev_cache_filter_hits,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(prev_cache_filter_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+
+  // Check filter block not cached for iterator
+  bbto.block_cache.reset();
+  options.statistics = CreateDBStatistics();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1]));
+  iter->SeekToFirst();
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(2 /* index and data block */,
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  get_perf_context()->Reset();
+}
+
+int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
+  int count = 0;
+  for (iter->Seek(key); iter->Valid(); iter->Next()) {
+    count++;
+  }
+  EXPECT_OK(iter->status());
+  return count;
+}
+
+// Use iterate_upper_bound to hint at the compatibility of existing bloom
+// filters. The BF is considered compatible if 1) the upper bound and the seek
+// key transform into the same string, or 2) the transformed seek key is of
+// the same length as the upper bound and the two keys are adjacent according
+// to the comparator.
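+// For example, with a capped:4 prefix extractor, seeking to "abcdxx00"
+// (prefix "abcd") with the upper bound "abcdzzzz" (also prefix "abcd")
+// satisfies (1), while the upper bound "abce" satisfies (2), since it has the
+// same length as the transformed seek key and immediately follows "abcd" in
+// the comparator order.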
+TEST_F(SpdbDBBloomFilterTest, DynamicBloomFilterUpperBound) { + auto bfp_impl = kSpdbPairedBloom; + int using_full_builder = true; + Options options; + options.create_if_missing = true; + options.env = CurrentOptions().env; + options.prefix_extractor.reset(NewCappedPrefixTransform(4)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy = Create(20, bfp_impl); + table_options.index_shortening = BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("abcdxxx0", "val1")); + ASSERT_OK(Put("abcdxxx1", "val2")); + ASSERT_OK(Put("abcdxxx2", "val3")); + ASSERT_OK(Put("abcdxxx3", "val4")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + { + // prefix_extractor has not changed, BF will always be read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + } + { + Slice upper_bound("abcdzzzz"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.FixedPrefix.5"); + { + // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); + // should check bloom filter since upper bound meets requirement + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx01, abcey) is not valid bound since upper bound is too long for + // the BF in SST (capped:4) + Slice upper_bound("abcey"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); + // should skip bloom filter since upper bound is too long + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx02, abcdy) is a valid bound since the prefix is the same + Slice upper_bound("abcdy"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx02"), 4); + // should check bloom filter since upper bound matches transformed seek + // key + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder * 
2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the + // same prefix, 2) the prefixes are not consecutive + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); + // should skip bloom filter since mismatch is found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); + { + // [abc, abd) is not a valid bound since the upper bound is too short + // for BF (capped:4) + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}})); + { + // set back to capped:4 and verify BF is always read + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } +} + +// Create multiple SST files each with a different prefix_extractor config, +// verify iterators can read all SST files using the latest config. 
+TEST_F(SpdbDBBloomFilterTest, DynamicBloomFilterMultipleSST) { + auto bfp_impl = kSpdbPairedBloom; + int using_full_builder = true; + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.filter_policy = Create(20, bfp_impl); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Slice upper_bound("foz90000"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + + // first SST with fixed:1 BF + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foq1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + using_full_builder); + ASSERT_EQ(CountIter(iter, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + using_full_builder); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + // second SST with capped:3 BF + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foq5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + { + // BF is cappped:3 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // both counters are incremented because BF is "not changed" for 1 of the + // 2 SST files, so filter is checked once and found no match. 
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.FixedPrefix.2"); + // third SST with fixed:2 BF + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foq8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + { + // BF is fixed:2 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); + // the first and last BF are checked + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 4 + using_full_builder * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // only last BF is checked and not found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 5 + using_full_builder * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + } + + // iter_old can only see the first SST, so checked plus 1 + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + using_full_builder * 3); + // iter was created after the first setoptions call so only full filter + // will check the filter + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + using_full_builder * 4); + + { + // keys in all three SSTs are visible to iterator + // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2) + // so +2 for checked counter + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 7 + using_full_builder * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 8 + using_full_builder * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); + { + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 6); + // all three SST are checked because the current options has the same as + // the remaining SST (capped:3) + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 9 + using_full_builder * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 10 + using_full_builder * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); + } + // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? +} + +// Create a new column family in a running DB, change prefix_extractor +// dynamically, verify the iterator created on the new column family behaves +// as expected +// TODO: No filter is created here (in rocksdb's test it's the same) => Why is +// this test in this suite? 
+TEST_F(SpdbDBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { + auto bfp_impl = kSpdbPairedBloom; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy = Create(20, bfp_impl); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu0"}, options); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + // create a new CF and set prefix_extractor dynamically + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + CreateColumnFamilies({"ramen_dojo_0"}, options); + ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); + ASSERT_OK(Put(2, "foo3", "bar3")); + ASSERT_OK(Put(2, "foo4", "bar4")); + ASSERT_OK(Put(2, "foo5", "bar5")); + ASSERT_OK(Put(2, "foq6", "bar6")); + ASSERT_OK(Put(2, "fpq7", "bar7")); + dbfull()->Flush(FlushOptions()); + { + std::unique_ptr iter(db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK( + dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), + "rocksdb.FixedPrefix.2"); + { + std::unique_ptr iter(db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2])); + handles_[2] = nullptr; + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_[1] = nullptr; +} + +// Verify it's possible to change prefix_extractor at runtime and iterators +// behaves as expected +TEST_F(SpdbDBBloomFilterTest, DynamicBloomFilterOptions) { + auto bfp_impl = kSpdbPairedBloom; + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy = Create(20, bfp_impl); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foo5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foo8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + + ReadOptions read_options; + read_options.prefix_same_as_start = true; + { + std::unique_ptr 
iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); + { + std::unique_ptr iter(db_->NewIterator(read_options)); + // "fp*" should be skipped + ASSERT_EQ(CountIter(iter, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + + // iterator created before should not be affected and see all keys + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_old, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); +} + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/plugin/speedb/paired_filter/speedb_paired_bloom.cc b/plugin/speedb/paired_filter/speedb_paired_bloom.cc new file mode 100644 index 0000000000..32c796dba6 --- /dev/null +++ b/plugin/speedb/paired_filter/speedb_paired_bloom.cc @@ -0,0 +1,125 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "plugin/speedb/paired_filter/speedb_paired_bloom.h" + +#include "plugin/speedb/paired_filter/speedb_paired_bloom_internal.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/filter_policy_internal.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +SpdbPairedBloomFilterPolicy::SpdbPairedBloomFilterPolicy(double bits_per_key) { + constexpr double kMinBitsPerKey = speedb_filter::kMinMillibitsPerKey / 1000; + + // Sanitize bits_per_key + if (bits_per_key < 0.5) { + // Round down to no filter + bits_per_key = 0; + } else if (bits_per_key < kMinBitsPerKey) { + // Minimum 1 bit per key (equiv) when creating filter + bits_per_key = kMinBitsPerKey; + } else if (!(bits_per_key < kMaxBitsPerKey)) { // including NaN + bits_per_key = kMaxBitsPerKey; + } + + // Includes a nudge toward rounding up, to ensure on all platforms + // that doubles specified with three decimal digits after the decimal + // point are interpreted accurately. 
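+  // (For instance, a requested 8.999 bits/key maps to exactly 8999 millibits
+  // below, even if 8.999 * 1000.0 evaluates to slightly less than 8999 in
+  // floating point.)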
+ millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); +} + +FilterBitsBuilder* SpdbPairedBloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (millibits_per_key_ == 0) { + // "No filter" special case + return nullptr; + } + + // TODO: The code below is duplicates from + // BloomLikeFilterPolicy::GetFastLocalBloomBuilderWithContext + // TODO: See if it may be refactored to a static method + bool offm = context.table_options.optimize_filters_for_memory; + + // TODO: Refactor this to a static method of BloomLikeFilterPolicy + bool reserve_filter_construction_mem = + (context.table_options.reserve_table_builder_memory && + context.table_options.block_cache); + std::shared_ptr cache_res_mgr; + if (reserve_filter_construction_mem) { + cache_res_mgr = std::make_shared< + CacheReservationManagerImpl>( + context.table_options.block_cache); + } + + return new SpdbPairedBloomBitsBuilder( + millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr, + cache_res_mgr, context.table_options.detect_filter_construct_corruption, + std::bind(&SpdbPairedBloomFilterPolicy::GetFilterBitsReader, this, + std::placeholders::_1)); +} + +FilterBitsReader* SpdbPairedBloomFilterPolicy::GetFilterBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + const auto trailer_len = speedb_filter::FilterMetadata::kMetadataLen; + if (len_with_meta <= trailer_len) { + // filter is empty or broken. Treat like zero keys added. + return new AlwaysFalseFilter(); + } + + const auto len = len_with_meta - trailer_len; + const char* metadata_start = &contents.data()[len]; + + auto trailer_data = + speedb_filter::FilterMetadata::ReadMetadata(metadata_start); + switch (trailer_data.filter_type) { + case speedb_filter::FilterType::kPairedBlockBloom: + return new SpdbPairedBloomBitsReader(contents.data(), + trailer_data.num_probes, len); + break; + + case speedb_filter::FilterType::kFutureUnknown: + return new AlwaysTrueFilter(); + break; + + default: + assert(0); + return new AlwaysTrueFilter(); + } +} + +std::string SpdbPairedBloomFilterPolicy::GetId() const { + return Name() + + BloomLikeFilterPolicy::GetBitsPerKeySuffix(millibits_per_key_); +} + +bool SpdbPairedBloomFilterPolicy::IsInstanceOf(const std::string& name) const { + if (name == kClassName()) { + return true; + } else { + return FilterPolicy::IsInstanceOf(name); + } +} + +const char* SpdbPairedBloomFilterPolicy::kClassName() { + return "speedb_paired_bloom_filter"; +} + +const char* SpdbPairedBloomFilterPolicy::kNickName() { + return "speedb.PairedBloomFilter"; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/plugin/speedb/paired_filter/speedb_paired_bloom.h b/plugin/speedb/paired_filter/speedb_paired_bloom.h new file mode 100644 index 0000000000..25c0e5be6d --- /dev/null +++ b/plugin/speedb/paired_filter/speedb_paired_bloom.h @@ -0,0 +1,95 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "rocksdb/filter_policy.h" + +namespace ROCKSDB_NAMESPACE { + +// Forward Declarations +class ObjectLibrary; +struct FilterBuildingContext; + +// In the default cache-local bloom filter in RocksDB +// (FastLocalBloomFilterPolicy) the trade-off between memory and false positive +// rate is significantly worse than the theoretical standard bloom filter, +// however it is significantly faster in terms of CPU. This trade-off +// deteriorates performance/memory footprint especially in use cases in which +// large accuracy of the filter is needed (typically from ~20 bits-per-key). +// +// For really high bits-per-key there could be orders of magnitude difference in +// the false positive rate. Ribbon filter is generally better than bloom filter +// in the trade-off (takes ~30% less memory to obtain the same false positive +// rate. However, its construction and use is slower by a factor of ~4 than +// bloom filter, so in use cases that require fast testing and construction +// ribbon filter cannot be used. +// +// This filter is fast and low on CPU consumption on the one hand, but with a +// better memory footprint- FPR trade-off on the other hand. +// +class SpdbPairedBloomFilterPolicy : public FilterPolicy { + public: + // Max supported BPK. Filters using higher BPK-s will use the max + static constexpr double kMaxBitsPerKey = 100.0; + + public: + explicit SpdbPairedBloomFilterPolicy(double bits_per_key); + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; + + // Plug-In Support + public: + static const char* kClassName(); + const char* Name() const override { return kClassName(); } + static const char* kNickName(); + const char* NickName() const override { return kNickName(); } + + std::string GetId() const override; + + bool IsInstanceOf(const std::string& name) const override; + + // This filter is NOT compatible with RocksDB's built-in filter, only with + // itself + const char* CompatibilityName() const override { + return kCompatibilityName(); + } + static const char* kCompatibilityName() { return kClassName(); } + + private: + // This filter supports fractional bits per key. For predictable behavior + // of 0.001-precision values across floating point implementations, we + // round to thousandths of a bit (on average) per key. + int millibits_per_key_; + + // State for implementing optimize_filters_for_memory. Essentially, this + // tracks a surplus or deficit in total FP rate of filters generated by + // builders under this policy vs. what would have been generated without + // optimize_filters_for_memory. + // + // To avoid floating point weirdness, the actual value is + // Sum over all generated filters f: + // (predicted_fp_rate(f) - predicted_fp_rate(f|o_f_f_m=false)) * 2^32 + mutable std::atomic aggregate_rounding_balance_; +}; + +// Plug-In Support +extern "C" { +int register_SpdbPairedBloomFilter(ROCKSDB_NAMESPACE::ObjectLibrary& library, + const std::string&); +} // extern "C" + +} // namespace ROCKSDB_NAMESPACE diff --git a/plugin/speedb/paired_filter/speedb_paired_bloom_internal.cc b/plugin/speedb/paired_filter/speedb_paired_bloom_internal.cc new file mode 100644 index 0000000000..60ab5c8bfe --- /dev/null +++ b/plugin/speedb/paired_filter/speedb_paired_bloom_internal.cc @@ -0,0 +1,845 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "plugin/speedb/paired_filter/speedb_paired_bloom_internal.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "port/likely.h" // for LIKELY +#include "port/port.h" // for PREFETCH +#include "test_util/sync_point.h" +#include "util/bloom_impl.h" +#include "util/fastrange.h" + +#ifdef HAVE_AVX2 +#include +#endif + +namespace ROCKSDB_NAMESPACE { + +namespace { + +using InBatchBlockIdx = uint8_t; + +// We currently assume the in-batch block index fits within the 1st byte (8 +// bits) of the block and it is a power of 2 +static_assert(speedb_filter::kPairedBloomBatchSizeInBlocks <= (1 << 8U)); +static_assert((speedb_filter::kPairedBloomBatchSizeInBlocks > 0) && + ((speedb_filter::kPairedBloomBatchSizeInBlocks & + (speedb_filter::kPairedBloomBatchSizeInBlocks - 1)) == 0)); + +// Number of bits to point to any block in a batch (in-batch block index) +static const uint32_t kInBatchIdxNumBits = + std::ceil(std::log2(speedb_filter::kPairedBloomBatchSizeInBlocks)); + +// kBlockSizeInBytes must be a power of 2 (= Cacheline size) +constexpr uint32_t kBlockSizeInBytes = 64U; +static_assert((kBlockSizeInBytes > 0) && + ((kBlockSizeInBytes & (kBlockSizeInBytes - 1)) == 0)); +constexpr uint32_t kBlockSizeInBits = kBlockSizeInBytes * 8U; +static const uint32_t kBlockSizeNumBits = + std::ceil(std::log2(kBlockSizeInBits)); +static const uint32_t kNumBlockSizeBitsShiftBits = 32 - kBlockSizeNumBits; + +// Number of bits to represent kBlockSizeInBytes +static const uint32_t kNumBitsForBlockSize = std::log2(kBlockSizeInBytes); +static const uint32_t KNumBitsInBlockBloom = + kBlockSizeInBits - kInBatchIdxNumBits; + +constexpr uint32_t kBatchSizeInBytes = + speedb_filter::kPairedBloomBatchSizeInBlocks * kBlockSizeInBytes; + +constexpr uint64_t kNumMillibitsInByte = 8 * 1000U; + +[[maybe_unused]] constexpr uint32_t kMaxSupportLenWithMetadata = 0xffffffffU; +constexpr uint32_t kMaxSupportedSizeNoMetadata = 0xffffffc0U; + +constexpr size_t kMaxNumProbes = 30U; +static_assert(kMaxNumProbes % 2 == 0U); + +static const uint8_t kInBatchIdxMask = (uint8_t{1U} << kInBatchIdxNumBits) - 1; +static const uint8_t kFirstByteBitsMask = ~kInBatchIdxMask; + +// ================================================================================================== +// +// Helper Functions +// + +inline uint32_t HashToGlobalBlockIdx(uint32_t h1, uint32_t len_bytes) { + return FastRange32(h1, len_bytes >> kNumBitsForBlockSize); +} + +inline void PrefetchBlock(const char* block_address) { + PREFETCH(block_address, 0 /* rw */, 1 /* locality */); + PREFETCH(block_address + kBlockSizeInBytes - 1, 0 /* rw */, 1 /* locality */); +} + +inline uint32_t GetContainingBatchIdx(uint32_t global_block_idx) { + return (global_block_idx / speedb_filter::kPairedBloomBatchSizeInBlocks); +} + +inline uint8_t GetInBatchBlockIdx(uint32_t global_block_idx) { + return (global_block_idx % speedb_filter::kPairedBloomBatchSizeInBlocks); +} + +inline uint32_t 
GetHashSetSelector(uint32_t first_in_batch_block_idx, + uint32_t second_in_batch_block_idx) { + assert((first_in_batch_block_idx < + speedb_filter::kPairedBloomBatchSizeInBlocks) && + (second_in_batch_block_idx < + speedb_filter::kPairedBloomBatchSizeInBlocks)); + return (first_in_batch_block_idx < second_in_batch_block_idx) ? 0U : 1U; +} + +inline uint32_t GetFirstGlobalBlockIdxOfBatch(uint32_t batch_idx) { + return batch_idx * speedb_filter::kPairedBloomBatchSizeInBlocks; +} + +inline char* GetBlockAddress(char* data, uint32_t global_block_idx) { + return (data + global_block_idx * kBlockSizeInBytes); +} + +inline const char* GetBlockAddress(const char* data, + uint32_t global_block_idx) { + return (data + global_block_idx * kBlockSizeInBytes); +} + +inline double CalcAdjustedBitsPerKey(size_t millibits_per_key) { + return ((millibits_per_key * KNumBitsInBlockBloom) / kBlockSizeInBits / 1000); +} + +inline double CalcRawNumProbes(size_t millibits_per_key) { + static const auto log_2 = std::log(2); + return (log_2 * CalcAdjustedBitsPerKey(millibits_per_key)); +} + +inline size_t CalcNumProbes(size_t millibits_per_key) { + double raw_num_probes = CalcRawNumProbes(millibits_per_key); + + // Num probes must be even + auto num_probes = static_cast(std::ceil(raw_num_probes / 2.0) * 2); + assert(num_probes % 2 == 0U); + + return std::min(num_probes, kMaxNumProbes); +} + +// False positive rate of a standard Bloom filter, for given ratio of +// filter memory bits to added keys, and number of probes per operation. +// (The false positive rate is effectively independent of scale, assuming +// the implementation scales OK.) +inline double SpdbStandardFpRate(double bits_per_key, double raw_num_probes) { + // Standard very-good-estimate formula. See + // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives + return std::pow(1.0 - std::exp(-raw_num_probes / bits_per_key), + raw_num_probes); +} + +class BuildBlock { + public: + BuildBlock() = default; + BuildBlock(char* data, uint32_t global_block_idx, bool prefetch_block); + + uint8_t GetInBatchBlockIdxOfPair() const; + void SetInBatchBlockIdxOfPair(InBatchBlockIdx pair_batch_block_idx); + void SetBlockBloomBits(uint32_t hash, uint32_t set_idx, size_t hash_set_size); + + private: + char* const block_address_ = nullptr; +}; + +inline BuildBlock::BuildBlock(char* data, uint32_t global_block_idx, + bool prefetch_block) + : block_address_(GetBlockAddress(data, global_block_idx)) { + if (prefetch_block) { + PrefetchBlock(block_address_); + } +} + +inline uint8_t BuildBlock::GetInBatchBlockIdxOfPair() const { + return static_cast(*block_address_) & kInBatchIdxMask; +} + +inline void BuildBlock::SetInBatchBlockIdxOfPair( + InBatchBlockIdx pair_batch_block_idx) { + assert(((*block_address_ & kInBatchIdxMask) == 0U) || + ((*block_address_ & kInBatchIdxMask) == pair_batch_block_idx)); + + *block_address_ = + (pair_batch_block_idx | (*block_address_ & kFirstByteBitsMask)); +} + +inline int GetBitPosInBlockForHash(uint32_t hash, uint32_t set_idx) { + assert(set_idx <= 1U); + + int bitpos = 0; + + if (set_idx == 0) { + bitpos = hash >> 23; + if (LIKELY(bitpos > 6)) { + return bitpos; + } + hash <<= 9; + } else { + constexpr uint32_t mask = 0x007FC000; + bitpos = (hash & mask) >> 14; + if (LIKELY(bitpos > 6)) { + return bitpos; + } + } + + return kInBatchIdxNumBits + + (static_cast(KNumBitsInBlockBloom * + (hash >> kBlockSizeNumBits)) >> + (kNumBlockSizeBitsShiftBits)); +} + +inline void BuildBlock::SetBlockBloomBits(uint32_t hash, uint32_t 
set_idx, + size_t hash_set_size) { + for (auto i = 0U; i < hash_set_size; ++i) { + int bitpos = GetBitPosInBlockForHash(hash, set_idx); + block_address_[bitpos >> 3] |= (char{1} << (bitpos & kInBatchIdxNumBits)); + hash *= 0x9e3779b9; + } +} + +class ReadBlock { + public: + ReadBlock(const char* data, uint32_t global_block_idx, bool prefetch_block); + + uint8_t GetInBatchBlockIdxOfPair() const; + bool AreAllBlockBloomBitsSet(uint32_t hash, uint32_t set_idx, + size_t hash_set_size) const; + + private: +#ifdef HAVE_AVX2 + bool AreAllBlockBloomBitsSetAvx2(uint32_t hash, uint32_t set_idx, + size_t hash_set_size) const; +#endif + bool AreAllBlockBloomBitsSetNonAvx2(uint32_t hash, uint32_t set_idx, + size_t hash_set_size) const; + + private: + const char* const block_address_; +}; + +inline ReadBlock::ReadBlock(const char* data, uint32_t global_block_idx, + bool prefetch_block) + : block_address_(GetBlockAddress(data, global_block_idx)) { + if (prefetch_block) { + PrefetchBlock(block_address_); + } +} + +inline uint8_t ReadBlock::GetInBatchBlockIdxOfPair() const { + return static_cast(*block_address_) & kInBatchIdxMask; +} + +bool ReadBlock::AreAllBlockBloomBitsSet(uint32_t hash, uint32_t set_idx, + size_t hash_set_size) const { +#ifdef HAVE_AVX2 + // The AVX2 code currently supports only cache-line / block sizes of 64 bytes + // (512 bits) + if (kBlockSizeInBits == 512) { + return AreAllBlockBloomBitsSetAvx2(hash, set_idx, hash_set_size); + } else { + return AreAllBlockBloomBitsSetNonAvx2(hash, set_idx, hash_set_size); + } +#else + return AreAllBlockBloomBitsSetNonAvx2(hash, set_idx, hash_set_size); +#endif +} + +#ifdef HAVE_AVX2 +const __m256i mask_vec = _mm256_set1_epi32(0x007FC000); +const __m256i max_bitpos_vec = _mm256_set1_epi32(7); +const __m256i fast_range_vec = _mm256_set1_epi32(KNumBitsInBlockBloom); +const __m256i num_idx_bits_vec = _mm256_set1_epi32(kInBatchIdxNumBits); + +// Powers of 32-bit golden ratio, mod 2**32. +const __m256i multipliers = + _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, + 0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749); + +bool ReadBlock::AreAllBlockBloomBitsSetAvx2(uint32_t hash, uint32_t set_idx, + size_t hash_set_size) const { + assert(kBlockSizeInBytes == 64U); + + int rem_probes = static_cast(hash_set_size); + + // NOTE: This code is an adaptation of the equivalent code for RocksDB's + // bloom filter testing code using AVX2. + // See bloom_impl.h for more details + + for (;;) { + // Eight copies of hash + __m256i hash_vector = _mm256_set1_epi32(hash); + + // Same effect as repeated multiplication by 0x9e3779b9 thanks to + // associativity of multiplication. 
+ hash_vector = _mm256_mullo_epi32(hash_vector, multipliers); + + __m256i orig_hash_vector = hash_vector; + + if (set_idx == 0) { + // hash >> 23 + hash_vector = _mm256_srli_epi32(hash_vector, 23); + } else { + // hash & mask (0x007FC000) + hash_vector = _mm256_and_si256(hash_vector, mask_vec); + + // hash >> 14 + hash_vector = _mm256_srli_epi32(hash_vector, 14); + } + + // // Find the bit positions that are < 7 + __m256i smaller_than_7_vec = + _mm256_cmpgt_epi32(max_bitpos_vec, hash_vector); + + if (_mm256_testz_si256(smaller_than_7_vec, smaller_than_7_vec) == false) { + __m256i hash_vector_fast_range = orig_hash_vector; + + if (set_idx == 0) { + // << 9 + hash_vector_fast_range = _mm256_slli_epi32(orig_hash_vector, 9); + } + + // AVX2 code to calculate the equivalent of + // GetBitPosInBlockForHash1stPass() for up to 8 hashes + + // Shift right the hashes by kBlockSizeNumBits + hash_vector_fast_range = + _mm256_srli_epi32(hash_vector_fast_range, kBlockSizeNumBits); + + // Multiplying by 505 => The result (lower 32 bits will be in the range + // 0-504 (in the 9 MSB bits). + hash_vector_fast_range = + _mm256_mullo_epi32(hash_vector_fast_range, fast_range_vec); + hash_vector_fast_range = + _mm256_srli_epi32(hash_vector_fast_range, kNumBlockSizeBitsShiftBits); + + // Add 7 to get the final bit position in the range 7 - 511 (In the 9 MSB + // bits) + hash_vector_fast_range = + _mm256_add_epi32(hash_vector_fast_range, num_idx_bits_vec); + + hash_vector = _mm256_blendv_epi8(hash_vector, hash_vector_fast_range, + smaller_than_7_vec); + } + + hash_vector = _mm256_slli_epi32(hash_vector, kNumBlockSizeBitsShiftBits); + + auto [is_done, answer] = FastLocalBloomImpl::CheckBitsPositionsInBloomBlock( + rem_probes, hash_vector, block_address_); + if (is_done) { + return answer; + } + + // otherwise + // Need another iteration. 
0xab25f4c1 == golden ratio to the 8th power + hash *= 0xab25f4c1; + rem_probes -= 8; + } +} + +#endif // HAVE_AVX2 + +bool ReadBlock::AreAllBlockBloomBitsSetNonAvx2(uint32_t hash, uint32_t set_idx, + size_t hash_set_size) const { + for (auto i = 0U; i < hash_set_size; ++i) { + int bitpos = GetBitPosInBlockForHash(hash, set_idx); + if ((block_address_[bitpos >> 3] & + (char{1} << (bitpos & kInBatchIdxNumBits))) == 0) { + return false; + } + hash *= 0x9e3779b9; + } + return true; +} + +} // Unnamed namespace + +// ================================================================================================== +namespace speedb_filter { + +void FilterMetadata::WriteMetadata(char* metadata, [[maybe_unused]] size_t len, + const Fields& fields) { + assert(len == kMetadataLen); + + // Init the metadata to all Zeros + std::memset(metadata, 0x0, kMetadataLen); + + metadata[0] = static_cast(speedb_filter::FilterType::kPairedBlockBloom); + + assert(fields.num_probes <= 30U); + metadata[1] = static_cast(fields.num_probes); + // rest of metadata stays zero +} + +auto FilterMetadata::ReadMetadata(const char* metadata) -> Fields { + char filter_type = *metadata; + char block_and_probes = *(metadata + 1); + + // TODO: Avoid the use of magic numbers + size_t num_probes = (block_and_probes & 0x1F); + if (num_probes < 1 || num_probes > 30) { + // Reserved / future safe + return {num_probes, FilterType::kFutureUnknown}; + } + + uint16_t rest = DecodeFixed16(metadata + 2); + if (rest != 0) { + // Reserved, possibly for hash seed + // Future safe + return {num_probes, FilterType::kFutureUnknown}; + } + + if (speedb_filter::FilterType(filter_type) == + speedb_filter::FilterType::kPairedBlockBloom) { // FastLocalBloom + // TODO: Avoid the use of magic numbers + auto log2_block_bytes = ((block_and_probes >> 5) & 7); + if (log2_block_bytes == 0U) { // Only block size supported for now + return {num_probes, FilterType::kPairedBlockBloom}; + } + } + + return {num_probes, FilterType::kFutureUnknown}; +} + +} // namespace speedb_filter + +// ================================================================================================== +SpdbPairedBloomBitsBuilder::SpdbPairedBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance, + const std::shared_ptr& cache_res_mgr, + bool detect_filter_construct_corruption, + const FilterBitsReaderCreateFunc& reader_create_func) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, + std::move(cache_res_mgr), + detect_filter_construct_corruption), + millibits_per_key_(millibits_per_key), + reader_create_func_(reader_create_func) { + assert(millibits_per_key >= speedb_filter::kMinMillibitsPerKey); +} + +void SpdbPairedBloomBitsBuilder::InitVars(uint64_t len_no_metadata) { + assert((len_no_metadata % kBatchSizeInBytes) == 0U); + num_blocks_ = len_no_metadata / kBlockSizeInBytes; + num_blocks_ = std::max(num_blocks_, + speedb_filter::kPairedBloomBatchSizeInBlocks); + // num_blocks must be event and a multiple of the batch size + assert(num_blocks_ > 0U); + assert(num_blocks_ % 2 == 0); + assert(num_blocks_ % speedb_filter::kPairedBloomBatchSizeInBlocks == 0); + + num_batches_ = num_blocks_ / speedb_filter::kPairedBloomBatchSizeInBlocks; + // There must be at least 1 batch + assert(num_batches_ > 0U); + + pairing_table_.resize(num_batches_); + AddCacheReservation(num_batches_ * + sizeof(decltype(pairing_table_)::value_type)); + + num_probes_ = CalcNumProbes(millibits_per_key_); +} + +Slice SpdbPairedBloomBitsBuilder::Finish(std::unique_ptr* buf, 
+ Status* status) { + const size_t num_entries = hash_entries_info_.entries.size(); + size_t len_with_metadata = CalculateSpace(num_entries); + + std::unique_ptr mutable_buf; + std::unique_ptr + final_filter_cache_res_handle; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + assert(mutable_buf); + assert(len_with_metadata >= speedb_filter::FilterMetadata::kMetadataLen); + // Max size supported by implementation + assert(len_with_metadata <= kMaxSupportLenWithMetadata); + + // Cache reservation for mutable_buf + if (cache_res_mgr_) { + Status s = cache_res_mgr_->MakeCacheReservation( + len_with_metadata * sizeof(char), &final_filter_cache_res_handle); + s.PermitUncheckedError(); + } + + uint32_t len_no_metadata = static_cast( + len_with_metadata - speedb_filter::FilterMetadata::kMetadataLen); + InitVars(len_no_metadata); + + if (len_no_metadata > 0) { + TEST_SYNC_POINT_CALLBACK( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries", + &hash_entries_info_.entries); + AddAllEntries(mutable_buf.get(), len_no_metadata); + Status verify_hash_entries_checksum_status = + MaybeVerifyHashEntriesChecksum(); + if (!verify_hash_entries_checksum_status.ok()) { + if (status) { + *status = verify_hash_entries_checksum_status; + } + return FinishAlwaysTrue(buf); + } + } + + bool keep_entries_for_postverify = detect_filter_construct_corruption_; + if (!keep_entries_for_postverify) { + ResetEntries(); + } + + speedb_filter::FilterMetadata::Fields metadata_fields{ + num_probes_, speedb_filter::FilterType::kPairedBlockBloom}; + speedb_filter::FilterMetadata::WriteMetadata( + &mutable_buf[len_no_metadata], + speedb_filter::FilterMetadata::kMetadataLen, metadata_fields); + + auto TEST_arg_pair __attribute__((__unused__)) = + std::make_pair(&mutable_buf, len_with_metadata); + TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter", + &TEST_arg_pair); + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + if (status) { + *status = Status::OK(); + } + return rv; +} + +size_t SpdbPairedBloomBitsBuilder::ApproximateNumEntries( + size_t len_with_metadata) { + size_t len_no_meta = + len_with_metadata >= speedb_filter::FilterMetadata::kMetadataLen + ? 
RoundDownUsableSpace(len_with_metadata) - + speedb_filter::FilterMetadata::kMetadataLen + : 0; + return static_cast(kNumMillibitsInByte * len_no_meta / + millibits_per_key_); +} + +size_t SpdbPairedBloomBitsBuilder::CalculateSpace(size_t num_entries) { + size_t len_without_metadata = + num_entries * millibits_per_key_ / kNumMillibitsInByte; + // Make sure we have enough space for at least 1 batch + len_without_metadata = + std::max(len_without_metadata, kBatchSizeInBytes); + return RoundDownUsableSpace(len_without_metadata + + speedb_filter::FilterMetadata::kMetadataLen); +} + +size_t SpdbPairedBloomBitsBuilder::GetNumProbes() { + return CalcNumProbes(millibits_per_key_); +} + +double SpdbPairedBloomBitsBuilder::EstimatedFpRate( + size_t /*num_entries*/, size_t /*len_with_metadata*/) { + auto raw_num_probes = CalcRawNumProbes(millibits_per_key_); + + double adjusted_bits_per_key = CalcAdjustedBitsPerKey(millibits_per_key_); + return SpdbStandardFpRate(adjusted_bits_per_key, raw_num_probes); +} + +size_t SpdbPairedBloomBitsBuilder::RoundDownUsableSpace(size_t available_size) { + size_t rv = available_size - speedb_filter::FilterMetadata::kMetadataLen; + + if (rv >= kMaxSupportedSizeNoMetadata) { + // Max supported for this data structure implementation + rv = kMaxSupportedSizeNoMetadata; + } + + // round down to multiple of a Batch + rv = std::max((rv / kBatchSizeInBytes) * kBatchSizeInBytes, + kBatchSizeInBytes); + + return rv + speedb_filter::FilterMetadata::kMetadataLen; +} + +FilterBitsReader* SpdbPairedBloomBitsBuilder::GetBitsReader( + const Slice& filter_content) { + assert(reader_create_func_ != nullptr); + return reader_create_func_ ? reader_create_func_(filter_content) : nullptr; +} + +void SpdbPairedBloomBitsBuilder::InitBlockHistogram() { + blocks_histogram_.resize(num_batches_); + AddCacheReservation(num_batches_ * + sizeof(decltype(blocks_histogram_)::value_type)); + + for (auto batch_idx = 0U; batch_idx < blocks_histogram_.size(); ++batch_idx) { + for (auto in_batch_block_idx = 0U; + in_batch_block_idx < blocks_histogram_[batch_idx].size(); + ++in_batch_block_idx) { + blocks_histogram_[batch_idx][in_batch_block_idx] + .original_in_batch_block_idx = in_batch_block_idx; + } + } +} + +void SpdbPairedBloomBitsBuilder::BuildBlocksHistogram(uint32_t data_len_bytes) { + for (const auto& hash : hash_entries_info_.entries) { + const uint32_t global_block_idx = + HashToGlobalBlockIdx(Lower32of64(hash), data_len_bytes); + const uint8_t in_batch_block_idx = GetInBatchBlockIdx(global_block_idx); + const uint32_t batch_idx = GetContainingBatchIdx(global_block_idx); + + ++blocks_histogram_[batch_idx][in_batch_block_idx].num_keys; + } +} + +void SpdbPairedBloomBitsBuilder::SortBatchBlocks(uint32_t batch_idx) { + assert(batch_idx < num_batches_); + BatchBlocksHistogram& batch_blocks_histrogram = blocks_histogram_[batch_idx]; + std::stable_sort(batch_blocks_histrogram.begin(), + batch_blocks_histrogram.end()); +} + +void SpdbPairedBloomBitsBuilder::PairBatchBlocks(uint32_t batch_idx) { + assert(batch_idx < num_batches_); + BatchBlocksHistogram& batch_blocks_histrogram = blocks_histogram_[batch_idx]; + auto& batch_pairing_info = pairing_table_[batch_idx]; + + for (auto in_batch_block_idx = 0U; + in_batch_block_idx < speedb_filter::kPairedBloomBatchSizeInBlocks; + ++in_batch_block_idx) { + const auto pair_in_batch_block_idx = + batch_blocks_histrogram.size() - in_batch_block_idx - 1; + auto original_in_batch_block_idx = + batch_blocks_histrogram[in_batch_block_idx].original_in_batch_block_idx; + 
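+    // After the ascending stable sort by key count, sorted position i is
+    // paired with sorted position (size - 1 - i): the most loaded block in
+    // the batch is matched with the least loaded one, the second most loaded
+    // with the second least loaded, and so on. The pairing is recorded
+    // against each block's original (pre-sort) in-batch index, and the two
+    // blocks of a pair later use complementary hash-set selectors (0 and 1).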
+ batch_pairing_info[original_in_batch_block_idx].pair_in_batch_block_idx = + batch_blocks_histrogram[pair_in_batch_block_idx] + .original_in_batch_block_idx; + batch_pairing_info[original_in_batch_block_idx].hash_set_selector = + GetHashSetSelector(original_in_batch_block_idx, + batch_blocks_histrogram[pair_in_batch_block_idx] + .original_in_batch_block_idx); + } +} + +void SpdbPairedBloomBitsBuilder::PairBlocks() { + for (auto batch_idx = 0U; batch_idx < num_batches_; ++batch_idx) { + SortBatchBlocks(batch_idx); + PairBatchBlocks(batch_idx); + } +} + +void SpdbPairedBloomBitsBuilder::SetBlocksPairs(char* data) { + for (auto batch_idx = 0U; batch_idx < pairing_table_.size(); ++batch_idx) { + for (auto in_batch_block_idx = 0U; + in_batch_block_idx < speedb_filter::kPairedBloomBatchSizeInBlocks; + ++in_batch_block_idx) { + uint32_t global_block_idx = + batch_idx * speedb_filter::kPairedBloomBatchSizeInBlocks + + in_batch_block_idx; + BuildBlock block(data, global_block_idx, false /* prefetch */); + const uint32_t pair_in_batch_block_idx = + pairing_table_[batch_idx][in_batch_block_idx].pair_in_batch_block_idx; + block.SetInBatchBlockIdxOfPair(pair_in_batch_block_idx); + } + } +} + +// +// Build the blocks in similarly to how Rocksd does it +// The idea is to trigger blocks prefetching in batches, and access the +// prefetched blocks in batches. +void SpdbPairedBloomBitsBuilder::BuildBlocks(char* data, + uint32_t data_len_bytes) { + const size_t num_entries = hash_entries_info_.entries.size(); + constexpr size_t kBufferMask = 7; + static_assert(((kBufferMask + 1) & kBufferMask) == 0, + "Must be power of 2 minus 1"); + + constexpr auto kArraySize = kBufferMask + 1; + std::array primary_blocks; + std::array secondary_blocks; + std::array primary_hash_selectors; + std::array upper_32_bits_of_hashes; + + auto const hash_set_size = num_probes_ / 2; + + size_t i = 0; + std::deque::iterator hash_entries_it = + hash_entries_info_.entries.begin(); + + for (; i <= kBufferMask && i < num_entries; ++i) { + uint64_t hash = *hash_entries_it; + + // Primary Block + uint32_t primary_global_block_idx = + HashToGlobalBlockIdx(Lower32of64(hash), data_len_bytes); + new (&primary_blocks[i]) BuildBlock(data, primary_global_block_idx, true); + + const uint32_t batch_idx = GetContainingBatchIdx(primary_global_block_idx); + const uint8_t primary_in_batch_block_idx = + GetInBatchBlockIdx(primary_global_block_idx); + const uint32_t secondary_in_batch_block_idx = + pairing_table_[batch_idx][primary_in_batch_block_idx] + .pair_in_batch_block_idx; + + primary_hash_selectors[i] = GetHashSetSelector( + primary_in_batch_block_idx, secondary_in_batch_block_idx); + + const uint32_t secondary_global_block_idx = + GetFirstGlobalBlockIdxOfBatch(batch_idx) + secondary_in_batch_block_idx; + new (&secondary_blocks[i]) + BuildBlock(data, secondary_global_block_idx, true); + + upper_32_bits_of_hashes[i] = Upper32of64(hash); + ++hash_entries_it; + } + + // Process and buffer + for (; i < num_entries; ++i) { + auto idx = i & kBufferMask; + uint32_t& upper_32_bits_of_hash_ref = upper_32_bits_of_hashes[idx]; + auto& primary_block_ref = primary_blocks[idx]; + auto& secondary_block_ref = secondary_blocks[idx]; + auto& primary_hash_selector_ref = primary_hash_selectors[idx]; + + primary_block_ref.SetBlockBloomBits( + upper_32_bits_of_hash_ref, primary_hash_selector_ref, hash_set_size); + secondary_block_ref.SetBlockBloomBits(upper_32_bits_of_hash_ref, + 1 - primary_hash_selector_ref, + hash_set_size); + // And buffer + uint64_t hash = 
*hash_entries_it; + + // Primary Block + uint32_t primary_global_block_idx = + HashToGlobalBlockIdx(Lower32of64(hash), data_len_bytes); + new (&primary_block_ref) BuildBlock(data, primary_global_block_idx, true); + + const uint32_t batch_idx = GetContainingBatchIdx(primary_global_block_idx); + const uint8_t primary_in_batch_block_idx = + GetInBatchBlockIdx(primary_global_block_idx); + const uint32_t secondary_in_batch_block_idx = + pairing_table_[batch_idx][primary_in_batch_block_idx] + .pair_in_batch_block_idx; + primary_hash_selector_ref = GetHashSetSelector( + primary_in_batch_block_idx, secondary_in_batch_block_idx); + const uint32_t secondary_global_block_idx = + GetFirstGlobalBlockIdxOfBatch(batch_idx) + secondary_in_batch_block_idx; + new (&secondary_block_ref) + BuildBlock(data, secondary_global_block_idx, true); + + upper_32_bits_of_hash_ref = Upper32of64(hash); + ++hash_entries_it; + } + + // Finish processing + for (i = 0; i <= kBufferMask && i < num_entries; ++i) { + primary_blocks[i].SetBlockBloomBits( + upper_32_bits_of_hashes[i], primary_hash_selectors[i], hash_set_size); + secondary_blocks[i].SetBlockBloomBits(upper_32_bits_of_hashes[i], + 1 - primary_hash_selectors[i], + hash_set_size); + } +} + +void SpdbPairedBloomBitsBuilder::AddAllEntries(char* data, + uint32_t data_len_bytes) { + InitBlockHistogram(); + BuildBlocksHistogram(data_len_bytes); + PairBlocks(); + SetBlocksPairs(data); + BuildBlocks(data, data_len_bytes); + CleanupBuildData(); +} + +void SpdbPairedBloomBitsBuilder::CleanupBuildData() { + blocks_histogram_.clear(); + blocks_histogram_.shrink_to_fit(); + + pairing_table_.clear(); + pairing_table_.shrink_to_fit(); + + internal_cache_res_handles_.clear(); + internal_cache_res_handles_.shrink_to_fit(); +} + +void SpdbPairedBloomBitsBuilder::AddCacheReservation( + std::size_t incremental_memory_used) { + if (cache_res_mgr_) { + std::unique_ptr + filter_cache_res_handle; + Status s = cache_res_mgr_->MakeCacheReservation(incremental_memory_used, + &filter_cache_res_handle); + s.PermitUncheckedError(); + + internal_cache_res_handles_.push_back(std::move(filter_cache_res_handle)); + } +} + +// ======================================================================================================================= +bool SpdbPairedBloomBitsReader::HashMayMatch(const uint64_t hash) { + uint32_t primary_global_block_idx = + HashToGlobalBlockIdx(Lower32of64(hash), data_len_bytes_); + // Not prefetching as performance seems to improve + // TODO: Needs additional verification + ReadBlock primary_block(data_, primary_global_block_idx, true /* prefetch */); + + uint8_t primary_in_batch_block_idx = + GetInBatchBlockIdx(primary_global_block_idx); + uint8_t secondary_in_batch_block_idx = + primary_block.GetInBatchBlockIdxOfPair(); + auto primary_block_hash_selector = GetHashSetSelector( + primary_in_batch_block_idx, secondary_in_batch_block_idx); + + auto const hash_set_size = num_probes_ / 2; + + const uint32_t upper_32_bits_of_hash = Upper32of64(hash); + if (primary_block.AreAllBlockBloomBitsSet(upper_32_bits_of_hash, + primary_block_hash_selector, + hash_set_size) == false) { + return false; + } + + uint32_t secondary_block_hash_selector = 1 - primary_block_hash_selector; + uint32_t batch_idx = GetContainingBatchIdx(primary_global_block_idx); + uint32_t secondary_global_block_idx = + GetFirstGlobalBlockIdxOfBatch(batch_idx) + secondary_in_batch_block_idx; + + ReadBlock secondary_block(data_, secondary_global_block_idx, + true /* prefetch */); + return 
secondary_block.AreAllBlockBloomBitsSet( + upper_32_bits_of_hash, secondary_block_hash_selector, hash_set_size); +} + +bool SpdbPairedBloomBitsReader::MayMatch(const Slice& key) { + uint64_t hash = GetSliceHash64(key); + return HashMayMatch(hash); +} + +// TODO: COPY Rocksdb's approach for multi-keys to improve performance +// (prefetch blocks) +void SpdbPairedBloomBitsReader::MayMatch(int num_keys, Slice** keys, + bool* may_match) { + for (auto i = 0; i < num_keys; ++i) { + may_match[i] = MayMatch(*keys[i]); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/plugin/speedb/paired_filter/speedb_paired_bloom_internal.h b/plugin/speedb/paired_filter/speedb_paired_bloom_internal.h new file mode 100644 index 0000000000..15e0c36d49 --- /dev/null +++ b/plugin/speedb/paired_filter/speedb_paired_bloom_internal.h @@ -0,0 +1,202 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "table/block_based/filter_policy_internal.h" + +namespace ROCKSDB_NAMESPACE { + +namespace speedb_filter { +inline constexpr size_t kPairedBloomBatchSizeInBlocks = 128U; +// Max supported BPK. Filters using higher BPK-s will use the max +inline constexpr int kMinMillibitsPerKey = 1000.0; + +// Types of proprietary Speedb's filters +enum class FilterType : uint8_t { + kPairedBlockBloom = 1, + kFutureUnknown = 0xFF, // User to indicate an unrecognized filter type from a + // future version +}; + +// Bloom Filter's data provided by Speedb: +// 0 |-----------------------------------| +// | Raw Paired Bloom filter data | +// | ... | +// len |-----------------------------------| +// | bytes Spdb Filter Types | +// | 1: SpdbPairedBloom | +// | other: reserved | +// len+1 |-----------------------------------| +// | byte for block_and_probes | +// | 0 in top 3 bits -> 6 -> 64-byte | +// | reserved: | +// | 1 in top 3 bits -> 7 -> 128-byte| +// | 2 in top 3 bits -> 8 -> 256-byte| +// | ... | +// | num_probes in bottom 5 bits, | +// | except 0 and 31 reserved | +// len+2 |-----------------------------------| +// | two bytes reserved | +// | possibly for hash seed | +// len_with_meta |-----------------------------------| +class FilterMetadata { + public: + // Metadata trailer size for Speedb's filters. (This is separate from + // block-based table block trailer). Starting at len in the diagram above + static constexpr uint32_t kMetadataLen = 4U; + + struct Fields { + size_t num_probes; + FilterType filter_type; + }; + + public: + static void WriteMetadata(char* metadata, size_t len, const Fields& fields); + static Fields ReadMetadata(const char* metadata); +}; + +} // namespace speedb_filter + +// =========================================================================================================== +class SpdbPairedBloomBitsBuilder : public XXPH3FilterBitsBuilder { + public: + // Callback function to create a compatible reader. 
This is needed when + // performing post-verify during filter construction / filter block writing + // (See BlockBasedTableBuilder::WriteRawBlock() + using FilterBitsReaderCreateFunc = + std::function; + + public: + // Non-null aggregate_rounding_balance implies optimize_filters_for_memory + explicit SpdbPairedBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance, + const std::shared_ptr& cache_res_mgr, + bool detect_filter_construct_corruption, + const FilterBitsReaderCreateFunc& reader_create_func); + + ~SpdbPairedBloomBitsBuilder() override {} + + // No Copy allowed + SpdbPairedBloomBitsBuilder(const SpdbPairedBloomBitsBuilder&) = delete; + void operator=(const SpdbPairedBloomBitsBuilder&) = delete; + + protected: + size_t RoundDownUsableSpace(size_t available_size) override; + + FilterBitsReader* GetBitsReader(const Slice& filter_content) override; + + private: + // Stores the per-block information used to sort and pair blocks in the + // algorithm + struct BlockHistogramInfo { + // Number of keys mapped to this block + uint16_t num_keys = 0U; + + // Records the original in-batch block idx of the block before sorting + uint8_t original_in_batch_block_idx = std::numeric_limits::max(); + + // Allows block to be sorted using std sorting algorithms + bool operator<(const BlockHistogramInfo& other) const { + return (num_keys < other.num_keys); + } + }; + + // Records the info about a block's pair in the batch + struct PairingInfo { + uint8_t pair_in_batch_block_idx; + uint8_t hash_set_selector; + }; + + using BatchBlocksHistogram = + std::array; + using BatchPairingInfo = + std::array; + + public: + Slice Finish(std::unique_ptr* buf) override { + return Finish(buf, nullptr); + } + + Slice Finish(std::unique_ptr* buf, Status* status) override; + + size_t ApproximateNumEntries(size_t len_with_metadata) override; + size_t CalculateSpace(size_t num_entries) override; + double EstimatedFpRate(size_t /*num_entries*/, + size_t /*len_with_metadata*/) override; + + private: + size_t GetNumProbes(); + + void InitVars(uint64_t len_no_metadata); + void InitBlockHistogram(); + void BuildBlocksHistogram(uint32_t data_len_bytes); + void SortBatchBlocks(uint32_t batch_idx); + void PairBatchBlocks(uint32_t batch_idx); + void PairBlocks(); + void SetBlocksPairs(char* data); + void BuildBlocks(char* data, uint32_t data_len_bytes); + void CleanupBuildData(); + + void AddAllEntries(char* data, uint32_t data_len_bytes); + + void AddCacheReservation(std::size_t incremental_memory_used); + + private: + // Target allocation per added key, in thousandths of a bit. 
+ int millibits_per_key_; + + size_t num_blocks_ = 0U; + size_t num_batches_ = 0U; + size_t num_probes_ = 0U; + + std::vector blocks_histogram_; + std::vector pairing_table_; + + // For managing cache reservations needed for the building of the filter + std::vector> + internal_cache_res_handles_; + + FilterBitsReaderCreateFunc reader_create_func_; +}; + +class SpdbPairedBloomBitsReader : public BuiltinFilterBitsReader { + public: + SpdbPairedBloomBitsReader(const char* data, size_t num_probes, + uint32_t data_len_bytes) + : data_(data), num_probes_(num_probes), data_len_bytes_(data_len_bytes) {} + + ~SpdbPairedBloomBitsReader() override {} + + // No Copy allowed + SpdbPairedBloomBitsReader(const SpdbPairedBloomBitsReader&) = delete; + void operator=(const SpdbPairedBloomBitsReader&) = delete; + + bool HashMayMatch(const uint64_t /*hash*/) override; + bool MayMatch(const Slice& key) override; + void MayMatch(int num_keys, Slice** keys, bool* may_match) override; + + private: + const char* data_; + const size_t num_probes_; + const uint32_t data_len_bytes_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/plugin/speedb/speedb.mk b/plugin/speedb/speedb.mk new file mode 100644 index 0000000000..a19cfd6967 --- /dev/null +++ b/plugin/speedb/speedb.mk @@ -0,0 +1,29 @@ +# Copyright (C) 2022 Speedb Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +speedb_SOURCES = \ + speedb_registry.cc \ + memtable/hash_spd_rep.cc \ + paired_filter/speedb_paired_bloom.cc \ + paired_filter/speedb_paired_bloom_internal.cc \ + + +speedb_FUNC = register_SpeedbPlugins + +speedb_HEADERS = \ + paired_filter/speedb_paired_bloom.h \ + +speedb_TESTS = \ + speedb_customizable_test.cc \ + paired_filter/speedb_db_bloom_filter_test.cc \ diff --git a/plugin/speedb/speedb_customizable_test.cc b/plugin/speedb/speedb_customizable_test.cc new file mode 100644 index 0000000000..48d7b420da --- /dev/null +++ b/plugin/speedb/speedb_customizable_test.cc @@ -0,0 +1,117 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
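+//
+// Usage sketch (illustrative; assumes register_SpeedbPlugins has been run so
+// the plugin's factories are known to the ObjectRegistry): once registered,
+// the paired bloom filter can also be created from a URI-style string whose
+// ":<bits_per_key>" suffix is extracted via ExtractBitsPerKeyFromUri, e.g.:
+//
+//   ROCKSDB_NAMESPACE::ConfigOptions config_options;
+//   std::shared_ptr<const ROCKSDB_NAMESPACE::FilterPolicy> policy;
+//   ROCKSDB_NAMESPACE::Status s =
+//       ROCKSDB_NAMESPACE::FilterPolicy::CreateFromString(
+//           config_options, "speedb.PairedBloomFilter:20", &policy);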
+ +#include +#include +#include +#include + +#include "db/db_test_util.h" +#include "plugin/speedb/paired_filter/speedb_paired_bloom.h" +#include "port/stack_trace.h" +#include "rocksdb/customizable.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/filter_policy_internal.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +#ifdef GFLAGS +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { + +class LoadCustomizableTest : public testing::Test { + public: + LoadCustomizableTest() { + config_options_.ignore_unsupported_options = false; + config_options_.invoke_prepare_options = false; + } + bool RegisterTests(const std::string& arg) { + (void)arg; + return false; + } + + protected: + DBOptions db_opts_; + ColumnFamilyOptions cf_opts_; + ConfigOptions config_options_; +}; + +// ========================================================================================== +TEST_F(LoadCustomizableTest, LoadSpdbPairedFilterPolicyTest) { + std::shared_ptr table; + std::shared_ptr result; + ASSERT_NOK(FilterPolicy::CreateFromString( + config_options_, SpdbPairedBloomFilterPolicy::kClassName(), &result)); + + ASSERT_OK(FilterPolicy::CreateFromString(config_options_, "", &result)); + ASSERT_EQ(result, nullptr); + ASSERT_OK(FilterPolicy::CreateFromString( + config_options_, ReadOnlyBuiltinFilterPolicy::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), ReadOnlyBuiltinFilterPolicy::kClassName()); + +#ifndef ROCKSDB_LITE + std::string table_opts = "id=BlockBasedTable; filter_policy="; + ASSERT_OK(TableFactory::CreateFromString(config_options_, + table_opts + "nullptr", &table)); + ASSERT_NE(table.get(), nullptr); + auto bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->filter_policy.get(), nullptr); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, table_opts + ReadOnlyBuiltinFilterPolicy::kClassName(), + &table)); + bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_NE(bbto->filter_policy.get(), nullptr); + ASSERT_STREQ(bbto->filter_policy->Name(), + ReadOnlyBuiltinFilterPolicy::kClassName()); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, table_opts + SpdbPairedBloomFilterPolicy::kClassName(), + &table)); + bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->filter_policy.get(), nullptr); + if (RegisterTests("Test")) { + ASSERT_OK(FilterPolicy::CreateFromString( + config_options_, SpdbPairedBloomFilterPolicy::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), SpdbPairedBloomFilterPolicy::kClassName()); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, table_opts + SpdbPairedBloomFilterPolicy::kClassName(), + &table)); + bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_NE(bbto->filter_policy.get(), nullptr); + ASSERT_STREQ(bbto->filter_policy->Name(), + SpdbPairedBloomFilterPolicy::kClassName()); + } +#endif // ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff --git 
a/plugin/speedb/speedb_registry.cc b/plugin/speedb/speedb_registry.cc new file mode 100644 index 0000000000..fe4436d8f4 --- /dev/null +++ b/plugin/speedb/speedb_registry.cc @@ -0,0 +1,65 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "plugin/speedb/speedb_registry.h" + +#include "paired_filter/speedb_paired_bloom.h" +#include "plugin/speedb/memtable/hash_spd_rep.h" +#include "rocksdb/utilities/object_registry.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +// Similar to the NewBuiltinFilterPolicyWithBits template for RocksDB built-in +// filters +SpdbPairedBloomFilterPolicy* NewSpdbPairedBloomFilterWithBits( + const std::string& uri) { + return new SpdbPairedBloomFilterPolicy( + FilterPolicy::ExtractBitsPerKeyFromUri(uri)); +} + +int register_SpeedbPlugins(ObjectLibrary& library, const std::string&) { + library.AddFactory( + ObjectLibrary::PatternEntry(HashSpdRepFactory::kClassName(), true) + .AddNumber(":"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t buckets = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new HashSpdRepFactory(buckets)); + } else { + guard->reset(new HashSpdRepFactory()); + } + return guard->get(); + }); + + library.AddFactory( + ObjectLibrary::PatternEntry(SpdbPairedBloomFilterPolicy::kClassName(), + false) + .AnotherName(SpdbPairedBloomFilterPolicy::kNickName()) + .AddNumber(":", false), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(NewSpdbPairedBloomFilterWithBits(uri)); + return guard->get(); + }); + + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/plugin/speedb/speedb_registry.h b/plugin/speedb/speedb_registry.h new file mode 100644 index 0000000000..e5419d2b77 --- /dev/null +++ b/plugin/speedb/speedb_registry.h @@ -0,0 +1,29 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
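+//
+// Build/registration note (illustrative; assumes the stock RocksDB plugin
+// mechanism is used unchanged in this tree): plugin/speedb/speedb.mk names
+// register_SpeedbPlugins as the plugin's registration entry point via
+// speedb_FUNC, and a plugin directory like plugin/speedb/ is typically
+// compiled in by listing it in ROCKSDB_PLUGINS, e.g.:
+//
+//   ROCKSDB_PLUGINS="speedb" make -j"$(nproc)" static_lib
+//
+// Depending on the base RocksDB version, the CMake build may accept an
+// equivalent -DROCKSDB_PLUGINS="speedb" option.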
+ +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +// Forward Declarations +class ObjectLibrary; + +extern "C" { +int register_SpeedbPlugins(ROCKSDB_NAMESPACE::ObjectLibrary& library, + const std::string&); +} // extern "C" +} // namespace ROCKSDB_NAMESPACE diff --git a/python.mk b/python.mk deleted file mode 100644 index d92cdec47f..0000000000 --- a/python.mk +++ /dev/null @@ -1,9 +0,0 @@ -ifndef PYTHON - -# Default to python3. Some distros like CentOS 8 do not have `python`. -ifeq ($(origin PYTHON), undefined) - PYTHON := $(shell which python3 || which python || echo python3) -endif -export PYTHON - -endif diff --git a/speedb/version.h b/speedb/version.h new file mode 100644 index 0000000000..557664a8b7 --- /dev/null +++ b/speedb/version.h @@ -0,0 +1,29 @@ +// Copyright (C) 2022 Speedb Ltd. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#pragma once + +#define SPEEDB_MAJOR 2 +#define SPEEDB_MINOR 1 +#define SPEEDB_PATCH 0 + +namespace ROCKSDB_NAMESPACE { + +// Returns the current version of Speedb as a string (e.g. "1.5.0"). +// If with_patch is true, the patch is included (1.5.x). +// Otherwise, only major and minor version is included (1.5) +std::string GetSpeedbVersionAsString(bool with_patch = true); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src.mk b/src.mk index 72c4d5f54e..75006196b7 100644 --- a/src.mk +++ b/src.mk @@ -1,4 +1,4 @@ -# These are the sources from which librocksdb.a is built: +# These are the sources from which libspeedb.a is built: LIB_SOURCES = \ cache/cache.cc \ cache/cache_entry_roles.cc \ diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 2a2258a40b..9329b7c336 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -716,6 +716,19 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n", table_options_.pin_top_level_index_and_filter); ret.append(buffer); + ret.append(" metadata_cache_options:\n"); + snprintf(buffer, kBufferSize, " top_level_index_pinning: %d\n", + static_cast( + table_options_.metadata_cache_options.top_level_index_pinning)); + ret.append(buffer); + snprintf(buffer, kBufferSize, " partition_pinning: %d\n", + static_cast( + table_options_.metadata_cache_options.partition_pinning)); + ret.append(buffer); + snprintf(buffer, kBufferSize, " unpartitioned_pinning: %d\n", + static_cast( + table_options_.metadata_cache_options.unpartitioned_pinning)); + ret.append(buffer); snprintf(buffer, kBufferSize, " index_type: %d\n", table_options_.index_type); ret.append(buffer); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 99bb020011..02b79e9463 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -616,7 +616,7 @@ Status 
BlockBasedTable::Open( if (!IsSupportedFormatVersion(footer.format_version())) { return Status::Corruption( "Unknown Footer version. Maybe this file was created with newer " - "version of RocksDB?"); + "version of Speedb?"); } BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 0671fc09f1..c9c656c3e0 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -666,7 +666,8 @@ struct BlockBasedTable::Rep { fpb->reset(new FilePrefetchBuffer( readahead_size, max_readahead_size, !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */, - implicit_auto_readahead, async_io, ioptions.fs.get())); + implicit_auto_readahead, async_io, ioptions.fs.get(), ioptions.clock, + ioptions.stats)); } void CreateFilePrefetchBufferIfNotExists( diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index d7a3105a3d..0c7fc37332 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -5,6 +5,7 @@ #include "table/block_based/block_based_table_reader.h" +#include #include #include @@ -330,9 +331,9 @@ class BlockBasedTableReaderCapMemoryTest CacheEntryRole::kBlockBasedTableReader>:: GetDummyEntrySize() == 0 && - cache_capacity > 2 * CacheReservationManagerImpl< - CacheEntryRole::kBlockBasedTableReader>:: - GetDummyEntrySize()); + cache_capacity >= 2 * CacheReservationManagerImpl< + CacheEntryRole::kBlockBasedTableReader>:: + GetDummyEntrySize()); // We need to subtract 1 for max_num_dummy_entry to account for dummy // entries' overhead, assumed the overhead is no greater than 1 dummy entry @@ -347,11 +348,11 @@ class BlockBasedTableReaderCapMemoryTest max_num_dummy_entry * CacheReservationManagerImpl< CacheEntryRole::kBlockBasedTableReader>::GetDummyEntrySize(); - std::size_t max_table_reader_num = static_cast( + std::size_t max_table_reader_num_capped = static_cast( std::floor(1.0 * cache_capacity_rounded_to_dummy_entry_multiples / approx_table_reader_mem)); - return max_table_reader_num; + return max_table_reader_num_capped; } void SetUp() override { @@ -361,7 +362,7 @@ class BlockBasedTableReaderCapMemoryTest compression_type_ = CompressionType::kNoCompression; table_reader_res_only_cache_.reset(new BlockBasedTableReaderResOnlyCache( - NewLRUCache(6 * CacheReservationManagerImpl< + NewLRUCache(4 * CacheReservationManagerImpl< CacheEntryRole::kBlockBasedTableReader>:: GetDummyEntrySize(), 0 /* num_shard_bits */, true /* strict_capacity_limit */))); @@ -420,22 +421,44 @@ class BlockBasedTableReaderCapMemoryTest }; INSTANTIATE_TEST_CASE_P(CapMemoryUsageUnderCacheCapacity, - BlockBasedTableReaderCapMemoryTest, ::testing::Bool()); + BlockBasedTableReaderCapMemoryTest, + ::testing::Values(true, false)); TEST_P(BlockBasedTableReaderCapMemoryTest, CapMemoryUsageUnderCacheCapacity) { - const std::size_t max_table_reader_num = BlockBasedTableReaderCapMemoryTest:: - CalculateMaxTableReaderNumBeforeCacheFull( - table_reader_res_only_cache_->GetCapacity(), - approx_table_reader_mem_); + const std::size_t max_table_reader_num_capped = + BlockBasedTableReaderCapMemoryTest:: + CalculateMaxTableReaderNumBeforeCacheFull( + table_reader_res_only_cache_->GetCapacity(), + approx_table_reader_mem_); + + // Acceptable estimtation errors coming from + // 1. 
overstimate max_table_reader_num_capped due to # dummy entries is high + // and results in metadata charge overhead greater than 1 dummy entry size + // (violating our assumption in calculating max_table_reader_num_capped) + // 2. overestimate/underestimate max_table_reader_num_capped due to the gap + // between ApproximateTableReaderMem() and actual table reader mem + std::size_t max_table_reader_num_capped_upper_bound = + (std::size_t)(max_table_reader_num_capped * 1.01); + std::size_t max_table_reader_num_capped_lower_bound = + (std::size_t)(max_table_reader_num_capped * 0.99); + std::size_t max_table_reader_num_uncapped = + (std::size_t)(max_table_reader_num_capped * 1.1); + ASSERT_GT(max_table_reader_num_uncapped, + max_table_reader_num_capped_upper_bound) + << "We need `max_table_reader_num_uncapped` > " + "`max_table_reader_num_capped_upper_bound` to differentiate cases " + "between " + "reserve_table_reader_memory_ == false and == true)"; Status s = Status::OK(); std::size_t opened_table_reader_num = 0; std::string table_name; std::vector> tables; - // Keep creating BlockBasedTableReader till hiting the memory limit based on - // cache capacity and creation fails or reaching a big number of table readers - while (s.ok() && opened_table_reader_num < 2 * max_table_reader_num) { + // cache capacity and creation fails (when reserve_table_reader_memory_ == + // true) or reaching a specfied big number of table readers (when + // reserve_table_reader_memory_ == false) + while (s.ok() && opened_table_reader_num < max_table_reader_num_uncapped) { table_name = "table_" + std::to_string(opened_table_reader_num); CreateTable(table_name, compression_type_, kv_); tables.push_back(std::unique_ptr()); @@ -449,23 +472,14 @@ TEST_P(BlockBasedTableReaderCapMemoryTest, CapMemoryUsageUnderCacheCapacity) { } if (reserve_table_reader_memory_) { - EXPECT_TRUE(s.IsMemoryLimit() && - opened_table_reader_num < 2 * max_table_reader_num) - << "s: " << s.ToString() << " opened_table_reader_num: " - << std::to_string(opened_table_reader_num); + EXPECT_TRUE(s.IsMemoryLimit()) << "s: " << s.ToString(); EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") != std::string::npos); - // Acceptable estimtation errors coming from - // 1. overstimate max_table_reader_num due to # dummy entries is high and - // results in metadata charge overhead greater than 1 dummy entry size - // (violating our assumption in calculating max_table_reader_nums) - // 2. 
overestimate/underestimate max_table_reader_num due to the gap between - // ApproximateTableReaderMem() and actual table reader mem - EXPECT_GE(opened_table_reader_num, max_table_reader_num * 0.99); - EXPECT_LE(opened_table_reader_num, max_table_reader_num * 1.01); + EXPECT_GE(opened_table_reader_num, max_table_reader_num_capped_lower_bound); + EXPECT_LE(opened_table_reader_num, max_table_reader_num_capped_upper_bound); - std::size_t updated_max_table_reader_num = + std::size_t updated_max_table_reader_num_capped = BlockBasedTableReaderCapMemoryTest:: CalculateMaxTableReaderNumBeforeCacheFull( table_reader_res_only_cache_->GetCapacity() / 2, @@ -473,7 +487,7 @@ TEST_P(BlockBasedTableReaderCapMemoryTest, CapMemoryUsageUnderCacheCapacity) { // Keep deleting BlockBasedTableReader to lower down memory usage from the // memory limit to make the next creation succeeds - while (opened_table_reader_num >= updated_max_table_reader_num) { + while (opened_table_reader_num >= updated_max_table_reader_num_capped) { tables.pop_back(); --opened_table_reader_num; } @@ -489,7 +503,8 @@ TEST_P(BlockBasedTableReaderCapMemoryTest, CapMemoryUsageUnderCacheCapacity) { tables.clear(); EXPECT_EQ(table_reader_res_only_cache_->GetPinnedUsage(), 0); } else { - EXPECT_TRUE(s.ok() && opened_table_reader_num == 2 * max_table_reader_num) + EXPECT_TRUE(s.ok() && + opened_table_reader_num == max_table_reader_num_uncapped) << "s: " << s.ToString() << " opened_table_reader_num: " << std::to_string(opened_table_reader_num); EXPECT_EQ(table_reader_res_only_cache_->GetPinnedUsage(), 0); diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 6ac4b91422..a200c4b6b1 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -12,12 +12,10 @@ #include #include #include -#include #include #include #include "cache/cache_entry_roles.h" -#include "cache/cache_reservation_manager.h" #include "logging/logging.h" #include "port/lang.h" #include "rocksdb/convenience.h" @@ -54,83 +52,67 @@ Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) { return Slice(nullptr, 0); } -Slice FinishAlwaysTrue(std::unique_ptr* /*buf*/) { - return Slice("\0\0\0\0\0\0", 6); -} +} // namespace + +// Number of hash entries to accumulate before charging their memory usage to +// the cache when cache reservation is available +const std::size_t XXPH3FilterBitsBuilder::kUint64tHashEntryCacheResBucketSize = + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / + sizeof(uint64_t); // Base class for filter builders using the XXH3 preview hash, // also known as Hash64 or GetSliceHash64. -class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { - public: - explicit XXPH3FilterBitsBuilder( - std::atomic* aggregate_rounding_balance, - std::shared_ptr cache_res_mgr, - bool detect_filter_construct_corruption) - : aggregate_rounding_balance_(aggregate_rounding_balance), - cache_res_mgr_(cache_res_mgr), - detect_filter_construct_corruption_( - detect_filter_construct_corruption) {} - - ~XXPH3FilterBitsBuilder() override {} - - virtual void AddKey(const Slice& key) override { - uint64_t hash = GetSliceHash64(key); - // Especially with prefixes, it is common to have repetition, - // though only adjacent repetition, which we want to immediately - // recognize and collapse for estimating true filter space - // requirements. 
- if (hash_entries_info_.entries.empty() || - hash != hash_entries_info_.entries.back()) { - if (detect_filter_construct_corruption_) { - hash_entries_info_.xor_checksum ^= hash; - } - hash_entries_info_.entries.push_back(hash); - if (cache_res_mgr_ && - // Traditional rounding to whole bucket size - ((hash_entries_info_.entries.size() % - kUint64tHashEntryCacheResBucketSize) == - kUint64tHashEntryCacheResBucketSize / 2)) { - hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); - Status s = cache_res_mgr_->MakeCacheReservation( - kUint64tHashEntryCacheResBucketSize * sizeof(hash), - &hash_entries_info_.cache_res_bucket_handles.back()); - s.PermitUncheckedError(); - } +XXPH3FilterBitsBuilder::XXPH3FilterBitsBuilder( + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption) + : aggregate_rounding_balance_(aggregate_rounding_balance), + cache_res_mgr_(cache_res_mgr), + detect_filter_construct_corruption_(detect_filter_construct_corruption) {} + +void XXPH3FilterBitsBuilder::AddKey(const Slice& key) { + uint64_t hash = GetSliceHash64(key); + // Especially with prefixes, it is common to have repetition, + // though only adjacent repetition, which we want to immediately + // recognize and collapse for estimating true filter space + // requirements. + if (hash_entries_info_.entries.empty() || + hash != hash_entries_info_.entries.back()) { + if (detect_filter_construct_corruption_) { + hash_entries_info_.xor_checksum ^= hash; + } + hash_entries_info_.entries.push_back(hash); + if (cache_res_mgr_ && + // Traditional rounding to whole bucket size + ((hash_entries_info_.entries.size() % + kUint64tHashEntryCacheResBucketSize) == + kUint64tHashEntryCacheResBucketSize / 2)) { + hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); + Status s = cache_res_mgr_->MakeCacheReservation( + kUint64tHashEntryCacheResBucketSize * sizeof(hash), + &hash_entries_info_.cache_res_bucket_handles.back()); + s.PermitUncheckedError(); } } +} - virtual size_t EstimateEntriesAdded() override { - return hash_entries_info_.entries.size(); - } - - virtual Status MaybePostVerify(const Slice& filter_content) override; - - protected: - static constexpr uint32_t kMetadataLen = 5; - - // Number of hash entries to accumulate before charging their memory usage to - // the cache when cache reservation is available - static const std::size_t kUint64tHashEntryCacheResBucketSize = - CacheReservationManagerImpl< - CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / - sizeof(uint64_t); +size_t XXPH3FilterBitsBuilder::EstimateEntriesAdded() { + return hash_entries_info_.entries.size(); +} // For delegating between XXPH3FilterBitsBuilders - void SwapEntriesWith(XXPH3FilterBitsBuilder* other) { - assert(other != nullptr); - hash_entries_info_.Swap(&(other->hash_entries_info_)); - } - - void ResetEntries() { hash_entries_info_.Reset(); } - - virtual size_t RoundDownUsableSpace(size_t available_size) = 0; +void XXPH3FilterBitsBuilder::SwapEntriesWith(XXPH3FilterBitsBuilder* other) { + assert(other != nullptr); + hash_entries_info_.Swap(&(other->hash_entries_info_)); +} // To choose size using malloc_usable_size, we have to actually allocate. 
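A short aside on the reservation bucketing in AddKey() above: kUint64tHashEntryCacheResBucketSize is defined earlier in this file as GetDummyEntrySize() / sizeof(uint64_t), and a fresh reservation is made whenever the number of accumulated hash entries crosses the middle of a bucket. Assuming the default 256 KiB dummy-entry size of CacheReservationManagerImpl (an assumption, not stated in this patch), that works out to 32768 hash values per bucket. A self-contained sketch of the arithmetic:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed default of CacheReservationManagerImpl<...>::GetDummyEntrySize().
  const std::size_t kDummyEntrySize = 256 * 1024;
  const std::size_t kBucket = kDummyEntrySize / sizeof(uint64_t);  // 32768
  // AddKey() calls MakeCacheReservation(kBucket * sizeof(uint64_t)) each time
  // entries.size() % kBucket == kBucket / 2, i.e. at 16384, 49152, 81920, ...
  // entries, so the charged amount stays roughly centred on the deque's actual
  // memory use ("traditional rounding to whole bucket size").
  std::printf("bucket = %zu hash entries, reservation = %zu bytes\n", kBucket,
              kBucket * sizeof(uint64_t));
  return 0;
}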
- size_t AllocateMaybeRounding(size_t target_len_with_metadata, - size_t num_entries, - std::unique_ptr* buf) { - // Return value set to a default; overwritten in some cases - size_t rv = target_len_with_metadata; +size_t XXPH3FilterBitsBuilder::AllocateMaybeRounding( + size_t target_len_with_metadata, size_t num_entries, + std::unique_ptr* buf) { + // Return value set to a default; overwritten in some cases + size_t rv = target_len_with_metadata; #ifdef ROCKSDB_MALLOC_USABLE_SIZE if (aggregate_rounding_balance_ != nullptr) { // Do optimize_filters_for_memory, using malloc_usable_size. @@ -221,7 +203,7 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { buf->reset(new char[rv]()); #endif // ROCKSDB_MALLOC_USABLE_SIZE return rv; - } +} // TODO: Ideally we want to verify the hash entry // as it is added to the filter and eliminate this function @@ -230,73 +212,25 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { // Possible solution: // pass a custom iterator that tracks the xor checksum as // it iterates to ResetAndFindSeedToSolve - Status MaybeVerifyHashEntriesChecksum() { - if (!detect_filter_construct_corruption_) { - return Status::OK(); - } - - uint64_t actual_hash_entries_xor_checksum = 0; - for (uint64_t h : hash_entries_info_.entries) { - actual_hash_entries_xor_checksum ^= h; - } - - if (actual_hash_entries_xor_checksum == hash_entries_info_.xor_checksum) { - return Status::OK(); - } else { - // Since these hash entries are corrupted and they will not be used - // anymore, we can reset them and release memory. - ResetEntries(); - return Status::Corruption("Filter's hash entries checksum mismatched"); - } +Status XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum() { + if (!detect_filter_construct_corruption_) { + return Status::OK(); } - // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, - // always "round up" like historic behavior. - std::atomic* aggregate_rounding_balance_; - - // For reserving memory used in (new) Bloom and Ribbon Filter construction - std::shared_ptr cache_res_mgr_; - - // For managing cache reservation for final filter in (new) Bloom and Ribbon - // Filter construction - std::deque> - final_filter_cache_res_handles_; - - bool detect_filter_construct_corruption_; - - struct HashEntriesInfo { - // A deque avoids unnecessary copying of already-saved values - // and has near-minimal peak memory use. - std::deque entries; - - // If cache_res_mgr_ != nullptr, - // it manages cache reservation for buckets of hash entries in (new) Bloom - // or Ribbon Filter construction. - // Otherwise, it is empty. - std::deque> - cache_res_bucket_handles; - - // If detect_filter_construct_corruption_ == true, - // it records the xor checksum of hash entries. - // Otherwise, it is 0. 
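MaybeVerifyHashEntriesChecksum(), whose new out-of-line definition starts above and continues below, simply recomputes the XOR of all stored hash entries and compares it against the checksum that AddKey() maintained incrementally; a bit flip on either side makes the two values disagree (multiple flips can in principle cancel out, which is why this is a best-effort check). A tiny standalone illustration of that property, not part of the patch:

#include <cassert>
#include <cstdint>
#include <deque>

int main() {
  std::deque<uint64_t> entries{0x1111, 0x2222, 0x3333};
  uint64_t running = 0;
  for (uint64_t h : entries) {
    running ^= h;  // what AddKey() does as each hash is appended
  }
  uint64_t recomputed = 0;
  for (uint64_t h : entries) recomputed ^= h;
  assert(recomputed == running);  // intact entries: checksums agree

  entries[1] ^= 0x4;  // simulate a single-bit memory corruption
  recomputed = 0;
  for (uint64_t h : entries) recomputed ^= h;
  assert(recomputed != running);  // mismatch -> Status::Corruption in the real code
  return 0;
}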
- uint64_t xor_checksum = 0; - - void Swap(HashEntriesInfo* other) { - assert(other != nullptr); - std::swap(entries, other->entries); - std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); - std::swap(xor_checksum, other->xor_checksum); - } - - void Reset() { - entries.clear(); - cache_res_bucket_handles.clear(); - xor_checksum = 0; - } - }; + uint64_t actual_hash_entries_xor_checksum = 0; + for (uint64_t h : hash_entries_info_.entries) { + actual_hash_entries_xor_checksum ^= h; + } - HashEntriesInfo hash_entries_info_; -}; + if (actual_hash_entries_xor_checksum == hash_entries_info_.xor_checksum) { + return Status::OK(); + } else { + // Since these hash entries are corrupted and they will not be used + // anymore, we can reset them and release memory. + ResetEntries(); + return Status::Corruption("Filter's hash entries checksum mismatched"); + } +} // #################### FastLocalBloom implementation ################## // // ############## also known as format_version=5 Bloom filter ########## // @@ -1260,21 +1194,10 @@ class LegacyBloomBitsReader : public BuiltinFilterBitsReader { const uint32_t log2_cache_line_size_; }; -class AlwaysTrueFilter : public BuiltinFilterBitsReader { - public: - bool MayMatch(const Slice&) override { return true; } - using FilterBitsReader::MayMatch; // inherit overload - bool HashMayMatch(const uint64_t) override { return true; } - using BuiltinFilterBitsReader::HashMayMatch; // inherit overload -}; - -class AlwaysFalseFilter : public BuiltinFilterBitsReader { - public: - bool MayMatch(const Slice&) override { return false; } - using FilterBitsReader::MayMatch; // inherit overload - bool HashMayMatch(const uint64_t) override { return false; } - using BuiltinFilterBitsReader::HashMayMatch; // inherit overload -}; +FilterBitsReader* XXPH3FilterBitsBuilder::GetBitsReader( + const Slice& filter_content) { + return BuiltinFilterPolicy::GetBuiltinFilterBitsReader(filter_content); +} Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) { Status s = Status::OK(); @@ -1283,8 +1206,7 @@ Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) { return s; } - std::unique_ptr bits_reader( - BuiltinFilterPolicy::GetBuiltinFilterBitsReader(filter_content)); + std::unique_ptr bits_reader(GetBitsReader(filter_content)); for (uint64_t h : hash_entries_info_.entries) { // The current approach will not detect corruption from XXPH3Filter to @@ -1301,7 +1223,6 @@ Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) { ResetEntries(); return s; } -} // namespace const char* BuiltinFilterPolicy::kClassName() { return "rocksdb.internal.BuiltinFilter"; @@ -1380,7 +1301,7 @@ const char* DeprecatedBlockBasedBloomFilterPolicy::kClassName() { } std::string BloomLikeFilterPolicy::GetId() const { - return Name() + GetBitsPerKeySuffix(); + return Name() + GetBitsPerKeySuffix(millibits_per_key_); } DeprecatedBlockBasedBloomFilterPolicy::DeprecatedBlockBasedBloomFilterPolicy( @@ -1541,9 +1462,9 @@ BloomLikeFilterPolicy::GetStandard128RibbonBuilderWithContext( context.info_log); } -std::string BloomLikeFilterPolicy::GetBitsPerKeySuffix() const { - std::string rv = ":" + ROCKSDB_NAMESPACE::ToString(millibits_per_key_ / 1000); - int frac = millibits_per_key_ % 1000; +std::string BloomLikeFilterPolicy::GetBitsPerKeySuffix(int millibits_per_key) { + std::string rv = ":" + ROCKSDB_NAMESPACE::ToString(millibits_per_key / 1000); + int frac = millibits_per_key % 1000; if (frac > 0) { rv.push_back('.'); 
rv.push_back(static_cast('0' + (frac / 100))); @@ -1884,9 +1805,7 @@ static ObjectLibrary::PatternEntry FilterPatternEntryWithBits( template T* NewBuiltinFilterPolicyWithBits(const std::string& uri) { - const std::vector vals = StringSplit(uri, ':'); - double bits_per_key = ParseDouble(vals[1]); - return new T(bits_per_key); + return new T(FilterPolicy::ExtractBitsPerKeyFromUri(uri)); } static int RegisterBuiltinFilterPolicies(ObjectLibrary& library, const std::string& /*arg*/) { @@ -1995,6 +1914,11 @@ static int RegisterBuiltinFilterPolicies(ObjectLibrary& library, } // namespace #endif // ROCKSDB_LITE +double FilterPolicy::ExtractBitsPerKeyFromUri(const std::string& uri) { + const std::vector vals = StringSplit(uri, ':'); + return ParseDouble(vals[1]); +} + Status FilterPolicy::CreateFromString( const ConfigOptions& options, const std::string& value, std::shared_ptr* policy) { @@ -2046,4 +1970,14 @@ const std::vector& BloomLikeFilterPolicy::GetAllFixedImpls() { return impls; } +int BloomLikeFilterPolicy::GetAllFixedImplIndex(const std::string& name) { + const auto& all_names = GetAllFixedImpls(); + for (size_t idx = 0; idx < all_names.size(); idx++) { + if (name == all_names[idx]) { + return static_cast(idx); + } + } + return -1; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index 06566f8712..c474d81442 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -9,10 +9,12 @@ #pragma once #include +#include #include #include #include +#include "cache/cache_reservation_manager.h" #include "rocksdb/filter_policy.h" #include "rocksdb/table.h" @@ -95,6 +97,8 @@ class FilterBitsReader { may_match[i] = MayMatch(*keys[i]); } } + + virtual bool HashMayMatch(const uint64_t /* h */) = 0; }; // Exposes any extra information needed for testing built-in @@ -115,12 +119,102 @@ class BuiltinFilterBitsBuilder : public FilterBitsBuilder { virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; }; +class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit XXPH3FilterBitsBuilder( + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption); + + ~XXPH3FilterBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override; + virtual size_t EstimateEntriesAdded() override; + virtual Status MaybePostVerify(const Slice& filter_content) override; + + protected: + static constexpr uint32_t kMetadataLen = 5; + + // Number of hash entries to accumulate before charging their memory usage to + // the cache when cache reservation is available + static const std::size_t kUint64tHashEntryCacheResBucketSize; + + // For delegating between XXPH3FilterBitsBuilders + void SwapEntriesWith(XXPH3FilterBitsBuilder* other); + void ResetEntries() { hash_entries_info_.Reset(); } + + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; + + // To choose size using malloc_usable_size, we have to actually allocate. + size_t AllocateMaybeRounding(size_t target_len_with_metadata, + size_t num_entries, + std::unique_ptr* buf); + + // TODO: Ideally we want to verify the hash entry + // as it is added to the filter and eliminate this function + // for speeding up and leaving fewer spaces for undetected memory/CPU + // corruption. For Ribbon Filter, it's bit harder. 
+ // Possible solution: + // pass a custom iterator that tracks the xor checksum as + // it iterates to ResetAndFindSeedToSolve + Status MaybeVerifyHashEntriesChecksum(); + + virtual FilterBitsReader* GetBitsReader(const Slice& filter_content); + + // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, + // always "round up" like historic behavior. + std::atomic* aggregate_rounding_balance_; + + // For reserving memory used in (new) Bloom and Ribbon Filter construction + std::shared_ptr cache_res_mgr_; + + // For managing cache reservation for final filter in (new) Bloom and Ribbon + // Filter construction + std::deque> + final_filter_cache_res_handles_; + + bool detect_filter_construct_corruption_; + + struct HashEntriesInfo { + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque entries; + + // If cache_res_mgr_ != nullptr, + // it manages cache reservation for buckets of hash entries in (new) Bloom + // or Ribbon Filter construction. + // Otherwise, it is empty. + std::deque> + cache_res_bucket_handles; + + // If detect_filter_construct_corruption_ == true, + // it records the xor checksum of hash entries. + // Otherwise, it is 0. + uint64_t xor_checksum = 0; + + void Swap(HashEntriesInfo* other) { + assert(other != nullptr); + std::swap(entries, other->entries); + std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); + std::swap(xor_checksum, other->xor_checksum); + } + + void Reset() { + entries.clear(); + cache_res_bucket_handles.clear(); + xor_checksum = 0; + } + }; + + HashEntriesInfo hash_entries_info_; +}; + // Base class for RocksDB built-in filter reader with // extra useful functionalities for inernal. class BuiltinFilterBitsReader : public FilterBitsReader { public: // Check if the hash of the entry match the bits in filter - virtual bool HashMayMatch(const uint64_t /* h */) { return true; } + bool HashMayMatch(const uint64_t /* h */) override { return true; } }; // Base class for RocksDB built-in filter policies. This provides the @@ -195,6 +289,8 @@ class BloomLikeFilterPolicy : public BuiltinFilterPolicy { std::string GetId() const override; + static std::string GetBitsPerKeySuffix(int millibits_per_key); + // Essentially for testing only: configured millibits/key int GetMillibitsPerKey() const { return millibits_per_key_; } // Essentially for testing only: legacy whole bits/key @@ -205,6 +301,9 @@ class BloomLikeFilterPolicy : public BuiltinFilterPolicy { // "always use this implementation." Only appropriate for unit tests. static const std::vector& GetAllFixedImpls(); + // Returns the index in GetAllFixedImpls of "name" if found, -1 if not + static int GetAllFixedImplIndex(const std::string& name); + // Convenience function for creating by name for fixed impls static std::shared_ptr Create(const std::string& name, double bits_per_key); @@ -218,8 +317,6 @@ class BloomLikeFilterPolicy : public BuiltinFilterPolicy { FilterBitsBuilder* GetStandard128RibbonBuilderWithContext( const FilterBuildingContext& context) const; - std::string GetBitsPerKeySuffix() const; - private: // Bits per key settings are for configuring Bloom filters. 
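The GetBitsPerKeySuffix() / ExtractBitsPerKeyFromUri() pair introduced in this change round-trips a policy id of the form "<name>:<bits_per_key>": 9500 millibits prints as ":9.5", and ParseDouble turns "9.5" back into 9.5 bits per key. A hedged usage sketch follows; "bloomfilter" is assumed to be the registered builtin name (as in upstream RocksDB), and db_bench's new --filter_uri flag builds the same kind of string by appending --bloom_bits as ":<bits>".

#include <memory>

#include <rocksdb/convenience.h>
#include <rocksdb/filter_policy.h>

using namespace ROCKSDB_NAMESPACE;

Status MakeFractionalBloomPolicy(std::shared_ptr<const FilterPolicy>* policy) {
  ConfigOptions config_options;
  config_options.ignore_unsupported_options = false;
  // ExtractBitsPerKeyFromUri("bloomfilter:9.5") yields 9.5 bits per key
  // (9500 millibits), the same value GetBitsPerKeySuffix() would print back.
  return FilterPolicy::CreateFromString(config_options, "bloomfilter:9.5",
                                        policy);
}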
@@ -324,6 +421,26 @@ class DeprecatedBlockBasedBloomFilterPolicy : public BloomLikeFilterPolicy { static bool KeyMayMatch(const Slice& key, const Slice& bloom_filter); }; +class AlwaysTrueFilter : public BuiltinFilterBitsReader { + public: + bool MayMatch(const Slice&) override { return true; } + using FilterBitsReader::MayMatch; // inherit overload + bool HashMayMatch(const uint64_t) override { return true; } + using BuiltinFilterBitsReader::HashMayMatch; // inherit overload +}; + +class AlwaysFalseFilter : public BuiltinFilterBitsReader { + public: + bool MayMatch(const Slice&) override { return false; } + using FilterBitsReader::MayMatch; // inherit overload + bool HashMayMatch(const uint64_t) override { return false; } + using BuiltinFilterBitsReader::HashMayMatch; // inherit overload +}; + +inline Slice FinishAlwaysTrue(std::unique_ptr* /*buf*/) { + return Slice("\0\0\0\0\0\0", 6); +} + // For testing only, but always constructable with internal names namespace test { diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 24d870d4cd..4926606623 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -67,6 +67,10 @@ class TestFilterBitsReader : public FilterBitsReader { using FilterBitsReader::MayMatch; bool MayMatch(const Slice& entry) override { uint32_t h = Hash(entry.data(), entry.size(), 1); + return HashMayMatch(h); + } + + bool HashMayMatch(const uint64_t h) override { for (size_t i = 0; i + 4 <= len_; i += 4) { if (h == DecodeFixed32(data_ + i)) { return true; diff --git a/test_util/testutil.h b/test_util/testutil.h index 712862f2e4..1fc454dcd8 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -819,11 +819,6 @@ bool IsPrefetchSupported(const std::shared_ptr& fs, // Return the number of lines where a given pattern was found in a file. size_t GetLinesCount(const std::string& fname, const std::string& pattern); -// TEST_TMPDIR may be set to /dev/shm in Makefile, -// but /dev/shm does not support direct IO. -// Tries to set TEST_TMPDIR to a directory supporting direct IO. -void ResetTmpDirForDirectIO(); - Status CorruptFile(Env* env, const std::string& fname, int offset, int bytes_to_corrupt, bool verify_checksum = true); Status TruncateFile(Env* env, const std::string& fname, uint64_t length); diff --git a/third-party/.clang-format b/third-party/.clang-format new file mode 100644 index 0000000000..37f3d57668 --- /dev/null +++ b/third-party/.clang-format @@ -0,0 +1 @@ +DisableFormat: true \ No newline at end of file diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc index 9f2b3d5653..b19c9f2a81 100644 --- a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc +++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc @@ -8676,7 +8676,7 @@ static void StackLowerThanAddress(const void* ptr, bool* result) { // Make sure AddressSanitizer does not tamper with the stack here. 
GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ static bool StackGrowsDown() { - int dummy; + int dummy = 0; bool result; StackLowerThanAddress(&dummy, &result); return result; diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest.h b/third-party/gtest-1.8.1/fused-src/gtest/gtest.h index 2d82d8e4d0..56f1a43152 100644 --- a/third-party/gtest-1.8.1/fused-src/gtest/gtest.h +++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest.h @@ -3008,7 +3008,7 @@ class ThreadWithParam : public ThreadWithParamBase { } } - virtual void Run() { + virtual void Run() override { if (thread_can_start_ != NULL) thread_can_start_->WaitForNotification(); func_(param_); @@ -3192,7 +3192,7 @@ class ThreadWithParam : public ThreadWithParamBase { param_(param) { } virtual ~RunnableImpl() {} - virtual void Run() { + virtual void Run() override { func_(param_); } @@ -9202,7 +9202,7 @@ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ public:\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ private:\ - virtual void TestBody();\ + virtual void TestBody() override;\ static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ GTEST_DISALLOW_COPY_AND_ASSIGN_(\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ @@ -11639,7 +11639,7 @@ class RangeGenerator : public ParamGeneratorInterface { virtual const ParamGeneratorInterface* BaseGenerator() const { return base_; } - virtual void Advance() { + virtual void Advance() override { value_ = static_cast(value_ + step_); index_++; } @@ -11726,7 +11726,7 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { virtual const ParamGeneratorInterface* BaseGenerator() const { return base_; } - virtual void Advance() { + virtual void Advance() override { ++iterator_; value_.reset(); } @@ -11952,7 +11952,7 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { // This method should not be called more then once on any single // instance of a ParameterizedTestCaseInfoBase derived class. // UnitTest has a guard to prevent from calling this method more then once. - virtual void RegisterTests() { + virtual void RegisterTests() override { for (typename TestInfoContainer::iterator test_it = tests_.begin(); test_it != tests_.end(); ++test_it) { linked_ptr test_info = *test_it; @@ -15740,7 +15740,7 @@ class CartesianProductGenerator2 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. - virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current2_; if (current2_ == end2_) { @@ -15859,7 +15859,7 @@ class CartesianProductGenerator3 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. - virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current3_; if (current3_ == end3_) { @@ -15996,7 +15996,7 @@ class CartesianProductGenerator4 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. - virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current4_; if (current4_ == end4_) { @@ -16150,7 +16150,7 @@ class CartesianProductGenerator5 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current5_; if (current5_ == end5_) { @@ -16323,7 +16323,7 @@ class CartesianProductGenerator6 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. - virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current6_; if (current6_ == end6_) { @@ -16513,7 +16513,7 @@ class CartesianProductGenerator7 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. - virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current7_; if (current7_ == end7_) { @@ -16722,7 +16722,7 @@ class CartesianProductGenerator8 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. - virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current8_; if (current8_ == end8_) { @@ -16947,7 +16947,7 @@ class CartesianProductGenerator9 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. - virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current9_; if (current9_ == end9_) { @@ -17190,7 +17190,7 @@ class CartesianProductGenerator10 } // Advance should not be called on beyond-of-range iterators // so no component iterators must be beyond end of range, either. - virtual void Advance() { + virtual void Advance() override { assert(!AtEnd()); ++current10_; if (current10_ == end10_) { @@ -18873,7 +18873,7 @@ internal::CartesianProductHolder10parameterized_test_registry(). \ @@ -19157,7 +19157,7 @@ class GTEST_API_ HasNewFatalFailureHelper public: HasNewFatalFailureHelper(); virtual ~HasNewFatalFailureHelper(); - virtual void ReportTestPartResult(const TestPartResult& result); + virtual void ReportTestPartResult(const TestPartResult& result) override; bool has_new_fatal_failure() const { return has_new_fatal_failure_; } private: bool has_new_fatal_failure_; @@ -19377,7 +19377,7 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); private: \ typedef CaseName TestFixture; \ typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ + virtual void TestBody() override; \ }; \ static bool gtest_##CaseName##_##TestName##_registered_ \ GTEST_ATTRIBUTE_UNUSED_ = \ @@ -19439,7 +19439,7 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); private: \ typedef CaseName TestFixture; \ typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ + virtual void TestBody() override; \ }; \ static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ @@ -20867,21 +20867,21 @@ class TestEventListener { // above. 
class EmptyTestEventListener : public TestEventListener { public: - virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} + virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} virtual void OnTestIterationStart(const UnitTest& /*unit_test*/, - int /*iteration*/) {} - virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {} - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} - virtual void OnTestStart(const TestInfo& /*test_info*/) {} - virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {} - virtual void OnTestEnd(const TestInfo& /*test_info*/) {} - virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} - virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {} - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} + int /*iteration*/) override {} + virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {} + virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} + virtual void OnTestCaseStart(const TestCase& /*test_case*/) override {} + virtual void OnTestStart(const TestInfo& /*test_info*/) override {} + virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {} + virtual void OnTestEnd(const TestInfo& /*test_info*/) override {} + virtual void OnTestCaseEnd(const TestCase& /*test_case*/) override {} + virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {} + virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/, - int /*iteration*/) {} - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} + int /*iteration*/) override {} + virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} }; // TestEventListeners lets users add listeners to track events in Google Test. diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 19030e84b6..147282474e 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -18,6 +18,7 @@ if(WITH_TOOLS) dump/rocksdb_undump.cc) foreach(src ${TOOLS}) get_filename_component(exename ${src} NAME_WE) + string(REPLACE rocksdb speedb exename ${exename}) add_executable(${exename}${ARTIFACT_SUFFIX} ${src}) target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) diff --git a/tools/benchmark.sh b/tools/benchmark.sh index 2381be2ff1..8b21a83660 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -230,7 +230,7 @@ function summarize_result { # that "Compaction Stats" is written to stdout at least once. If it won't # happen then empty output from grep when searching for "Sum" will cause # syntax errors. - version=$( grep ^RocksDB: $test_out | awk '{ print $3 }' ) + version=$( grep ^Speedb: $test_out | awk '{ print $3 }' ) date=$( grep ^Date: $test_out | awk '{ print $6 "-" $3 "-" $4 "T" $5 ".000" }' ) iso_date=$( month_to_num $date ) tz=$( date "+%z" ) @@ -259,7 +259,7 @@ function summarize_result { # if the report TSV (Tab Separate Values) file does not yet exist, create it and write the header row to it if [ ! 
-f "$report" ]; then - echo -e "ops_sec\tmb_sec\ttotal_size_gb\tlevel0_size_gb\tsum_gb\twrite_amplification\twrite_mbps\tusec_op\tpercentile_50\tpercentile_75\tpercentile_99\tpercentile_99.9\tpercentile_99.99\tuptime\tstall_time\tstall_percent\ttest_name\ttest_date\trocksdb_version\tjob_id" \ + echo -e "ops_sec\tmb_sec\ttotal_size_gb\tlevel0_size_gb\tsum_gb\twrite_amplification\twrite_mbps\tusec_op\tpercentile_50\tpercentile_75\tpercentile_99\tpercentile_99.9\tpercentile_99.99\tuptime\tstall_time\tstall_percent\ttest_name\ttest_date\tversion\tjob_id" \ >> $report fi diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index 896a6ced16..bb275ff468 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -652,7 +652,7 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { /*is_block_cache_human_readable_trace=*/false, /*simulator=*/nullptr); // The analyzer ends when it detects an incomplete access record. - ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze()); + ASSERT_TRUE(analyzer.Analyze().IsIncomplete()); const uint64_t expected_num_cfs = 1; std::vector expected_fds{kSSTStoringOddKeys, kSSTStoringEvenKeys}; const std::vector expected_types{ diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 50c0a1b4c1..de65bb0a9a 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -39,9 +39,9 @@ tmp_origin=_tmp_origin set -e git remote remove $tmp_origin 2>/dev/null || true if [ "$USE_SSH" ]; then - git remote add $tmp_origin "git@github.com:facebook/rocksdb.git" + git remote add $tmp_origin "git@github.com:speedb-io/speedb.git" else - git remote add $tmp_origin "https://github.com/facebook/rocksdb.git" + git remote add $tmp_origin "https://github.com/speedb-io/speedb.git" fi git fetch $tmp_origin @@ -60,7 +60,7 @@ trap cleanup EXIT # Always clean up, even on failure or Ctrl+C scriptpath=`dirname ${BASH_SOURCE[0]}` -test_dir=${TEST_TMPDIR:-"/tmp"}"/rocksdb_format_compatible_$USER" +test_dir=${TEST_TMPDIR:-"/tmp"}"/speedb_format_compatible_$USER" rm -rf ${test_dir:?} # For saving current version of scripts as we checkout different versions to test diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index f356e08f43..f91b059bd4 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -71,6 +71,7 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/write_batch.h" +#include "speedb/version.h" #include "test_util/testutil.h" #include "test_util/transaction_test_util.h" #include "tools/simulated_hybrid_file_system.h" @@ -121,6 +122,7 @@ DEFINE_string( "newiterator," "newiteratorwhilewriting," "seekrandom," + "seekrandomwriterandom," "seekrandomwhilewriting," "seekrandomwhilemerging," "readseq," @@ -207,6 +209,8 @@ IF_ROCKSDB_LITE("", "\tnewiterator -- repeated iterator creation\n" "\tseekrandom -- N random seeks, call Next seek_nexts times " "per seek\n" + "\tseekrandomwriterandom -- N threads doing random overwrite and " + "random seek\n" "\tseekrandomwhilewriting -- seekrandom and 1 thread doing " "overwrite\n" "\tseekrandomwhilemerging -- seekrandom and 1 thread doing " @@ -633,6 +637,21 @@ DEFINE_bool( pin_top_level_index_and_filter, false, "Pin top-level index of partitioned index/filter blocks in block cache."); +DEFINE_bool( + top_level_index_pinning, false, + "Pin top-level block of 
partitioned index/filter blocks in block cache." + " Note: `cache_index_and_filter_blocks` must be true for this option to" + " have any effect."); + +DEFINE_bool(partition_pinning, false, + "Pin index/filter partitions in block cache."); + +DEFINE_bool( + unpartitioned_pinning, false, + "Pin unpartitioned index/filter blocks in block cache." + " Note `cache_index_and_filter_blocks` must be true for this option to have" + " any effect."); + DEFINE_int32(block_size, static_cast( ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size), @@ -707,9 +726,9 @@ DEFINE_int32(random_access_max_buffer_size, 1024 * 1024, DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024, "Maximum write buffer for Writable File"); -DEFINE_int32(bloom_bits, -1, - "Bloom filter bits per key. Negative means use default." - "Zero disables."); +DEFINE_double(bloom_bits, -1, + "Bloom filter bits per key. Negative means use default." + "Zero disables."); DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter"); @@ -881,7 +900,7 @@ DEFINE_bool(optimize_filters_for_hits, "level of the LSM to reduce metadata that should fit in RAM. "); DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks, - "RocksDB will aggressively check consistency of the data."); + "Aggressively checks for consistency of the data."); DEFINE_bool(force_consistency_checks, ROCKSDB_NAMESPACE::Options().force_consistency_checks, @@ -937,12 +956,12 @@ DEFINE_uint64(transaction_lock_timeout, 100, " milliseconds before failing a transaction waiting on a lock"); DEFINE_string( options_file, "", - "The path to a RocksDB options file. If specified, then db_bench will " - "run with the RocksDB options in the default column family of the " - "specified options file. " + "The path to an options file. If specified, then db_bench will " + "run with the options in the default column family of the specified " + "options file. " "Note that with this setting, db_bench will ONLY accept the following " - "RocksDB options related command-line arguments, all other arguments " - "that are related to RocksDB options will be ignored:\n" + "database options related command-line arguments, all other arguments " + "that are related to database options will be ignored:\n" "\t--use_existing_db\n" "\t--use_existing_keys\n" "\t--statistics\n" @@ -1061,7 +1080,7 @@ DEFINE_uint64(blob_compaction_readahead_size, // Secondary DB instance Options DEFINE_bool(use_secondary_db, false, - "Open a RocksDB secondary instance. A primary instance can be " + "Open a secondary database instance. A primary instance can be " "running in another db_bench process."); DEFINE_string(secondary_path, "", @@ -1122,7 +1141,7 @@ DEFINE_bool(rate_limit_auto_wal_flush, false, "false) after the user write operation."); DEFINE_bool(async_io, false, - "When set true, RocksDB does asynchronous reads for internal auto " + "When set true, asynchronous reads are used for internal auto " "readahead prefetching."); DEFINE_bool(reserve_table_reader_memory, false, @@ -1210,6 +1229,7 @@ static bool ValidateTableCacheNumshardbits(const char* flagname, return true; } DEFINE_int32(table_cache_numshardbits, 4, ""); +DEFINE_string(filter_uri, "", "URI for registry FilterPolicy"); #ifndef ROCKSDB_LITE DEFINE_string(env_uri, "", @@ -1335,10 +1355,9 @@ DEFINE_double(sine_d, 1, DEFINE_bool(rate_limit_bg_reads, false, "Use options.rate_limiter on compaction reads"); -DEFINE_uint64( - benchmark_write_rate_limit, 0, - "If non-zero, db_bench will rate-limit the writes going into RocksDB. 
This " - "is the global rate in bytes/second."); +DEFINE_uint64(benchmark_write_rate_limit, 0, + "If non-zero, db_bench will rate-limit the writes going into the " + "database. This is the global rate in bytes/second."); // the parameters of mix_graph DEFINE_double(keyrange_dist_a, 0.0, @@ -1403,7 +1422,7 @@ DEFINE_int64(mix_accesses, -1, DEFINE_uint64( benchmark_read_rate_limit, 0, - "If non-zero, db_bench will rate-limit the reads from RocksDB. This " + "If non-zero, db_bench will rate-limit the reads from the database. This " "is the global rate in ops/second."); DEFINE_uint64(max_compaction_bytes, @@ -1536,6 +1555,10 @@ DEFINE_bool(persist_stats_to_disk, DEFINE_uint64(stats_history_buffer_size, ROCKSDB_NAMESPACE::Options().stats_history_buffer_size, "Max number of stats snapshots to keep in memory"); +DEFINE_bool(avoid_unnecessary_blocking_io, + ROCKSDB_NAMESPACE::Options().avoid_unnecessary_blocking_io, + "If true, some expensive cleaning up operations will be moved from " + "user threads to background threads."); DEFINE_bool(avoid_flush_during_recovery, ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery, "If true, avoids flushing the recovered WAL data where possible."); @@ -1610,9 +1633,6 @@ static Status CreateMemTableRepFactory( #ifndef ROCKSDB_LITE } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) { factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count)); - } else if (!strcasecmp(FLAGS_memtablerep.c_str(), - VectorRepFactory::kNickName())) { - factory->reset(new VectorRepFactory()); } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) { factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count)); #endif // ROCKSDB_LITE @@ -2576,7 +2596,7 @@ class Benchmark { compressed); } - void PrintHeader(const Options& options) { + void PrintHeader() { PrintEnvironment(); fprintf(stdout, "Keys: %d bytes each (+ %d bytes user-defined timestamp)\n", @@ -2626,10 +2646,8 @@ class Benchmark { fprintf(stdout, "Compression: %s\n", compression.c_str()); fprintf(stdout, "Compression sampling rate: %" PRId64 "\n", FLAGS_sample_for_compression); - if (options.memtable_factory != nullptr) { - fprintf(stdout, "Memtablerep: %s\n", - options.memtable_factory->GetId().c_str()); - } + + fprintf(stdout, "Memtablerep: %s\n", FLAGS_memtablerep.c_str()); fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level); PrintWarnings(compression.c_str()); @@ -2684,8 +2702,8 @@ class Benchmark { #endif void PrintEnvironment() { - fprintf(stderr, "RocksDB: version %d.%d\n", - kMajorVersion, kMinorVersion); + fprintf(stderr, "Speedb: version %s\n", + GetSpeedbVersionAsString(false).c_str()); #if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__) time_t now = time(nullptr); @@ -2957,7 +2975,9 @@ class Benchmark { } void DeleteDBs() { - db_.DeleteDBs(); + if (db_.db != nullptr) { + db_.DeleteDBs(); + } for (const DBWithColumnFamilies& dbwcf : multi_dbs_) { delete dbwcf.db; } @@ -3106,7 +3126,7 @@ class Benchmark { ErrorExit(); } Open(&open_options_); - PrintHeader(open_options_); + PrintHeader(); std::stringstream benchmark_stream(FLAGS_benchmarks); std::string name; std::unique_ptr filter; @@ -3276,6 +3296,8 @@ class Benchmark { method = &Benchmark::IteratorCreationWhileWriting; } else if (name == "seekrandom") { method = &Benchmark::SeekRandom; + } else if (name == "seekrandomwriterandom") { + method = &Benchmark::SeekRandomWriteRandom; } else if (name == "seekrandomwhilewriting") { num_threads++; // Add extra thread for writing method = 
&Benchmark::SeekRandomWhileWriting; @@ -3429,7 +3451,7 @@ class Benchmark { } Options options = open_options_; for (size_t i = 0; i < multi_dbs_.size(); i++) { - delete multi_dbs_[i].db; + multi_dbs_[i].DeleteDBs(); if (!open_options_.wal_dir.empty()) { options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i); } @@ -3827,7 +3849,7 @@ class Benchmark { // options file. bool InitializeOptionsFromFile(Options* opts) { #ifndef ROCKSDB_LITE - printf("Initializing RocksDB Options from the specified file\n"); + printf("Initializing database Options from the specified file\n"); DBOptions db_opts; std::vector cf_descs; if (FLAGS_options_file != "") { @@ -3849,7 +3871,7 @@ class Benchmark { } void InitializeOptionsFromFlags(Options* opts) { - printf("Initializing RocksDB Options from command-line flags\n"); + printf("Initializing database Options from command-line flags\n"); Options& options = *opts; ConfigOptions config_options(options); config_options.ignore_unsupported_options = false; @@ -3941,12 +3963,13 @@ class Benchmark { "HashLinkedList memtablerep is used\n"); exit(1); } + if (FLAGS_use_plain_table) { #ifndef ROCKSDB_LITE if (!options.memtable_factory->IsInstanceOf("prefix_hash") && !options.memtable_factory->IsInstanceOf("hash_linkedlist")) { fprintf(stderr, "Warning: plain table is used with %s\n", - options.memtable_factory->Name()); + FLAGS_memtablerep.c_str()); } int bloom_bits_per_key = FLAGS_bloom_bits; @@ -4054,6 +4077,17 @@ class Benchmark { block_based_options.cache_index_and_filter_blocks_with_high_priority = true; } + + // Metadata Cache Options + block_based_options.metadata_cache_options.top_level_index_pinning = + FLAGS_top_level_index_pinning ? PinningTier::kAll + : PinningTier::kFallback; + block_based_options.metadata_cache_options.partition_pinning = + FLAGS_partition_pinning ? PinningTier::kAll : PinningTier::kFallback; + block_based_options.metadata_cache_options.unpartitioned_pinning = + FLAGS_unpartitioned_pinning ? 
PinningTier::kAll + : PinningTier::kFallback; + block_based_options.block_cache = cache_; block_based_options.reserve_table_reader_memory = FLAGS_reserve_table_reader_memory; @@ -4299,6 +4333,7 @@ class Benchmark { options.persist_stats_to_disk = FLAGS_persist_stats_to_disk; options.stats_history_buffer_size = static_cast(FLAGS_stats_history_buffer_size); + options.avoid_unnecessary_blocking_io = FLAGS_avoid_unnecessary_blocking_io; options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery; options.compression_opts.level = FLAGS_compression_level; @@ -4316,7 +4351,25 @@ class Benchmark { if (FLAGS_cache_size) { table_options->block_cache = cache_; } - if (FLAGS_bloom_bits < 0) { + std::string bits_str; + if (FLAGS_bloom_bits > 0) { + bits_str = ":" + ROCKSDB_NAMESPACE::ToString(FLAGS_bloom_bits); + fprintf(stderr, "note: appending --bloom-bits (%f) to --filter-uri\n", + FLAGS_bloom_bits); + } + if (!FLAGS_filter_uri.empty()) { + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + Status s = FilterPolicy::CreateFromString( + config_options, FLAGS_filter_uri + bits_str, + &table_options->filter_policy); + if (!s.ok()) { + fprintf(stderr, "failure creating filter policy[%s%s]: %s\n", + FLAGS_filter_uri.c_str(), bits_str.c_str(), + s.ToString().c_str()); + exit(1); + } + } else if (FLAGS_bloom_bits < 0) { table_options->filter_policy = BlockBasedTableOptions().filter_policy; } else if (FLAGS_bloom_bits == 0) { table_options->filter_policy.reset(); @@ -4332,10 +4385,12 @@ class Benchmark { s.ToString().c_str()); exit(1); } + } else if (FLAGS_use_ribbon_filter) { + table_options->filter_policy.reset( + NewRibbonFilterPolicy(FLAGS_bloom_bits)); } else { table_options->filter_policy.reset( - FLAGS_use_ribbon_filter ? NewRibbonFilterPolicy(FLAGS_bloom_bits) - : NewBloomFilterPolicy(FLAGS_bloom_bits)); + NewBloomFilterPolicy(FLAGS_bloom_bits)); } } if (FLAGS_row_cache_size) { @@ -4559,7 +4614,13 @@ class Benchmark { } #endif // ROCKSDB_LITE } else { - s = DB::Open(options, db_name, &db->db); + std::vector column_families; + column_families.push_back(ColumnFamilyDescriptor( + kDefaultColumnFamilyName, ColumnFamilyOptions(options))); + s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); + db->cfh.resize(1); + db->num_created = 1; + db->num_hot = 1; } if (FLAGS_report_open_timing) { std::cout << "OpenDb: " @@ -5444,6 +5505,10 @@ class Benchmark { thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } } @@ -5495,6 +5560,10 @@ class Benchmark { read % 256 == 255) { thread->shared->read_rate_limiter->Request( 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead); @@ -5536,6 +5605,10 @@ class Benchmark { thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. 
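Stepping back to the metadata-cache pinning wired up just above: the three new db_bench flags map one-to-one onto BlockBasedTableOptions::metadata_cache_options, choosing PinningTier::kAll when a flag is set and PinningTier::kFallback (the pre-existing pin_l0_filter_and_index_blocks_in_cache / pin_top_level_index_and_filter behaviour) otherwise. The top-level and unpartitioned tiers only matter when cache_index_and_filter_blocks is enabled, which is why the patch later adds ValidateMetadataCacheOptions(). A hedged sketch of the equivalent application-side configuration; the types are the public RocksDB/Speedb API and the particular tier choices are illustrative:

#include <rocksdb/table.h>

using namespace ROCKSDB_NAMESPACE;

BlockBasedTableOptions MakePinnedMetadataOptions() {
  BlockBasedTableOptions bbto;
  bbto.cache_index_and_filter_blocks = true;  // required for pinning to matter
  // kAll pins the blocks for the lifetime of the table reader; kFallback
  // defers to the older pinning booleans mentioned above.
  bbto.metadata_cache_options.top_level_index_pinning = PinningTier::kAll;
  bbto.metadata_cache_options.partition_pinning = PinningTier::kAll;
  bbto.metadata_cache_options.unpartitioned_pinning = PinningTier::kFallback;
  return bbto;
}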
+ thread->stats.ResetLastOpTime(); } } delete iter; @@ -5591,6 +5664,10 @@ class Benchmark { if (thread->shared->read_rate_limiter.get() != nullptr) { thread->shared->read_rate_limiter->Request( 100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } thread->stats.FinishedOps(nullptr, db, 100, kRead); @@ -5731,6 +5808,10 @@ class Benchmark { read % 256 == 255) { thread->shared->read_rate_limiter->Request( 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead); @@ -5837,6 +5918,10 @@ class Benchmark { thread->shared->read_rate_limiter->Request( 256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead); } @@ -6240,6 +6325,10 @@ class Benchmark { if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) { thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH, nullptr /*stats*/); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead); } else if (query_type == 1) { @@ -6265,6 +6354,10 @@ class Benchmark { if (thread->shared->write_rate_limiter && puts % 100 == 0) { thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH, nullptr /*stats*/); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite); } else if (query_type == 2) { @@ -6348,6 +6441,7 @@ class Benchmark { int64_t found = 0; int64_t bytes = 0; ReadOptions options = read_options_; + int64_t key_rand = 0; std::unique_ptr ts_guard; Slice ts; if (user_timestamp_size_ > 0) { @@ -6378,7 +6472,9 @@ class Benchmark { Duration duration(FLAGS_duration, reads_); char value_buffer[256]; while (!duration.Done(1)) { - int64_t seek_pos = thread->rand.Next() % FLAGS_num; + key_rand = GetRandomKey(&thread->rand); + DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(key_rand); + int64_t seek_pos = key_rand; GenerateKeyFromIntForSeek(static_cast(seek_pos), FLAGS_num, &key); if (FLAGS_max_scan_distance != 0) { @@ -6400,18 +6496,15 @@ class Benchmark { // Pick a Iterator to use uint64_t db_idx_to_use = (db_.db == nullptr) - ? (uint64_t{thread->rand.Next()} % multi_dbs_.size()) + ? 
(static_cast(key_rand) % multi_dbs_.size()) : 0; std::unique_ptr single_iter; Iterator* iter_to_use; if (FLAGS_use_tailing_iterator) { iter_to_use = tailing_iters[db_idx_to_use]; } else { - if (db_.db != nullptr) { - single_iter.reset(db_.db->NewIterator(options)); - } else { - single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options)); - } + single_iter.reset(db_with_cfh->db->NewIterator( + options, db_with_cfh->GetCfh(key_rand))); iter_to_use = single_iter.get(); } @@ -6440,6 +6533,10 @@ class Benchmark { read % 256 == 255) { thread->shared->read_rate_limiter->Request( 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } thread->stats.FinishedOps(&db_, db_.db, 1, kSeek); @@ -6618,6 +6715,10 @@ class Benchmark { write_rate_limiter->Request( key.size() + val.size(), Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kWrite); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } } thread->stats.AddBytes(bytes); @@ -6863,6 +6964,7 @@ class Benchmark { ReadOptions options = read_options_; RandomGenerator gen; std::string value; + int64_t key_rand = 0; int64_t found = 0; int get_weight = 0; int put_weight = 0; @@ -6880,8 +6982,10 @@ class Benchmark { // the number of iterations is the larger of read_ or write_ while (!duration.Done(1)) { - DB* db = SelectDB(thread); - GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + key_rand = GetRandomKey(&thread->rand); + DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(key_rand); + DB* db = db_with_cfh->db; + GenerateKeyFromInt(key_rand, FLAGS_num, &key); if (get_weight == 0 && put_weight == 0) { // one batch completed, reinitialize for next batch get_weight = FLAGS_readwritepercent; @@ -6895,7 +6999,7 @@ class Benchmark { ts_guard.get()); options.timestamp = &ts; } - Status s = db->Get(options, key, &value); + Status s = db->Get(options, db_with_cfh->GetCfh(key_rand), key, &value); if (!s.ok() && !s.IsNotFound()) { fprintf(stderr, "get error: %s\n", s.ToString().c_str()); // we continue after error rather than exiting so that we can @@ -6905,16 +7009,18 @@ class Benchmark { } get_weight--; reads_done++; - thread->stats.FinishedOps(nullptr, db, 1, kRead); + thread->stats.FinishedOps(db_with_cfh, db, 1, kRead); } else if (put_weight > 0) { // then do all the corresponding number of puts // for all the gets we have done earlier Status s; if (user_timestamp_size_ > 0) { Slice ts = mock_app_clock_->Allocate(ts_guard.get()); - s = db->Put(write_options_, key, ts, gen.Generate()); + s = db->Put(write_options_, db_with_cfh->GetCfh(key_rand), key, ts, + gen.Generate()); } else { - s = db->Put(write_options_, key, gen.Generate()); + s = db->Put(write_options_, db_with_cfh->GetCfh(key_rand), key, + gen.Generate()); } if (!s.ok()) { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); @@ -6922,7 +7028,7 @@ class Benchmark { } put_weight--; writes_done++; - thread->stats.FinishedOps(nullptr, db, 1, kWrite); + thread->stats.FinishedOps(db_with_cfh, db, 1, kWrite); } } char msg[100]; @@ -6932,6 +7038,162 @@ class Benchmark { thread->stats.AddMessage(msg); } + // Each thread does #iterations of either seek or write + // use readwritepercent to set ratio of seek/write + // number 
of iterations = duration ? duration : readwrites_ + // readwrites_ = max(reads_, writes) or num if zero. + // can pass: seek_nexts, reverse_iterator, max_scan_distance and + // use_tailing_iterator. seek was taken from SeekRandom and write from + // ReadRandomWriteRandom + void SeekRandomWriteRandom(ThreadState* thread) { + // Seek preparation + int64_t seeks = 0; + int64_t found = 0; + int64_t bytes = 0; + ReadOptions options(FLAGS_verify_checksum, true); + options.total_order_seek = FLAGS_total_order_seek; + options.prefix_same_as_start = FLAGS_prefix_same_as_start; + options.tailing = FLAGS_use_tailing_iterator; + options.readahead_size = FLAGS_readahead_size; + + std::unique_ptr single_iter; + std::vector> multi_iters; + if (db_.db != nullptr) { + single_iter.reset(db_.db->NewIterator(options)); + } else { + for (const auto& db_with_cfh : multi_dbs_) { + multi_iters.emplace_back(db_with_cfh.db->NewIterator(options)); + } + } + + std::unique_ptr upper_bound_key_guard; + Slice upper_bound = AllocateKey(&upper_bound_key_guard); + std::unique_ptr lower_bound_key_guard; + Slice lower_bound = AllocateKey(&lower_bound_key_guard); + + // Write preparation + RandomGenerator gen; + int64_t writes_done = 0; + Duration duration(FLAGS_duration, readwrites_); + + std::unique_ptr key_guard; + Slice key = AllocateKey(&key_guard); + + std::unique_ptr ts_guard; + if (user_timestamp_size_ > 0) { + ts_guard.reset(new char[user_timestamp_size_]); + } + + // the number of iterations is the larger of read_ or write_ + while (!duration.Done(1)) { + int prob_op = static_cast(thread->rand.Uniform(100)); + + // Seek + if (prob_op >= 0 && prob_op < static_cast(FLAGS_readwritepercent)) { + Slice ts; + if (user_timestamp_size_ > 0) { + ts = mock_app_clock_->GetTimestampForRead(thread->rand, + ts_guard.get()); + options.timestamp = &ts; + } + + int64_t seek_pos = thread->rand.Next() % FLAGS_num; + GenerateKeyFromIntForSeek(static_cast(seek_pos), FLAGS_num, + &key); + if (FLAGS_max_scan_distance != 0) { + if (FLAGS_reverse_iterator) { + GenerateKeyFromInt(static_cast(std::max( + static_cast(0), + seek_pos - FLAGS_max_scan_distance)), + FLAGS_num, &lower_bound); + options.iterate_lower_bound = &lower_bound; + } else { + auto min_num = + std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance); + GenerateKeyFromInt(static_cast(min_num), FLAGS_num, + &upper_bound); + options.iterate_upper_bound = &upper_bound; + } + } + + if (!FLAGS_use_tailing_iterator) { + if (db_.db != nullptr) { + single_iter.reset(db_.db->NewIterator(options)); + } else { + multi_iters.clear(); + for (const auto& db_with_cfh : multi_dbs_) { + multi_iters.emplace_back(db_with_cfh.db->NewIterator(options)); + } + } + } + + // Pick an Iterator to use + Iterator* iter_to_use = single_iter.get(); + if (iter_to_use == nullptr) { + iter_to_use = + multi_iters[thread->rand.Next() % multi_iters.size()].get(); + } + + iter_to_use->Seek(key); + seeks++; + if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { + found++; + } + + for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) { + // Copy out iterator's value to make sure we read them. 
+ bytes += iter_to_use->key().size() + iter_to_use->value().size(); + + if (!FLAGS_reverse_iterator) { + iter_to_use->Next(); + } else { + iter_to_use->Prev(); + } + assert(iter_to_use->status().ok()); + } + + if (thread->shared->read_rate_limiter.get() != nullptr && + seeks % 256 == 255) { + thread->shared->read_rate_limiter->Request( + 256, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kRead); + } + + thread->stats.FinishedOps(&db_, db_.db, 1, kSeek); + // Write + } else { + DB* db = SelectDB(thread); + GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); + + Status s; + if (user_timestamp_size_ > 0) { + Slice ts = mock_app_clock_->Allocate(ts_guard.get()); + s = db->Put(write_options_, key, ts, gen.Generate()); + } else { + s = db->Put(write_options_, key, gen.Generate()); + } + + if (!s.ok()) { + fprintf(stderr, "put error: %s\n", s.ToString().c_str()); + ErrorExit(); + } + writes_done++; + thread->stats.FinishedOps(nullptr, db, 1, kWrite); + } + } + + char msg[100]; + snprintf(msg, sizeof(msg), + "( seeks:%" PRIu64 " writes:%" PRIu64 " found:%" PRIu64 ")", seeks, + writes_done, found); + thread->stats.AddBytes(bytes); + thread->stats.AddMessage(msg); + if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) { + thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") + + get_perf_context()->ToString()); + } + } + // // Read-modify-write for random keys void UpdateRandom(ThreadState* thread) { @@ -6973,6 +7235,10 @@ class Benchmark { thread->shared->write_rate_limiter->Request( key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/, RateLimiter::OpType::kWrite); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } Slice val = gen.Generate(); @@ -7651,6 +7917,10 @@ class Benchmark { if (thread->shared->read_rate_limiter.get() != nullptr) { thread->shared->read_rate_limiter->Request( 1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. + thread->stats.ResetLastOpTime(); } } delete iter; @@ -7723,6 +7993,10 @@ class Benchmark { write_rate_limiter->Request( key.size() + val.size(), Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kWrite); + // Set time at which last op finished to Now() to hide latency and + // sleep from rate limiter. Also, do the check once per batch, not + // once per write. 
+ thread->stats.ResetLastOpTime();
 }
 }
 }
@@ -8055,6 +8329,24 @@ class Benchmark {
 #endif // ROCKSDB_LITE
 };
+void ValidateMetadataCacheOptions() {
+ if (FLAGS_top_level_index_pinning &&
+ (FLAGS_cache_index_and_filter_blocks == false)) {
+ fprintf(stderr,
+ "ERROR: --cache_index_and_filter_blocks must be set for "
+ "--top_level_index_pinning to have any effect.\n");
+ exit(1);
+ }
+
+ if (FLAGS_unpartitioned_pinning &&
+ (FLAGS_cache_index_and_filter_blocks == false)) {
+ fprintf(stderr,
+ "ERROR: --cache_index_and_filter_blocks must be set for "
+ "--unpartitioned_pinning to have any effect.\n");
+ exit(1);
+ }
+}
+
 int db_bench_tool(int argc, char** argv) {
 ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
 ConfigOptions config_options;
@@ -8208,6 +8500,8 @@ int db_bench_tool(int argc, char** argv) {
 exit(1);
 }
+ ValidateMetadataCacheOptions();
+
 ROCKSDB_NAMESPACE::Benchmark benchmark;
 benchmark.Run();
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 231f793728..4ddeb3decf 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -8,9 +8,11 @@
 import random
 import re
 import tempfile
+import signal
 import subprocess
 import shutil
 import argparse
+import datetime
 # params overwrite priority:
 # for default:
@@ -30,13 +32,23 @@
 # default_params < {blackbox,whitebox}_default_params < multiops_txn_params < args
+supplied_ops = {
+ "writepercent": -1,
+ "delpercent": -1,
+ "prefixpercent": -1,
+ "delrangepercent": -1,
+ "readpercent": -1,
+ "iterpercent": -1,
+ "customopspercent": -1,
+}
+
 default_params = {
 "acquire_snapshot_one_in": 10000,
 "backup_max_size": 100 * 1024 * 1024,
 # Consider larger number when backups considered more stable
 "backup_one_in": 100000,
 "batch_protection_bytes_per_key": lambda: random.choice([0, 8]),
- "block_size": 16384,
+ "block_size": random.choice([16384, 4096]),
 "bloom_bits": lambda: random.choice([random.randint(0,19),
 random.lognormvariate(2.3, 1.3)]),
 "cache_index_and_filter_blocks": lambda: random.randint(0, 1),
@@ -59,42 +71,37 @@
 "clear_column_family_one_in": 0,
 "compact_files_one_in": 1000000,
 "compact_range_one_in": 1000000,
- "delpercent": 4,
- "delrangepercent": 1,
 "destroy_db_initially": 0,
- "enable_pipelined_write": lambda: random.randint(0, 1),
+ "enable_pipelined_write": lambda: random.choice([0, 0, 0, 0, 1]),
 "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]),
 "expected_values_dir": lambda: setup_expected_values_dir(),
 "fail_if_options_file_error": lambda: random.randint(0, 1),
 "flush_one_in": 1000000,
 "file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]),
- "get_live_files_one_in": 1000000,
+ "get_live_files_one_in": 100000,
 # Note: the following two are intentionally disabled as the corresponding
 # APIs are not guaranteed to succeed.
 "get_sorted_wal_files_one_in": 0,
 "get_current_wal_file_one_in": 0,
 # Temporarily disable hash index
 "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]),
- "iterpercent": 10,
 "mark_for_compaction_one_file_in": lambda: 10 * random.randint(0, 1),
 "max_background_compactions": 20,
 "max_bytes_for_level_base": 10485760,
- "max_key": 25000000,
+ "max_key": random.choice([100 * 1024, 1024 * 1024, 10 * 1024 * 1024]),
 "max_write_buffer_number": 3,
 "mmap_read": lambda: random.randint(0, 1),
 # Setting `nooverwritepercent > 0` is only possible because we do not vary
- # the random seed, so the same keys are chosen by every run for disallowing
- # overwrites.
- "nooverwritepercent": 1, + # the random seed between runs, so the same keys are chosen by every run + # for disallowing overwrites. + "nooverwritepercent": random.choice([0, 5, 20, 30, 40, 50, 95]), "open_files": lambda : random.choice([-1, -1, 100, 500000]), "optimize_filters_for_memory": lambda: random.randint(0, 1), "partition_filters": lambda: random.randint(0, 1), "partition_pinning": lambda: random.randint(0, 3), "pause_background_one_in": 1000000, "prefix_size" : lambda: random.choice([-1, 1, 5, 7, 8]), - "prefixpercent": 5, "progress_reports": 0, - "readpercent": 45, "recycle_log_file_num": lambda: random.randint(0, 1), "snapshot_hold_ops": 100000, "sst_file_manager_bytes_per_sec": lambda: random.choice([0, 104857600]), @@ -103,23 +110,23 @@ "subcompactions": lambda: random.randint(1, 4), "target_file_size_base": 2097152, "target_file_size_multiplier": 2, - "test_batches_snapshots": lambda: random.randint(0, 1), + "test_batches_snapshots": random.choice([0, 0, 0, 1]), "top_level_index_pinning": lambda: random.randint(0, 3), "unpartitioned_pinning": lambda: random.randint(0, 3), "use_direct_reads": lambda: random.randint(0, 1), "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1), "mock_direct_io": False, "use_clock_cache": 0, # currently broken - "use_full_merge_v1": lambda: random.randint(0, 1), + "use_full_merge_v1": lambda: random.randrange(10) == 0, "use_merge": lambda: random.randint(0, 1), # 999 -> use Bloom API "ribbon_starting_level": lambda: random.choice([random.randint(-1, 10), 999]), "use_block_based_filter": lambda: random.randint(0, 1), "value_size_mult": 32, "verify_checksum": 1, - "write_buffer_size": 4 * 1024 * 1024, - "writepercent": 35, - "format_version": lambda: random.choice([2, 3, 4, 5, 5]), + "write_buffer_size": lambda: random.choice( + [1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024, 1024 * 1024 * 1024]), + "format_version": lambda: random.choice([2, 3, 4, 5, 5, 5, 5, 5, 5]), "index_block_restart_interval": lambda: random.choice(range(1, 16)), "use_multiget" : lambda: random.randint(0, 1), "periodic_compaction_seconds" : @@ -130,13 +137,12 @@ "max_manifest_file_size" : lambda : random.choice( [t * 16384 if t < 3 else 1024 * 1024 * 1024 for t in range(1, 30)]), # Sync mode might make test runs slower so running it in a smaller chance - "sync" : lambda : random.choice( - [1 if t == 0 else 0 for t in range(0, 20)]), + "sync" : lambda : random.randrange(20) == 0, # Disable compaction_readahead_size because the test is not passing. 
#"compaction_readahead_size" : lambda : random.choice( # [0, 0, 1024 * 1024]), "db_write_buffer_size" : lambda: random.choice( - [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024]), + [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024, 1024 * 1024 * 1024]), "avoid_unnecessary_blocking_io" : random.randint(0, 1), "write_dbid_to_manifest" : random.randint(0, 1), "avoid_flush_during_recovery" : lambda: random.choice( @@ -147,8 +153,8 @@ "verify_checksum_one_in": 1000000, "verify_db_one_in": 100000, "continuous_verification_interval" : 0, - "max_key_len": 3, - "key_len_percent_dist": "1,30,69", + "max_key_len": 0, + "key_len_percent_dist": "0", "read_fault_one_in": lambda: random.choice([0, 32, 1000]), "open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]), "open_write_fault_one_in": lambda: random.choice([0, 0, 16]), @@ -167,6 +173,18 @@ "adaptive_readahead": lambda: random.choice([0, 1]), "async_io": lambda: random.choice([0, 1]), "wal_compression": lambda: random.choice(["none", "zstd"]), + # cannot change seed between runs because the seed decides which keys are nonoverwrittenable + "seed": int(time.time() * 1000000) & 0xffffffff, + "verify_before_write": lambda: random.randrange(20) == 0, + "allow_concurrent_memtable_write": lambda: random.randint(0, 1), + # only done when thread#0 does TestAcquireSnapshot. + "compare_full_db_state_snapshot": lambda: random.choice([0, 0, 0, 1]), + "num_iterations": lambda: random.randint(0, 100), + "sync_wal_one_in": 100000, + "data_block_index_type": random.randint(0, 1), + "data_block_hash_table_util_ratio": random.randint(0, 100) / 100.0, + "customopspercent": 0, + "filter_uri": lambda: random.choice(["speedb.PairedBloomFilter", ""]), } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' @@ -189,6 +207,7 @@ def get_dbname(test_name): os.mkdir(dbname) return dbname + expected_values_dir = None def setup_expected_values_dir(): global expected_values_dir @@ -234,12 +253,40 @@ def is_direct_io_supported(dbname): return True +def generate_key_dist_and_len(params): + # check if user supplied key dist or len + if params["max_key_len"] == 0 and params["key_len_percent_dist"] != "0": + params["max_key_len"] = params["key_len_percent_dist"].count(",") + 1 + return + + if params["max_key_len"] == 0 and params["key_len_percent_dist"] == "0": + params["max_key_len"] = random.randint(1, 10) + + dist = random_distribution(params["max_key_len"] - 1) + params["key_len_percent_dist"] = ",".join(str(i) for i in dist) + + +# Randomly select unique points (cut_points) on the distribution range +# and set the distribution to the differences between these points. +# Inspired by the following post, with changes to disallow 0: +# https://math.stackexchange.com/questions/1276206/method-of-generating-random-numbers-that-sum-to-100-is-this-truly-random/1276225#1276225 +def random_distribution(cuts_count): + cut_points = set() + while len(cut_points) < cuts_count: + cut_points.add(random.randint(1, 100 - 1)) + dist = [] + for x in sorted(cut_points): + dist.append(x - sum(dist)) + dist.append(100 - sum(dist)) + return dist + + blackbox_default_params = { "disable_wal": lambda: random.choice([0, 0, 0, 1]), # total time for this script to test db_stress - "duration": 6000, + "duration": 4000, # time for one db_stress instance to run - "interval": 120, + "interval": 240, # since we will be killing anyway, use large value for ops_per_thread "ops_per_thread": 100000000, "reopen": 0, @@ -253,14 +300,13 @@ def is_direct_io_supported(dbname): # that ran with WAL disabled. 
"disable_wal": 0, "duration": 10000, - "log2_keys_per_lock": 10, + "disable_kill_points": False, "ops_per_thread": 200000, "random_kill_odd": 888887, "reopen": 20, } simple_default_params = { - "allow_concurrent_memtable_write": lambda: random.randint(0, 1), "column_families": 1, "experimental_mempurge_threshold": lambda: 10.0*random.random(), "max_background_compactions": 1, @@ -292,6 +338,7 @@ def is_direct_io_supported(dbname): # Snapshots are used heavily in this test mode, while they are incompatible # with compaction filter. "enable_compaction_filter": 0, + "test_batches_snapshots": 0, } txn_params = { @@ -403,7 +450,86 @@ def is_direct_io_supported(dbname): "checkpoint_one_in": 0, } -def finalize_and_sanitize(src_params): +narrow_ops_per_thread = 50000 + +narrow_params = { + "duration": 1800, + "expected_values_dir": lambda: setup_expected_values_dir(), + "max_key_len": 8, + "value_size_mult": 8, + "fail_if_options_file_error": True, + "allow_concurrent_memtable_write": True, + "reopen": 2, + "log2_keys_per_lock": 1, + "prefixpercent": 0, + "prefix_size": -1, + "ops_per_thread": narrow_ops_per_thread, + "get_live_files_one_in": narrow_ops_per_thread, + "acquire_snapshot_one_in": int(narrow_ops_per_thread / 4), + "sync_wal_one_in": int(narrow_ops_per_thread / 2), + "verify_db_one_in": int(narrow_ops_per_thread), + "use_multiget": lambda: random.choice([0, 0, 0, 1]), + "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]), + "use_multiget": lambda: random.choice([0, 0, 0, 1]), + "compare_full_db_state_snapshot": lambda: random.choice([0, 0, 0, 1]), + "use_merge": lambda: random.choice([0, 0, 0, 1]), + "nooverwritepercent": random.choice([0, 5, 20, 30, 40, 50, 95]), + "seed": int(time.time() * 1000000) & 0xffffffff, + + # below are params that are incompatible with current settings. + "clear_column_family_one_in": 0, + "get_sorted_wal_files_one_in": 0, + "get_current_wal_file_one_in": 0, + "continuous_verification_interval": 0, + "destroy_db_initially": 0, + "progress_reports": 0, +} + + +def store_ops_supplied(params): + for k in supplied_ops: + supplied_ops[k] = params.get(k, -1) + + +# make sure sum of ops == 100. +# value of -1 means that the op should be initialized. 
+def randomize_operation_type_percentages(src_params): + num_to_initialize = sum(1 for v in supplied_ops.values() if v == -1) + + params = {k: (v if v != -1 else 0) for k, v in supplied_ops.items()} + + ops_percent_sum = sum(params.get(k, 0) for k in supplied_ops) + current_max = 100 - ops_percent_sum + if ops_percent_sum > 100 or (num_to_initialize == 0 and ops_percent_sum != 100): + raise ValueError("Error - Sum of ops percents should be 100") + + if num_to_initialize != 0: + for k , v in supplied_ops.items(): + if v != -1: + continue + + if num_to_initialize == 1: + params[k] = current_max + break + + if k == "writepercent" and current_max > 60: + params["writepercent"] = random.randint(20, 60) + elif k == "delpercent" and current_max > 35: + params["delpercent"] = random.randint(0, current_max - 35) + elif k == "prefixpercent" and current_max >= 10: + params["prefixpercent"] = random.randint(0, 10) + elif k == "delrangepercent" and current_max >= 5: + params["delrangepercent"] = random.randint(0, 5) + else: + params[k] = random.randint(0, current_max) + + current_max = current_max - params[k] + num_to_initialize -= 1 + + src_params.update(params) + + +def finalize_and_sanitize(src_params, counter): dest_params = dict([(k, v() if callable(v) else v) for (k, v) in src_params.items()]) if dest_params.get("compression_max_dict_bytes") == 0: @@ -412,6 +538,7 @@ def finalize_and_sanitize(src_params): if dest_params.get("compression_type") != "zstd": dest_params["compression_zstd_max_train_bytes"] = 0 if dest_params.get("allow_concurrent_memtable_write", 1) == 1: + # TODO: yuval- add hash_spd memtable dest_params["memtablerep"] = "skip_list" if dest_params["mmap_read"] == 1: dest_params["use_direct_io_for_flush_and_compaction"] = 0 @@ -468,6 +595,22 @@ def finalize_and_sanitize(src_params): dest_params["enable_pipelined_write"] = 0 if dest_params.get("sst_file_manager_bytes_per_sec", 0) == 0: dest_params["sst_file_manager_bytes_per_truncate"] = 0 + # test_batches_snapshots needs to stay const (either 1 or 0) throught + # successive runs. this stops the next check (enable_compaction_filter) + # from switching its value. + if (dest_params.get("test_batches_snapshots", 0) == 1 and + dest_params.get("enable_compaction_filter", 0) == 1): + dest_params["enable_compaction_filter"] = 0 + if dest_params.get("read_only", 0) == 1: + if counter == 0: + dest_params["read_only"] = 0 + else: + dest_params["readpercent"] += dest_params["writepercent"] + dest_params["writepercent"] = 0 + dest_params["iterpercent"] += dest_params["delpercent"] + dest_params["delpercent"] = 0 + dest_params["iterpercent"] += dest_params["delrangepercent"] + dest_params["delrangepercent"] = 0 if dest_params.get("enable_compaction_filter", 0) == 1: # Compaction filter is incompatible with snapshots. Need to avoid taking # snapshots, as well as avoid operations that use snapshots for @@ -475,11 +618,16 @@ def finalize_and_sanitize(src_params): dest_params["acquire_snapshot_one_in"] = 0 dest_params["compact_range_one_in"] = 0 # Give the iterator ops away to reads. - dest_params["readpercent"] += dest_params.get("iterpercent", 10) + dest_params["readpercent"] += dest_params.get("iterpercent", 0) dest_params["iterpercent"] = 0 dest_params["test_batches_snapshots"] = 0 + # this stops the ("prefix_size") == -1 check from changing the value + # of test_batches_snapshots between runs. 
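One detail of the read_only branch just above in finalize_and_sanitize(): when read_only is picked, the first db_stress instance (counter == 0) still runs read-write so the database gets populated, and later instances fold the write and delete shares into reads and iterations, so the operation percentages keep summing to 100. A hedged sketch of that folding with made-up starting percentages:

```python
# Hypothetical starting mix (sums to 100), before the read-only folding.
ops = {"readpercent": 45, "writepercent": 35, "iterpercent": 10,
       "delpercent": 4, "delrangepercent": 1, "prefixpercent": 5}

def fold_into_read_only(ops):
    # Mirrors the intent of the read_only branch in finalize_and_sanitize():
    # reads absorb writes, iterations absorb deletes and range-deletes.
    ops["readpercent"] += ops["writepercent"]; ops["writepercent"] = 0
    ops["iterpercent"] += ops["delpercent"]; ops["delpercent"] = 0
    ops["iterpercent"] += ops["delrangepercent"]; ops["delrangepercent"] = 0
    return ops

fold_into_read_only(ops)
assert sum(ops.values()) == 100 and ops["writepercent"] == 0
```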
+ if (dest_params.get("prefix_size", 0) == -1 and + dest_params.get("test_batches_snapshots", 0) == 1): + dest_params["prefix_size"] = 7 if dest_params.get("prefix_size") == -1: - dest_params["readpercent"] += dest_params.get("prefixpercent", 20) + dest_params["readpercent"] += dest_params.get("prefixpercent", 0) dest_params["prefixpercent"] = 0 dest_params["test_batches_snapshots"] = 0 if dest_params.get("test_batches_snapshots") == 0: @@ -489,8 +637,13 @@ def finalize_and_sanitize(src_params): dest_params["memtable_prefix_bloom_size_ratio"] = 0 if dest_params.get("two_write_queues") == 1: dest_params["enable_pipelined_write"] = 0 + # make sure bloom_bits is not 0 when filter_uri is used since it fails in CreateFilterPolicy. + if dest_params.get("filter_uri") != "": + dest_params["bloom_bits"] = random.choice([random.randint(1,19), + random.lognormvariate(2.3, 1.3)]) return dest_params + def gen_cmd_params(args): params = {} @@ -531,18 +684,23 @@ def gen_cmd_params(args): for k, v in vars(args).items(): if v is not None: params[k] = v + + if params["max_key_len"] == 0 or params["key_len_percent_dist"] == "0": + generate_key_dist_and_len(params) + return params -def gen_cmd(params, unknown_params): - finalzied_params = finalize_and_sanitize(params) +def gen_cmd(params, unknown_params, counter): + finalzied_params = finalize_and_sanitize(params, counter) cmd = [stress_cmd] + [ '--{0}={1}'.format(k, v) for k, v in [(k, finalzied_params[k]) for k in sorted(finalzied_params)] if k not in set(['test_type', 'simple', 'duration', 'interval', 'random_kill_odd', 'cf_consistency', 'txn', 'test_best_efforts_recovery', 'enable_ts', - 'test_multiops_txn', 'write_policy', 'stress_cmd']) + 'test_multiops_txn', 'write_policy', 'stress_cmd', + 'disable_kill_points']) and v is not None] + unknown_params return cmd @@ -583,26 +741,104 @@ def inject_inconsistencies_to_db_dir(dir_path): with open(os.path.join(dir_path, fname), "w") as fd: fd.write("garbage") + +DEADLY_SIGNALS = { + signal.SIGABRT, signal.SIGBUS, signal.SIGFPE, signal.SIGILL, signal.SIGSEGV +} + + def execute_cmd(cmd, timeout): child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - print("Running db_stress with pid=%d: %s\n\n" - % (child.pid, ' '.join(cmd))) + print("[%s] Running db_stress with pid=%d: %s\n\n" + % (str(datetime.datetime.now()), child.pid, ' '.join(cmd))) try: outs, errs = child.communicate(timeout=timeout) hit_timeout = False - print("WARNING: db_stress ended before kill: exitcode=%d\n" - % child.returncode) + if child.returncode < 0 and (-child.returncode in DEADLY_SIGNALS): + msg = ("[%s] ERROR: db_stress (pid=%d) failed before kill: " + "exitcode=%d, signal=%s\n") % ( + str(datetime.datetime.now()), child.pid, child.returncode, + signal.Signals(-child.returncode).name) + print(outs) + print(errs, file=sys.stderr) + print(msg) + raise SystemExit(msg) + print("[%s] WARNING: db_stress (pid=%d) ended before kill: exitcode=%d\n" + % (str(datetime.datetime.now()), child.pid, child.returncode)) except subprocess.TimeoutExpired: hit_timeout = True child.kill() - print("KILLED %d\n" % child.pid) + print("[%s] KILLED %d\n" % (str(datetime.datetime.now()), child.pid)) outs, errs = child.communicate() return hit_timeout, child.returncode, outs.decode('utf-8'), errs.decode('utf-8') +# old copy of the db is kept at same src dir as new db. 
+def copy_tree_and_remove_old(counter, dbname): + dest = dbname + "_" + str(counter) + shutil.copytree(dbname, dest) + shutil.copytree(expected_values_dir, dest + "/" + "expected_values_dir") + old_db = dbname + "_" + str(counter - 2) + if counter > 1: + shutil.rmtree(old_db, True) + + +def gen_narrow_cmd_params(args): + params = {} + params.update(narrow_params) + # add these to avoid a key error in finalize_and_sanitize + params["mmap_read"] = 0 + params["use_direct_io_for_flush_and_compaction"] = 0 + params["partition_filters"] = 0 + params["use_direct_reads"] = 0 + params["user_timestamp_size"] = 0 + params["ribbon_starting_level"] = 0 + + for k, v in vars(args).items(): + if v is not None: + params[k] = v + + return params + + +def narrow_crash_main(args, unknown_args): + cmd_params = gen_narrow_cmd_params(args) + dbname = get_dbname('narrow') + exit_time = time.time() + cmd_params['duration'] + + store_ops_supplied(cmd_params) + + print("Running narrow-crash-test\n") + + counter = 0 + + while time.time() < exit_time: + randomize_operation_type_percentages(cmd_params) + cmd = gen_cmd(dict(cmd_params, **{'db': dbname}), unknown_args, counter) + + hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params['duration']) + copy_tree_and_remove_old(counter, dbname) + counter += 1 + + for line in errs.splitlines(): + if line and not line.startswith('WARNING'): + run_had_errors = True + print('stderr has error message:') + print('***' + line + '***') + + if retcode != 0: + raise SystemExit('TEST FAILED. See kill option and exit code above!!!\n') + + time.sleep(2) # time to stabilize before the next run + + shutil.rmtree(dbname, True) + for ctr in range(max(0, counter - 2), counter): + shutil.rmtree('{}_{}'.format(dbname, ctr), True) + + # This script runs and kills db_stress multiple times. It checks consistency # in case of unsafe crashes in RocksDB. def blackbox_crash_main(args, unknown_args): @@ -610,16 +846,21 @@ def blackbox_crash_main(args, unknown_args): dbname = get_dbname('blackbox') exit_time = time.time() + cmd_params['duration'] + store_ops_supplied(cmd_params) + print("Running blackbox-crash-test with \n" + "interval_between_crash=" + str(cmd_params['interval']) + "\n" + "total-duration=" + str(cmd_params['duration']) + "\n") + + counter = 0 while time.time() < exit_time: - cmd = gen_cmd(dict( - list(cmd_params.items()) - + list({'db': dbname}.items())), unknown_args) + randomize_operation_type_percentages(cmd_params) + cmd = gen_cmd(dict(cmd_params, **{'db': dbname}), unknown_args, counter) hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params['interval']) + copy_tree_and_remove_old(counter, dbname) + counter+=1 if not hit_timeout: print('Exit Before Killing') @@ -643,6 +884,8 @@ def blackbox_crash_main(args, unknown_args): # we need to clean up after ourselves -- only do this on test success shutil.rmtree(dbname, True) + for ctr in range(max(0, counter - 2), counter): + shutil.rmtree('{}_{}'.format(dbname, ctr), True) # This python script runs db_stress multiple times. 
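copy_tree_and_remove_old() above snapshots the database directory (plus its expected_values_dir) after each run, suffixing the copy with the run counter, and removes the copy from two runs earlier, so at most the two most recent snapshots stay on disk for post-mortem debugging. A rough sketch of which copies survive (illustrative only, not part of the script):

```python
# After run 0: dbname_0
# After run 1: dbname_0, dbname_1
# After run 2: dbname_1, dbname_2   (dbname_0 removed)
def copies_kept(counter, dbname="dbname"):
    # Snapshot directories still on disk once the run with this counter finished.
    return [f"{dbname}_{c}" for c in range(max(0, counter - 1), counter + 1)]

assert copies_kept(0) == ["dbname_0"]
assert copies_kept(3) == ["dbname_2", "dbname_3"]
```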
Some runs with @@ -655,6 +898,8 @@ def whitebox_crash_main(args, unknown_args): exit_time = cur_time + cmd_params['duration'] half_time = cur_time + cmd_params['duration'] // 2 + store_ops_supplied(cmd_params) + print("Running whitebox-crash-test with \n" + "total-duration=" + str(cmd_params['duration']) + "\n") @@ -663,7 +908,11 @@ def whitebox_crash_main(args, unknown_args): kill_random_test = cmd_params['random_kill_odd'] kill_mode = 0 + counter = 0 + while time.time() < exit_time: + if cmd_params["disable_kill_points"]: + check_mode = 3 if check_mode == 0: additional_opts = { # use large ops per thread since we will kill it anyway @@ -726,12 +975,8 @@ def whitebox_crash_main(args, unknown_args): "kill_random_test": None, "ops_per_thread": cmd_params['ops_per_thread'], } - - cmd = gen_cmd(dict(list(cmd_params.items()) - + list(additional_opts.items()) - + list({'db': dbname}.items())), unknown_args) - - print("Running:" + ' '.join(cmd) + "\n") # noqa: E999 T25377293 Grandfathered in + randomize_operation_type_percentages(cmd_params) + cmd = gen_cmd(dict(cmd_params, **{'db': dbname}, **additional_opts), unknown_args, counter) # If the running time is 15 minutes over the run time, explicit kill and # exit even if white box kill didn't hit. This is to guarantee run time @@ -747,6 +992,9 @@ def whitebox_crash_main(args, unknown_args): print(msg) print(stdoutdata) print(stderrdata) + + copy_tree_and_remove_old(counter, dbname) + counter+=1 if hit_timeout: print("Killing the run for running too long") @@ -785,18 +1033,33 @@ def whitebox_crash_main(args, unknown_args): # success shutil.rmtree(dbname, True) os.mkdir(dbname) - cmd_params.pop('expected_values_dir', None) + global expected_values_dir + if os.path.exists(expected_values_dir): + shutil.rmtree(expected_values_dir) + expected_values_dir = None check_mode = (check_mode + 1) % total_check_mode time.sleep(1) # time to stabilize after a kill + for ctr in range(max(0, counter - 2), counter): + shutil.rmtree('{}_{}'.format(dbname, ctr), True) + + +def bool_converter(v): + s = v.lower().strip() + if s in ('false', '0', 'no'): + return False + elif s in ('true', '1', 'yes'): + return True + raise ValueError('Failed to parse `%s` as a boolean value' % v) + def main(): global stress_cmd parser = argparse.ArgumentParser(description="This script runs and kills \ db_stress multiple times") - parser.add_argument("test_type", choices=["blackbox", "whitebox"]) + parser.add_argument("test_type", choices=["blackbox", "whitebox", "narrow"]) parser.add_argument("--simple", action="store_true") parser.add_argument("--cf_consistency", action='store_true') parser.add_argument("--txn", action='store_true') @@ -814,17 +1077,22 @@ def main(): + list(whitebox_simple_default_params.items()) + list(blob_params.items()) + list(ts_params.items()) + + list(supplied_ops.items()) + + list(narrow_params.items()) + list(multiops_txn_default_params.items()) + list(multiops_wc_txn_params.items()) + list(multiops_wp_txn_params.items())) for k, v in all_params.items(): - parser.add_argument("--" + k, type=type(v() if callable(v) else v)) + t = type(v() if callable(v) else v) + if t is bool: + t = bool_converter + parser.add_argument("--" + k, type=t) # unknown_args are passed directly to db_stress args, unknown_args = parser.parse_known_args() test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR) - if test_tmpdir is not None and not os.path.isdir(test_tmpdir): + if test_tmpdir and not os.path.isdir(test_tmpdir): print('%s env var is set to a non-existent directory: %s' % 
(_TEST_DIR_ENV_VAR, test_tmpdir)) sys.exit(1) @@ -835,8 +1103,10 @@ def main(): blackbox_crash_main(args, unknown_args) if args.test_type == 'whitebox': whitebox_crash_main(args, unknown_args) + if args.test_type == 'narrow': + narrow_crash_main(args, unknown_args) # Only delete the `expected_values_dir` if test passes - if expected_values_dir is not None: + if expected_values_dir and os.path.exists(expected_values_dir): shutil.rmtree(expected_values_dir) if multiops_txn_key_spaces_file is not None: os.remove(multiops_txn_key_spaces_file) diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 2a0bd59854..fbd960c169 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -2681,8 +2681,10 @@ void GetCommand::DoCommand() { std::string value; Status st = db_->Get(ReadOptions(), GetCfHandle(), key_, &value); if (st.ok()) { - fprintf(stdout, "%s\n", - (is_value_hex_ ? StringToHex(value) : value).c_str()); + if (is_value_hex_) { + value = StringToHex(value); + } + fprintf(stdout, "%*s\n", int(value.size()), value.c_str()); } else { std::stringstream oss; oss << "Get failed: " << st.ToString(); @@ -3870,7 +3872,7 @@ IngestExternalSstFilesCommand::IngestExternalSstFilesCommand( if (!write_global_seqno_) { fprintf(stderr, "Warning: not writing global_seqno to the ingested SST can\n" - "prevent older versions of RocksDB from being able to open it\n"); + "prevent older versions of Speedb from being able to open it\n"); } } else { if (write_global_seqno_) { diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index e3c684b668..52c7987a54 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -5,7 +5,9 @@ // #ifndef ROCKSDB_LITE #include "rocksdb/ldb_tool.h" + #include "rocksdb/utilities/ldb_cmd.h" +#include "speedb/version.h" #include "tools/ldb_cmd_impl.h" namespace ROCKSDB_NAMESPACE { @@ -134,8 +136,7 @@ int LDBCommandRunner::RunCommand( PrintHelp(ldb_options, argv[0], /*to_stderr*/ true); return 1; } else if (std::string(argv[1]) == "--version") { - printf("ldb from RocksDB %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR, - ROCKSDB_PATCH); + printf("%s\n", GetRocksBuildInfoAsString("ldb").c_str()); return 0; } else if (std::string(argv[1]) == "--help") { PrintHelp(ldb_options, argv[0], /*to_stderr*/ false); diff --git a/tools/rocksdb_dump_test.sh b/tools/rocksdb_dump_test.sh index 532c532678..8d057c689a 100755 --- a/tools/rocksdb_dump_test.sh +++ b/tools/rocksdb_dump_test.sh @@ -1,9 +1,9 @@ # shellcheck disable=SC2148 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -TESTDIR=`mktemp -d ${TMPDIR:-/tmp}/rocksdb-dump-test.XXXXX` +TESTDIR=`mktemp -d ${TMPDIR:-/tmp}/speedb-dump-test.XXXXX` DUMPFILE="tools/sample-dump.dmp" # Verify that the sample dump file is undumpable and then redumpable. 
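The bool_converter() helper added to db_crashtest.py above exists because passing type=bool to argparse is a known trap: bool("False") is True, since any non-empty string is truthy, so boolean options need an explicit string parser. A short self-contained demo; the --disable_kill_points flag name comes from this patch, the rest is just illustration.

```python
import argparse

def bool_converter(v):
    s = v.lower().strip()
    if s in ('false', '0', 'no'):
        return False
    elif s in ('true', '1', 'yes'):
        return True
    raise ValueError('Failed to parse `%s` as a boolean value' % v)

assert bool("False") is True             # the pitfall: plain bool() on a string
assert bool_converter("False") is False  # what the crash-test script wants

parser = argparse.ArgumentParser()
parser.add_argument("--disable_kill_points", type=bool_converter)
args = parser.parse_args(["--disable_kill_points", "false"])
assert args.disable_kill_points is False
```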
-./rocksdb_undump --dump_location=$DUMPFILE --db_path=$TESTDIR/db -./rocksdb_dump --anonymous --db_path=$TESTDIR/db --dump_location=$TESTDIR/dump +./speedb_undump --dump_location=$DUMPFILE --db_path=$TESTDIR/db +./speedb_dump --anonymous --db_path=$TESTDIR/db --dump_location=$TESTDIR/dump cmp $DUMPFILE $TESTDIR/dump diff --git a/trace_replay/trace_replay.cc b/trace_replay/trace_replay.cc index 37b95852b7..bd3211025c 100644 --- a/trace_replay/trace_replay.cc +++ b/trace_replay/trace_replay.cc @@ -73,7 +73,7 @@ Status TracerHelper::ParseTraceHeader(const Trace& header, int* trace_version, Status s; s = ParseVersionStr(t_v_str, trace_version); - if (s != Status::OK()) { + if (!s.ok()) { return s; } s = ParseVersionStr(db_v_str, db_version); diff --git a/util/bloom_impl.h b/util/bloom_impl.h index fadd012d30..46ba8319f9 100644 --- a/util/bloom_impl.h +++ b/util/bloom_impl.h @@ -11,6 +11,7 @@ #include #include +#include #include #include "port/port.h" // for PREFETCH @@ -24,6 +25,18 @@ namespace ROCKSDB_NAMESPACE { class BloomMath { + public: + // Powers of 32-bit golden ratio, mod 2**32. + static constexpr size_t kNumGoldenRatioPowers = 30U; + static constexpr std::array + GoldenRatioPowers{ + 0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, 0x35fbe861, + 0xdeb7c719, 0x0448b211, 0x3459b749, 0xab25f4c1, 0x52941879, + 0x9c95e071, 0xf5ab9aa9, 0x2d6ba521, 0x8bededd9, 0x9bfb72d1, + 0x3ae1c209, 0x7fca7981, 0xc576c739, 0xd23ee931, 0x0335ad69, + 0xc04ff1e1, 0x98702499, 0x7535c391, 0x9f70dcc9, 0x0e198e41, + 0xf2ab85f9, 0xe6c581f1, 0xc7ecd029, 0x6f54cea1, 0x4c8a6b59}; + public: // False positive rate of a standard Bloom filter, for given ratio of // filter memory bits to added keys, and number of probes per operation. @@ -228,6 +241,105 @@ class FastLocalBloomImpl { return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line); } +#ifdef HAVE_AVX2 + // Receives an intrinsic (__m256i) hash_vector comprised of num_probes (1-8) + // 32-bits bit positions (0-511) to test within a 512 bits bloom block + // + // Returns a pair: + // first: Whether testing is complete + // second: If testing is complete, the answer, otherwise N/A + // + // IMPORTANT: THIS CODE ASSUMES A BLOCK (CACHE-LINE) SIZE OF 64 BYTES !!!! + // + static inline std::pair CheckBitsPositionsInBloomBlock( + int num_probes, __m256i &hash_vector, const char *const block_address_) { + // Now the top 9 bits of each of the eight 32-bit values in + // hash_vector are bit addresses for probes within the cache line. + // While the platform-independent code uses byte addressing (6 bits + // to pick a byte + 3 bits to pick a bit within a byte), here we work + // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit + // within a word) because that works well with AVX2 and is equivalent + // under little-endian. + + // Shift each right by 28 bits to get 4-bit word addresses. + const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28); + + // Gather 32-bit values spread over 512 bits by 4-bit address. In + // essence, we are dereferencing eight pointers within the cache + // line. 
+ // + // Option 1: AVX2 gather (seems to be a little slow - understandable) + // const __m256i value_vector = + // _mm256_i32gather_epi32(static_cast(data_at_cache_line), + // word_addresses, + // /*bytes / i32*/ 4); + // END Option 1 + // Potentially unaligned as we're not *always* cache-aligned -> loadu + const __m256i *mm_data = reinterpret_cast(block_address_); + // lower = block[0:255], higher = block[256:511] + __m256i lower = _mm256_loadu_si256(mm_data); + __m256i upper = _mm256_loadu_si256(mm_data + 1); + + // Option 2: AVX512VL permute hack + // Only negligibly faster than Option 3, so not yet worth supporting + // const __m256i value_vector = + // _mm256_permutex2var_epi32(lower, word_addresses, upper); + // END Option 2 + // Option 3: AVX2 permute+blend hack + // Use lowest three bits to order probing values, as if all from same + // 256 bit piece. + + // UDI: The last 3 bits of each integer of b are used as addresses into + // the 8 integers of a. + lower = _mm256_permutevar8x32_epi32(lower, word_addresses); + upper = _mm256_permutevar8x32_epi32(upper, word_addresses); + // Just top 1 bit of address, to select between lower and upper. + // UDI: Shifts packed 32-bit integers in a right by IMM8 while shifting in + // sign bits. + const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31); + // Finally: the next 8 probed 32-bit values, in probing sequence order. + const __m256i value_vector = + _mm256_blendv_epi8(lower, upper, upper_lower_selector); + // END Option 3 + + // We might not need to probe all 8, so build a mask for selecting only + // what we need. (The k_selector(s) could be pre-computed but that + // doesn't seem to make a noticeable performance difference.) + const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + // Subtract num_probes from each of those constants + __m256i k_selector = + _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(num_probes)); + // Negative after subtract -> use/select + // Keep only high bit (logical shift right each by 31). + k_selector = _mm256_srli_epi32(k_selector, 31); + + // Strip off the 4 bit word address (shift LEFT) + // Strips the 4 MSB bits + __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4); + + // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses. + // Shifts RIGHT 27 => 5 lower bit pos bits remain + bit_addresses = _mm256_srli_epi32(bit_addresses, 27); + // Build a bit mask + // Performs a logical shift of 32 (doublewords) in the individual data + // elements in k_selector to the left by the bit_addresses value + const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses); + + // Like ((~value_vector) & bit_mask) == 0) + bool match = _mm256_testc_si256(value_vector, bit_mask) != 0; + + // This check first so that it's easy for branch predictor to optimize + // num_probes <= 8 case, making it free of unpredictable branches. + if (num_probes <= 8) { + return {true, match}; + } else if (!match) { + return {true, false}; + } + return {false, false}; + } +#endif // HAVE_AVX2 + static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes, const char *data_at_cache_line) { uint32_t h = h2; @@ -242,9 +354,11 @@ class FastLocalBloomImpl { // in doubt, don't add unnecessary code. // Powers of 32-bit golden ratio, mod 2**32. 
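To restate the addressing used by CheckBitsPositionsInBloomBlock and the loop it replaces below: each 32-bit probe hash contributes its top 9 bits as a bit position within the 512-bit (64-byte) block, where 4 bits pick one of the sixteen 32-bit words and 5 bits pick a bit inside that word, and successive probes come from multiplying the hash by powers of the 32-bit golden ratio (the GoldenRatioPowers table above). A scalar Python sketch of that scheme, not the production AVX2 path:

```python
GOLDEN_RATIO_32 = 0x9e3779b9
MASK32 = 0xffffffff

# The GoldenRatioPowers table is simply GOLDEN_RATIO_32**i mod 2**32.
powers = [pow(GOLDEN_RATIO_32, i, 1 << 32) for i in range(9)]
assert powers[0] == 0x00000001
assert powers[1] == 0x9e3779b9
assert powers[2] == 0xe35e67b1
# powers[8] is the 0xab25f4c1 called "golden ratio to the 8th power" in the comment.

def probe_positions(h2, num_probes):
    # Scalar restatement of the AVX2 addressing: probe i uses h * ratio**i, and
    # the top 9 bits of that product give a bit position in a 512-bit block
    # (top 4 bits = 32-bit word index, next 5 bits = bit within the word).
    for i in range(num_probes):
        hp = (h2 * powers[i]) & MASK32
        bit_pos = hp >> 23                    # 0..511
        yield bit_pos >> 5, bit_pos & 31      # (word index, bit index)

print(list(probe_positions(0x12345678, 6)))
```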
- const __m256i multipliers = - _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, - 0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749); + const __m256i multipliers = _mm256_setr_epi32( + BloomMath::GoldenRatioPowers[0], BloomMath::GoldenRatioPowers[1], + BloomMath::GoldenRatioPowers[2], BloomMath::GoldenRatioPowers[3], + BloomMath::GoldenRatioPowers[4], BloomMath::GoldenRatioPowers[5], + BloomMath::GoldenRatioPowers[6], BloomMath::GoldenRatioPowers[7]); for (;;) { // Eight copies of hash @@ -254,77 +368,10 @@ class FastLocalBloomImpl { // associativity of multiplication. hash_vector = _mm256_mullo_epi32(hash_vector, multipliers); - // Now the top 9 bits of each of the eight 32-bit values in - // hash_vector are bit addresses for probes within the cache line. - // While the platform-independent code uses byte addressing (6 bits - // to pick a byte + 3 bits to pick a bit within a byte), here we work - // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit - // within a word) because that works well with AVX2 and is equivalent - // under little-endian. - - // Shift each right by 28 bits to get 4-bit word addresses. - const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28); - - // Gather 32-bit values spread over 512 bits by 4-bit address. In - // essence, we are dereferencing eight pointers within the cache - // line. - // - // Option 1: AVX2 gather (seems to be a little slow - understandable) - // const __m256i value_vector = - // _mm256_i32gather_epi32(static_cast(data_at_cache_line), - // word_addresses, - // /*bytes / i32*/ 4); - // END Option 1 - // Potentially unaligned as we're not *always* cache-aligned -> loadu - const __m256i *mm_data = - reinterpret_cast(data_at_cache_line); - __m256i lower = _mm256_loadu_si256(mm_data); - __m256i upper = _mm256_loadu_si256(mm_data + 1); - // Option 2: AVX512VL permute hack - // Only negligibly faster than Option 3, so not yet worth supporting - // const __m256i value_vector = - // _mm256_permutex2var_epi32(lower, word_addresses, upper); - // END Option 2 - // Option 3: AVX2 permute+blend hack - // Use lowest three bits to order probing values, as if all from same - // 256 bit piece. - lower = _mm256_permutevar8x32_epi32(lower, word_addresses); - upper = _mm256_permutevar8x32_epi32(upper, word_addresses); - // Just top 1 bit of address, to select between lower and upper. - const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31); - // Finally: the next 8 probed 32-bit values, in probing sequence order. - const __m256i value_vector = - _mm256_blendv_epi8(lower, upper, upper_lower_selector); - // END Option 3 - - // We might not need to probe all 8, so build a mask for selecting only - // what we need. (The k_selector(s) could be pre-computed but that - // doesn't seem to make a noticeable performance difference.) - const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - // Subtract rem_probes from each of those constants - __m256i k_selector = - _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes)); - // Negative after subtract -> use/select - // Keep only high bit (logical shift right each by 31). - k_selector = _mm256_srli_epi32(k_selector, 31); - - // Strip off the 4 bit word address (shift left) - __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4); - // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses. 
- bit_addresses = _mm256_srli_epi32(bit_addresses, 27); - // Build a bit mask - const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses); - - // Like ((~value_vector) & bit_mask) == 0) - bool match = _mm256_testc_si256(value_vector, bit_mask) != 0; - - // This check first so that it's easy for branch predictor to optimize - // num_probes <= 8 case, making it free of unpredictable branches. - if (rem_probes <= 8) { - return match; - } else if (!match) { - return false; + auto [is_done, answer] = CheckBitsPositionsInBloomBlock( + rem_probes, hash_vector, data_at_cache_line); + if (is_done) { + return answer; } // otherwise // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power diff --git a/util/bloom_test.cc b/util/bloom_test.cc index e20d3d4035..5f2891dbbe 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -1009,7 +1009,7 @@ struct RawFilterTester { // Points five bytes from the end char* metadata_ptr_; - RawFilterTester() : metadata_ptr_(&*(data_.end() - 5)) {} + RawFilterTester() : data_(), metadata_ptr_(&*(data_.end() - 5)) {} Slice ResetNoFill(uint32_t len_without_metadata, uint32_t num_lines, uint32_t num_probes) { diff --git a/util/build_version.cc.in b/util/build_version.cc.in index 64c86a5630..d775219772 100644 --- a/util/build_version.cc.in +++ b/util/build_version.cc.in @@ -3,22 +3,23 @@ #include #include "rocksdb/version.h" +#include "speedb/version.h" #include "rocksdb/utilities/object_registry.h" #include "util/string_util.h" // The build script may replace these values with real values based // on whether or not GIT is available and the platform settings -static const std::string rocksdb_build_git_sha = "rocksdb_build_git_sha:@GIT_SHA@"; -static const std::string rocksdb_build_git_tag = "rocksdb_build_git_tag:@GIT_TAG@"; +static const std::string speedb_build_git_sha = "speedb_build_git_sha:@GIT_SHA@"; +static const std::string speedb_build_git_tag = "speedb_build_git_tag:@GIT_TAG@"; #define HAS_GIT_CHANGES @GIT_MOD@ #if HAS_GIT_CHANGES == 0 // If HAS_GIT_CHANGES is 0, the GIT date is used. // Use the time the branch/tag was last modified -static const std::string rocksdb_build_date = "rocksdb_build_date:@GIT_DATE@"; +static const std::string speedb_build_date = "speedb_build_date:@GIT_DATE@"; #else // If HAS_GIT_CHANGES is > 0, the branch/tag has modifications. // Use the time the build was created. -static const std::string rocksdb_build_date = "rocksdb_build_date:@BUILD_DATE@"; +static const std::string speedb_build_date = "speedb_build_date:@BUILD_DATE@"; #endif #ifndef ROCKSDB_LITE @@ -46,9 +47,9 @@ static void AddProperty(std::unordered_map *props, con static std::unordered_map* LoadPropertiesSet() { auto * properties = new std::unordered_map(); - AddProperty(properties, rocksdb_build_git_sha); - AddProperty(properties, rocksdb_build_git_tag); - AddProperty(properties, rocksdb_build_date); + AddProperty(properties, speedb_build_git_sha); + AddProperty(properties, speedb_build_git_tag); + AddProperty(properties, speedb_build_date); return properties; } @@ -65,9 +66,19 @@ std::string GetRocksVersionAsString(bool with_patch) { return version; } } - + +std::string GetSpeedbVersionAsString(bool with_patch) { + std::string version = ToString(SPEEDB_MAJOR) + "." + ToString(SPEEDB_MINOR); + if (with_patch) { + return version + "." 
+ ToString(SPEEDB_PATCH); + } else { + return version; + } +} + std::string GetRocksBuildInfoAsString(const std::string& program, bool verbose) { - std::string info = program + " (RocksDB) " + GetRocksVersionAsString(true); + std::string info = program + " (Speedb) " + GetSpeedbVersionAsString(true) + + " (" + GetRocksVersionAsString(true) + ")"; if (verbose) { for (const auto& it : GetRocksBuildProperties()) { info.append("\n "); diff --git a/util/filter_bench.cc b/util/filter_bench.cc index 6160cac8c0..b1f36f37ca 100644 --- a/util/filter_bench.cc +++ b/util/filter_bench.cc @@ -14,12 +14,14 @@ int main() { #include #include #include +#include #include #include "memory/arena.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/system_clock.h" #include "table/block_based/filter_policy_internal.h" #include "table/block_based/full_filter_block.h" @@ -81,10 +83,11 @@ DEFINE_bool(use_plain_table_bloom, false, DEFINE_bool(new_builder, false, "Whether to create a new builder for each new filter"); -DEFINE_uint32(impl, 0, +DEFINE_string(impl, "0", "Select filter implementation. Without -use_plain_table_bloom:" "0 = legacy full Bloom filter, 1 = block-based Bloom filter, " - "2 = format_version 5 Bloom filter, 3 = Ribbon128 filter. With " + "2 = format_version 5 Bloom filter, 3 = Ribbon128 filter, " + "name and options of the filter to use. With " "-use_plain_table_bloom: 0 = no locality, 1 = locality."); DEFINE_bool(net_includes_hashing, false, @@ -136,33 +139,7 @@ void _always_assert_fail(int line, const char *file, const char *expr) { // accurate speed tests #define PREDICT_FP_RATE #endif - -using ROCKSDB_NAMESPACE::Arena; -using ROCKSDB_NAMESPACE::BlockContents; -using ROCKSDB_NAMESPACE::BloomFilterPolicy; -using ROCKSDB_NAMESPACE::BloomHash; -using ROCKSDB_NAMESPACE::BloomLikeFilterPolicy; -using ROCKSDB_NAMESPACE::BuiltinFilterBitsBuilder; -using ROCKSDB_NAMESPACE::CachableEntry; -using ROCKSDB_NAMESPACE::Cache; -using ROCKSDB_NAMESPACE::EncodeFixed32; -using ROCKSDB_NAMESPACE::FastRange32; -using ROCKSDB_NAMESPACE::FilterBitsReader; -using ROCKSDB_NAMESPACE::FilterBuildingContext; -using ROCKSDB_NAMESPACE::FilterPolicy; -using ROCKSDB_NAMESPACE::FullFilterBlockReader; -using ROCKSDB_NAMESPACE::GetSliceHash; -using ROCKSDB_NAMESPACE::GetSliceHash64; -using ROCKSDB_NAMESPACE::Lower32of64; -using ROCKSDB_NAMESPACE::LRUCacheOptions; -using ROCKSDB_NAMESPACE::ParsedFullFilterBlock; -using ROCKSDB_NAMESPACE::PlainTableBloomV1; -using ROCKSDB_NAMESPACE::Random32; -using ROCKSDB_NAMESPACE::Slice; -using ROCKSDB_NAMESPACE::static_cast_with_check; -using ROCKSDB_NAMESPACE::Status; -using ROCKSDB_NAMESPACE::StderrLogger; -using ROCKSDB_NAMESPACE::mock::MockBlockBasedTableTester; +namespace ROCKSDB_NAMESPACE { struct KeyMaker { KeyMaker(size_t avg_size) @@ -203,17 +180,6 @@ struct KeyMaker { } }; -void PrintWarnings() { -#if defined(__GNUC__) && !defined(__OPTIMIZE__) - fprintf(stdout, - "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"); -#endif -#ifndef NDEBUG - fprintf(stdout, - "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); -#endif -} - void PrintError(const char *error) { fprintf(stderr, "ERROR: %s\n", error); } struct FilterInfo { @@ -290,17 +256,7 @@ static uint32_t DryRunHash64(Slice &s) { return Lower32of64(GetSliceHash64(s)); } -const std::shared_ptr &GetPolicy() { - static std::shared_ptr policy; - if (!policy) { - policy = BloomLikeFilterPolicy::Create( - 
BloomLikeFilterPolicy::GetAllFixedImpls().at(FLAGS_impl), - FLAGS_bits_per_key); - } - return policy; -} - -struct FilterBench : public MockBlockBasedTableTester { +struct FilterBench : public mock::MockBlockBasedTableTester { std::vector kms_; std::vector infos_; Random32 random_; @@ -308,11 +264,14 @@ struct FilterBench : public MockBlockBasedTableTester { Arena arena_; double m_queries_; StderrLogger stderr_logger_; + int filter_index_; - FilterBench() - : MockBlockBasedTableTester(GetPolicy()), + FilterBench(const std::shared_ptr &filter_policy, + int filter_index) + : MockBlockBasedTableTester(filter_policy), random_(FLAGS_seed), - m_queries_(0) { + m_queries_(0), + filter_index_(filter_index) { for (uint32_t i = 0; i < FLAGS_batch_size; ++i) { kms_.emplace_back(FLAGS_key_size < 8 ? 8 : FLAGS_key_size); } @@ -340,30 +299,6 @@ struct FilterBench : public MockBlockBasedTableTester { }; void FilterBench::Go() { - if (FLAGS_use_plain_table_bloom && FLAGS_use_full_block_reader) { - throw std::runtime_error( - "Can't combine -use_plain_table_bloom and -use_full_block_reader"); - } - if (FLAGS_use_plain_table_bloom) { - if (FLAGS_impl > 1) { - throw std::runtime_error( - "-impl must currently be >= 0 and <= 1 for Plain table"); - } - } else { - if (FLAGS_impl == 1) { - throw std::runtime_error( - "Block-based filter not currently supported by filter_bench"); - } - if (FLAGS_impl > 3) { - throw std::runtime_error( - "-impl must currently be 0, 2, or 3 for Block-based table"); - } - } - - if (FLAGS_vary_key_count_ratio < 0.0 || FLAGS_vary_key_count_ratio > 1.0) { - throw std::runtime_error("-vary_key_count_ratio must be >= 0.0 and <= 1.0"); - } - // For example, average_keys_per_filter = 100, vary_key_count_ratio = 0.1. // Varys up to +/- 10 keys. variance_range = 21 (generating value 0..20). // variance_offset = 10, so value - offset average value is always 0. 
@@ -389,7 +324,7 @@ void FilterBench::Go() { std::unique_ptr builder; - size_t total_memory_used = 0; + [[maybe_unused]] size_t total_memory_used = 0; size_t total_size = 0; size_t total_keys_added = 0; #ifdef PREDICT_FP_RATE @@ -426,7 +361,7 @@ void FilterBench::Go() { info.plain_table_bloom_.reset(new PlainTableBloomV1()); info.plain_table_bloom_->SetTotalBits( &arena_, static_cast(keys_to_add * FLAGS_bits_per_key), - FLAGS_impl, 0 /*huge_page*/, nullptr /*logger*/); + filter_index_, 0 /*huge_page*/, nullptr /*logger*/); for (uint32_t i = 0; i < keys_to_add; ++i) { uint32_t hash = GetSliceHash(kms_[0].Get(filter_id, i)); info.plain_table_bloom_->AddHash(hash); @@ -595,7 +530,8 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run, auto dry_run_hash_fn = DryRunNoHash; if (!FLAGS_net_includes_hashing) { - if (FLAGS_impl < 2 || FLAGS_use_plain_table_bloom) { + if ((filter_index_ >= 0 && filter_index_ < 2) || + FLAGS_use_plain_table_bloom) { dry_run_hash_fn = DryRunHash32; } else { dry_run_hash_fn = DryRunHash64; @@ -786,6 +722,19 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run, return ns; } +} // namespace ROCKSDB_NAMESPACE + +void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif +} + int main(int argc, char **argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + @@ -821,13 +770,61 @@ int main(int argc, char **argv) { << " \"Skewed X% in Y%\" - like \"Random filter\" except Y% of" << "\n the filters are designated as \"hot\" and receive X%" << "\n of queries." 
<< std::endl; + } else if (FLAGS_use_plain_table_bloom && FLAGS_use_full_block_reader) { + throw std::runtime_error( + "Can't combine -use_plain_table_bloom and -use_full_block_reader"); + } else if (FLAGS_vary_key_count_ratio < 0.0 || + FLAGS_vary_key_count_ratio > 1.0) { + throw std::runtime_error("-vary_key_count_ratio must be >= 0.0 and <= 1.0"); + } + std::shared_ptr policy; + + int bloom_idx = -1; + uint64_t id; + const auto &bloom_like_filters = + ROCKSDB_NAMESPACE::BloomLikeFilterPolicy::GetAllFixedImpls(); + ROCKSDB_NAMESPACE::Slice impl(FLAGS_impl); + if (ROCKSDB_NAMESPACE::ConsumeDecimalNumber(&impl, &id) && + id < bloom_like_filters.size() && impl.empty()) { + policy = ROCKSDB_NAMESPACE::BloomLikeFilterPolicy::Create( + bloom_like_filters.at(id), FLAGS_bits_per_key); + if (!policy) { + fprintf(stderr, "Failed to create BloomLikeFilterPolicy: %s\n", + FLAGS_impl.c_str()); + exit(-1); + } else { + bloom_idx = static_cast(id); + } } else { - FilterBench b; - for (uint32_t i = 0; i < FLAGS_runs; ++i) { - b.Go(); - FLAGS_seed += 100; - b.random_.Seed(FLAGS_seed); + ROCKSDB_NAMESPACE::ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + std::string bits_str; + if (FLAGS_bits_per_key > 0) { + bits_str = ":" + ROCKSDB_NAMESPACE::ToString(FLAGS_bits_per_key); + } + auto s = ROCKSDB_NAMESPACE::FilterPolicy::CreateFromString( + config_options, FLAGS_impl + bits_str, &policy); + if (!s.ok() || !policy) { + fprintf(stderr, "Failed to create FilterPolicy[%s%s]: %s\n", + FLAGS_impl.c_str(), bits_str.c_str(), s.ToString().c_str()); + exit(-1); + } + } + if (FLAGS_use_plain_table_bloom) { + if (bloom_idx < 0 || bloom_idx > 1) { + fprintf(stderr, "-impl must currently be 0 or 1 for Plain table"); + exit(-1); } + } else if (bloom_idx == 1) { + fprintf(stderr, + "Block-based filter not currently supported by filter_bench"); + exit(-1); + } + ROCKSDB_NAMESPACE::FilterBench b(policy, bloom_idx); + for (uint32_t i = 0; i < FLAGS_runs; ++i) { + b.Go(); + FLAGS_seed += 100; + b.random_.Seed(FLAGS_seed); } return 0; diff --git a/util/repeatable_thread.h b/util/repeatable_thread.h index c75ad7c49f..78cd78fdb0 100644 --- a/util/repeatable_thread.h +++ b/util/repeatable_thread.h @@ -24,7 +24,7 @@ class RepeatableThread { const std::string& thread_name, SystemClock* clock, uint64_t delay_us, uint64_t initial_delay_us = 0) : function_(function), - thread_name_("rocksdb:" + thread_name), + thread_name_("speedb:" + thread_name), clock_(clock), delay_us_(delay_us), initial_delay_us_(initial_delay_us), diff --git a/util/status.cc b/util/status.cc index 2c9aa50152..b52f7dc247 100644 --- a/util/status.cc +++ b/util/status.cc @@ -15,6 +15,10 @@ #include #include "port/port.h" +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED +#include "port/stack_trace.h" +#endif + namespace ROCKSDB_NAMESPACE { std::unique_ptr Status::CopyState(const char* s) { @@ -44,6 +48,13 @@ static const char* msgs[static_cast(Status::kMaxSubCode)] = { "IO fenced off", // kIOFenced }; +void Status::PrintFailure() { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + fprintf(stderr, "Failed to check Status %p\n", this); + port::PrintStack(); +#endif +} + Status::Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2, Severity sev) : code_(_code), diff --git a/util/threadpool_imp.cc b/util/threadpool_imp.cc index dc166e13cf..27fd7be823 100644 --- a/util/threadpool_imp.cc +++ b/util/threadpool_imp.cc @@ -343,7 +343,7 @@ void ThreadPoolImpl::Impl::StartBGThreads() { auto th_handle = p_t.native_handle(); std::string 
thread_priority = Env::PriorityToString(GetThreadPriority()); std::ostringstream thread_name_stream; - thread_name_stream << "rocksdb:"; + thread_name_stream << "speedb:"; for (char c : thread_priority) { thread_name_stream << static_cast(tolower(c)); } diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc index 8585dbf121..e93ec5bd0c 100644 --- a/utilities/backup/backup_engine_test.cc +++ b/utilities/backup/backup_engine_test.cc @@ -1256,8 +1256,8 @@ TEST_F(BackupEngineTest, NoDoubleCopy_And_AutoGC) { ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst")); // 00011.sst was only in backup 1, should be deleted - ASSERT_EQ(Status::NotFound(), - test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst")); + ASSERT_TRUE(test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst") + .IsNotFound()); ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); // MANIFEST file size should be only 100 @@ -1293,16 +1293,16 @@ TEST_F(BackupEngineTest, NoDoubleCopy_And_AutoGC) { // Make sure dangling sst file has been removed (somewhere along this // process). GarbageCollect should not be needed. - ASSERT_EQ(Status::NotFound(), - test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst")); + ASSERT_TRUE(test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst") + .IsNotFound()); ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst")); ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00019.sst")); // Now actually purge a good one ASSERT_OK(backup_engine_->PurgeOldBackups(1)); - ASSERT_EQ(Status::NotFound(), - test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst")); + ASSERT_TRUE(test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst") + .IsNotFound()); ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00019.sst")); CloseDBAndBackupEngine(); @@ -1389,22 +1389,18 @@ TEST_F(BackupEngineTest, CorruptionsTest) { ASSERT_OK(backup_engine_->DeleteBackup(2)); // Should not be needed anymore with auto-GC on DeleteBackup //(void)backup_engine_->GarbageCollect(); - ASSERT_EQ(Status::NotFound(), - file_manager_->FileExists(backupdir_ + "/meta/5")); - ASSERT_EQ(Status::NotFound(), - file_manager_->FileExists(backupdir_ + "/private/5")); - ASSERT_EQ(Status::NotFound(), - file_manager_->FileExists(backupdir_ + "/meta/4")); - ASSERT_EQ(Status::NotFound(), - file_manager_->FileExists(backupdir_ + "/private/4")); - ASSERT_EQ(Status::NotFound(), - file_manager_->FileExists(backupdir_ + "/meta/3")); - ASSERT_EQ(Status::NotFound(), - file_manager_->FileExists(backupdir_ + "/private/3")); - ASSERT_EQ(Status::NotFound(), - file_manager_->FileExists(backupdir_ + "/meta/2")); - ASSERT_EQ(Status::NotFound(), - file_manager_->FileExists(backupdir_ + "/private/2")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5").IsNotFound()); + ASSERT_TRUE( + file_manager_->FileExists(backupdir_ + "/private/5").IsNotFound()); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4").IsNotFound()); + ASSERT_TRUE( + file_manager_->FileExists(backupdir_ + "/private/4").IsNotFound()); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3").IsNotFound()); + ASSERT_TRUE( + file_manager_->FileExists(backupdir_ + "/private/3").IsNotFound()); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2").IsNotFound()); + ASSERT_TRUE( + file_manager_->FileExists(backupdir_ + "/private/2").IsNotFound()); CloseBackupEngine(); AssertBackupConsistency(0, 0, keys_iteration * 1, 
keys_iteration * 5); @@ -2546,7 +2542,7 @@ TEST_F(BackupEngineTest, DeleteTmpFiles) { } CloseDBAndBackupEngine(); for (std::string file_or_dir : tmp_files_and_dirs) { - if (file_manager_->FileExists(file_or_dir) != Status::NotFound()) { + if (!file_manager_->FileExists(file_or_dir).IsNotFound()) { FAIL() << file_or_dir << " was expected to be deleted." << cleanup_fn; } } diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 18c437dbbc..5bbcf57d8c 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -449,16 +449,22 @@ TEST_F(CheckpointTest, ExportColumnFamilyNegativeTest) { // Export onto existing directory ASSERT_OK(env_->CreateDirIfMissing(export_path_)); - ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), - export_path_, &metadata_), - Status::InvalidArgument("Specified export_dir exists")); + Status s = checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), + export_path_, &metadata_); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_NE(s.getState(), nullptr); + ASSERT_NE(strstr(s.getState(), "Specified export_dir exists"), nullptr) + << s.getState(); ASSERT_OK(DestroyDir(env_, export_path_)); // Export with invalid directory specification export_path_ = ""; - ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), - export_path_, &metadata_), - Status::InvalidArgument("Specified export_dir invalid")); + s = checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(), export_path_, + &metadata_); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_NE(s.getState(), nullptr); + ASSERT_NE(strstr(s.getState(), "Specified export_dir invalid"), nullptr) + << s.getState(); delete checkpoint; } diff --git a/utilities/env_mirror.cc b/utilities/env_mirror.cc index 3ea323b429..627ce0664c 100644 --- a/utilities/env_mirror.cc +++ b/utilities/env_mirror.cc @@ -24,7 +24,7 @@ class SequentialFileMirror : public SequentialFile { Status Read(size_t n, Slice* result, char* scratch) override { Slice aslice; Status as = a_->Read(n, &aslice, scratch); - if (as == Status::OK()) { + if (as.ok()) { char* bscratch = new char[n]; Slice bslice; #ifndef NDEBUG @@ -34,7 +34,8 @@ class SequentialFileMirror : public SequentialFile { while (left) { Status bs = b_->Read(left, &bslice, bscratch); #ifndef NDEBUG - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); assert(memcmp(bscratch, scratch + off, bslice.size()) == 0); off += bslice.size(); #endif @@ -44,7 +45,8 @@ class SequentialFileMirror : public SequentialFile { *result = aslice; } else { Status bs = b_->Read(n, result, scratch); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); } return as; } @@ -52,13 +54,15 @@ class SequentialFileMirror : public SequentialFile { Status Skip(uint64_t n) override { Status as = a_->Skip(n); Status bs = b_->Skip(n); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; } Status InvalidateCache(size_t offset, size_t length) override { Status as = a_->InvalidateCache(offset, length); Status bs = b_->InvalidateCache(offset, length); - assert(as == bs); + assert(as.code() == bs.code()); + assert(as.subcode() == bs.subcode()); return as; }; }; @@ -72,14 +76,15 @@ class RandomAccessFileMirror : public RandomAccessFile { Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { Status as = a_->Read(offset, n, result, scratch); - if (as == Status::OK()) { 
+    if (as.ok()) {
       char* bscratch = new char[n];
       Slice bslice;
       size_t off = 0;
       size_t left = result->size();
       while (left) {
         Status bs = b_->Read(offset + off, left, &bslice, bscratch);
-        assert(as == bs);
+        assert(as.code() == bs.code());
+        assert(as.subcode() == bs.subcode());
         assert(memcmp(bscratch, scratch + off, bslice.size()) == 0);
         off += bslice.size();
         left -= bslice.size();
@@ -87,7 +92,8 @@ class RandomAccessFileMirror : public RandomAccessFile {
       delete[] bscratch;
     } else {
       Status bs = b_->Read(offset, n, result, scratch);
-      assert(as == bs);
+      assert(as.code() == bs.code());
+      assert(as.subcode() == bs.subcode());
     }
     return as;
   }
@@ -108,7 +114,8 @@ class WritableFileMirror : public WritableFile {
   Status Append(const Slice& data) override {
     Status as = a_->Append(data);
     Status bs = b_->Append(data);
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
   Status Append(const Slice& data,
@@ -118,7 +125,8 @@ class WritableFileMirror : public WritableFile {
   Status PositionedAppend(const Slice& data, uint64_t offset) override {
     Status as = a_->PositionedAppend(data, offset);
     Status bs = b_->PositionedAppend(data, offset);
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
   Status PositionedAppend(
@@ -129,31 +137,36 @@ class WritableFileMirror : public WritableFile {
   Status Truncate(uint64_t size) override {
     Status as = a_->Truncate(size);
     Status bs = b_->Truncate(size);
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
   Status Close() override {
     Status as = a_->Close();
     Status bs = b_->Close();
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
   Status Flush() override {
     Status as = a_->Flush();
     Status bs = b_->Flush();
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
   Status Sync() override {
     Status as = a_->Sync();
     Status bs = b_->Sync();
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
   Status Fsync() override {
     Status as = a_->Fsync();
     Status bs = b_->Fsync();
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
   bool IsSyncThreadSafe() const override {
@@ -186,7 +199,8 @@ class WritableFileMirror : public WritableFile {
   Status InvalidateCache(size_t offset, size_t length) override {
     Status as = a_->InvalidateCache(offset, length);
     Status bs = b_->InvalidateCache(offset, length);
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }

@@ -194,13 +208,15 @@ class WritableFileMirror : public WritableFile {
   Status Allocate(uint64_t offset, uint64_t length) override {
     Status as = a_->Allocate(offset, length);
     Status bs = b_->Allocate(offset, length);
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
   Status RangeSync(uint64_t offset, uint64_t nbytes) override {
     Status as = a_->RangeSync(offset, nbytes);
     Status bs = b_->RangeSync(offset, nbytes);
-    assert(as == bs);
+    assert(as.code() == bs.code());
+    assert(as.subcode() == bs.subcode());
     return as;
   }
 };
@@ -214,7 +230,8 @@ Status EnvMirror::NewSequentialFile(const std::string& f,
   SequentialFileMirror* mf = new SequentialFileMirror(f);
   Status as = a_->NewSequentialFile(f, &mf->a_, options);
   Status bs = b_->NewSequentialFile(f, &mf->b_, options);
-  assert(as == bs);
+  assert(as.code() == bs.code());
+  assert(as.subcode() == bs.subcode());
   if (as.ok())
     r->reset(mf);
   else
@@ -231,7 +248,8 @@ Status EnvMirror::NewRandomAccessFile(const std::string& f,
   RandomAccessFileMirror* mf = new RandomAccessFileMirror(f);
   Status as = a_->NewRandomAccessFile(f, &mf->a_, options);
   Status bs = b_->NewRandomAccessFile(f, &mf->b_, options);
-  assert(as == bs);
+  assert(as.code() == bs.code());
+  assert(as.subcode() == bs.subcode());
   if (as.ok())
     r->reset(mf);
   else
@@ -246,7 +264,8 @@ Status EnvMirror::NewWritableFile(const std::string& f,
   WritableFileMirror* mf = new WritableFileMirror(f, options);
   Status as = a_->NewWritableFile(f, &mf->a_, options);
   Status bs = b_->NewWritableFile(f, &mf->b_, options);
-  assert(as == bs);
+  assert(as.code() == bs.code());
+  assert(as.subcode() == bs.subcode());
   if (as.ok())
     r->reset(mf);
   else
@@ -263,7 +282,8 @@ Status EnvMirror::ReuseWritableFile(const std::string& fname,
   WritableFileMirror* mf = new WritableFileMirror(fname, options);
   Status as = a_->ReuseWritableFile(fname, old_fname, &mf->a_, options);
   Status bs = b_->ReuseWritableFile(fname, old_fname, &mf->b_, options);
-  assert(as == bs);
+  assert(as.code() == bs.code());
+  assert(as.subcode() == bs.subcode());
   if (as.ok())
     r->reset(mf);
   else
diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index a07476bcdb..9a6acc766e 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -81,9 +81,11 @@ IOStatus FSFileState::DropUnsyncedData() {
 }

 IOStatus FSFileState::DropRandomUnsyncedData(Random* rand) {
-  int range = static_cast<int>(buffer_.size());
-  size_t truncated_size = static_cast<size_t>(rand->Uniform(range));
-  buffer_.resize(truncated_size);
+  const int range = static_cast<int>(buffer_.size());
+  if (range > 0) {
+    size_t truncated_size = static_cast<size_t>(rand->Uniform(range));
+    buffer_.resize(truncated_size);
+  }
   return IOStatus::OK();
 }

diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc
index bef22fc2f3..ba55cfbfe7 100644
--- a/utilities/memory/memory_test.cc
+++ b/utilities/memory/memory_test.cc
@@ -272,7 +272,7 @@ int main(int argc, char** argv) {
 #include <cstdio>

 int main(int /*argc*/, char** /*argv*/) {
-  printf("Skipped in RocksDBLite as utilities are not supported.\n");
+  printf("Skipped in LITE mode as utilities are not supported.\n");
   return 0;
 }
 #endif  // !ROCKSDB_LITE
diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc
index 2522072fcc..1068faedb8 100644
--- a/utilities/options/options_util_test.cc
+++ b/utilities/options/options_util_test.cc
@@ -772,7 +772,7 @@ int main(int argc, char** argv) {
 #include <cstdio>

 int main(int /*argc*/, char** /*argv*/) {
-  printf("Skipped in RocksDBLite as utilities are not supported.\n");
+  printf("Skipped in LITE mode as utilities are not supported.\n");
   return 0;
 }
 #endif  // !ROCKSDB_LITE
diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
index 97784efe43..feddc56012 100644
--- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
+++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
@@ -238,7 +238,7 @@ int main(int argc, char** argv) {
 }
 #else
 int main(int /*argc*/, char** /*argv*/) {
-  fprintf(stderr, "SKIPPED as RocksDBLite does not include utilities.\n");
+  fprintf(stderr, "SKIPPED as LITE mode does not include utilities.\n");
   return 0;
 }
 #endif  // !ROCKSDB_LITE
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 798fb2ad08..42883f0e65 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -941,7 +941,7 @@ TEST_P(TransactionTest, CommitTimeBatchFailTest) {
   // fails due to non-empty commit-time batch
   s = txn1->Commit();
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   delete txn1;
 }
@@ -1058,7 +1058,7 @@ TEST_P(TransactionTest, SimpleTwoPhaseTransactionTest) {
   // we already committed
   s = txn->Commit();
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   // no longer is prepared results
   db->GetAllPreparedTransactions(&prepared_trans);
@@ -1131,15 +1131,15 @@ TEST_P(TransactionTest, TwoPhaseNameTest) {
   // cant prepare txn without name
   s = txn1->Prepare();
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   // name too short
   s = txn1->SetName("");
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   // name too long
   s = txn1->SetName(std::string(513, 'x'));
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   // valid set name
   s = txn1->SetName("name1");
@@ -1147,11 +1147,11 @@ TEST_P(TransactionTest, TwoPhaseNameTest) {
   // cant have duplicate name
   s = txn2->SetName("name1");
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   // shouldn't be able to prepare
   s = txn2->Prepare();
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   // valid name set
   s = txn2->SetName("name2");
@@ -1159,7 +1159,7 @@ TEST_P(TransactionTest, TwoPhaseNameTest) {
   // cant reset name
   s = txn2->SetName("name3");
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   ASSERT_EQ(txn1->GetName(), "name1");
   ASSERT_EQ(txn2->GetName(), "name2");
@@ -1169,7 +1169,7 @@ TEST_P(TransactionTest, TwoPhaseNameTest) {
   // can't rename after prepare
   s = txn1->SetName("name4");
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   ASSERT_OK(txn1->Rollback());
   ASSERT_OK(txn2->Rollback());
@@ -1272,7 +1272,7 @@ TEST_P(TransactionStressTest, TwoPhaseExpirationTest) {
   ASSERT_OK(s);

   s = txn2->Prepare();
-  ASSERT_EQ(s, Status::Expired());
+  ASSERT_TRUE(s.IsExpired());

   delete txn1;
   delete txn2;
@@ -1338,11 +1338,11 @@ TEST_P(TransactionTest, TwoPhaseRollbackTest) {
   // make commit
   s = txn->Commit();
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   // try rollback again
   s = txn->Rollback();
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   delete txn;
 }
@@ -1437,7 +1437,7 @@ TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) {
   // we already committed
   s = txn->Commit();
-  ASSERT_EQ(s, Status::InvalidArgument());
+  ASSERT_TRUE(s.IsInvalidArgument());

   // no longer is prepared results
   prepared_trans.clear();
@@ -1618,7 +1618,7 @@ TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) {
   // verify data txn data
   s = db->Get(read_options, "foo", &value);
-  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(s);
   ASSERT_EQ(value, "bar");

   // verify non txn data
@@ -1626,7 +1626,7 @@ TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) {
     std::string key(i, 'k');
     std::string val(1000, 'v');
     s = db->Get(read_options, key, &value);
-    ASSERT_EQ(s, Status::OK());
+    ASSERT_OK(s);
     ASSERT_EQ(value, val);
   }
@@ -1675,7 +1675,7 @@ TEST_P(TransactionTest, TwoPhaseSequenceTest) {
   // value is now available
   s = db->Get(read_options, "foo4", &value);
-  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(s);
   ASSERT_EQ(value, "bar4");
 }
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
@@ -1718,7 +1718,7 @@ TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) {
   ASSERT_OK(s);

   s = db->Get(read_options, "foo", &value);
-  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(s);
   ASSERT_EQ(value, "bar");

   delete txn;
@@ -1745,11 +1745,11 @@ TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) {
   // value is now available
   s = db->Get(read_options, "foo", &value);
-  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(s);
   ASSERT_EQ(value, "bar");

   s = db->Get(read_options, "foo2", &value);
-  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(s);
   ASSERT_EQ(value, "bar2");
 }
diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h
index 2780cf24de..67f32907da 100644
--- a/utilities/transactions/transaction_test.h
+++ b/utilities/transactions/transaction_test.h
@@ -223,12 +223,11 @@ class TransactionTestBase : public ::testing::Test {
   std::atomic<size_t> expected_commits = {0};
   // Without Prepare, the commit does not write to WAL
   std::atomic<size_t> with_empty_commits = {0};
-  std::function<void(size_t, Status)> txn_t0_with_status = [&](size_t index,
-                                                                Status exp_s) {
+  std::function<void(size_t)> txn_t0_ok = [&](size_t index) {
     // Test DB's internal txn. It involves no prepare phase nor a commit marker.
     WriteOptions wopts;
     auto s = db->Put(wopts, "key" + std::to_string(index), "value");
-    ASSERT_EQ(exp_s, s);
+    ASSERT_OK(s);
     if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
       // Consume one seq per key
       exp_seq++;
@@ -243,7 +242,7 @@ class TransactionTestBase : public ::testing::Test {
     with_empty_commits++;
   };
   std::function<void(size_t)> txn_t0 = [&](size_t index) {
-    return txn_t0_with_status(index, Status::OK());
+    return txn_t0_ok(index);
   };
   std::function<void(size_t)> txn_t1 = [&](size_t index) {
     // Testing directly writing a write batch. Functionality-wise it is
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index f1f65e17a9..0dd39a2c41 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -2193,7 +2193,8 @@ void ASSERT_SAME(ReadOptions roptions, TransactionDB* db, Status exp_s,
   Status s;
   PinnableSlice v;
   s = db->Get(roptions, db->DefaultColumnFamily(), key, &v);
-  ASSERT_EQ(exp_s, s);
+  ASSERT_EQ(exp_s.code(), s.code());
+  ASSERT_EQ(exp_s.subcode(), s.subcode());
   ASSERT_TRUE(s.ok() || s.IsNotFound());
   if (s.ok()) {
     ASSERT_TRUE(exp_v == v);
@@ -2206,7 +2207,8 @@ void ASSERT_SAME(ReadOptions roptions, TransactionDB* db, Status exp_s,
   ASSERT_EQ(1, values.size());
   ASSERT_EQ(1, s_vec.size());
   s = s_vec[0];
-  ASSERT_EQ(exp_s, s);
+  ASSERT_EQ(exp_s.code(), s.code());
+  ASSERT_EQ(exp_s.subcode(), s.subcode());
   ASSERT_TRUE(s.ok() || s.IsNotFound());
   if (s.ok()) {
     ASSERT_TRUE(exp_v == values[0]);